def char_mapping(sentences):
    """
    Build the character dictionary and the character <-> id mappings.

    Each sentence is turned into a single string by concatenating (with no
    separator) the first field of every token in it; those strings are then
    handed to ``create_dico`` and the resulting dictionary to
    ``create_mapping``.

    :param sentences: iterable of sentences, each an iterable of tokens
        whose first field (``token[0]``) is a string.
    :return: tuple ``(dico, char_to_id, id_to_char)``.

    NOTE(review): frequency ordering of the mappings is provided by
    ``create_mapping``, which is defined elsewhere -- confirm there.
    """
    joined_sentences = []
    for sentence in sentences:
        # One string per sentence: all word forms glued together.
        joined_sentences.append("".join(token[0] for token in sentence))

    char_counts = create_dico(joined_sentences)
    char_to_id, id_to_char = create_mapping(char_counts)
    print("Found %i unique characters" % len(char_counts))
    return char_counts, char_to_id, id_to_char
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the word <-> id mappings.

    The first field of every token is taken as the word form, optionally
    lower-cased. The dictionary produced by ``create_dico`` is extended with
    an ``'<UNK>'`` entry before ``create_mapping`` is called.

    :param sentences: iterable of sentences, each an iterable of tokens
        whose first field (``token[0]``) is a string.
    :param lower: when truthy, word forms are lower-cased before counting.
    :return: tuple ``(dico, word_to_id, id_to_word)``.

    NOTE(review): frequency ordering of the mappings is provided by
    ``create_mapping``, which is defined elsewhere -- confirm there.
    """
    def _word_form(token):
        form = token[0]
        return form.lower() if lower else form

    tokenized = [
        [_word_form(token) for token in sentence]
        for sentence in sentences
    ]

    word_counts = create_dico(tokenized)
    # '<UNK>' gets a very large count; presumably so it ranks first under a
    # frequency-based ordering -- verify against create_mapping.
    word_counts['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(word_counts)

    # The unique count includes the '<UNK>' entry added above.
    total_tokens = sum(map(len, tokenized))
    print("Found %i unique words in (%i in total)" % (len(word_counts), total_tokens))
    return word_counts, word_to_id, id_to_word
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag <-> id mappings.

    For every token, the tag fields are collected: field 1 only when the
    token has fewer than three fields, otherwise every field from index 1
    to the end. The per-token tag lists are passed to ``create_dico`` and
    the resulting dictionary to ``create_mapping``.

    :param sentences: iterable of sentences, each an iterable of indexable
        tokens with at least two fields.
    :return: tuple ``(dico, tag_to_id, id_to_tag)``.
    :raises IndexError: if a token has fewer than two fields.

    NOTE(review): frequency ordering of the mappings is provided by
    ``create_mapping``, which is defined elsewhere -- confirm there.
    """
    def _tag_fields(token):
        width = len(token)
        if width >= 3:
            return [token[i] for i in range(1, width)]
        # Short token: index 1 is accessed directly, so a token without a
        # second field fails loudly rather than yielding an empty list.
        return [token[1]]

    tag_rows = [
        _tag_fields(token)
        for sentence in sentences
        for token in sentence
    ]

    tag_counts = create_dico(tag_rows)
    tag_to_id, id_to_tag = create_mapping(tag_counts)
    return tag_counts, tag_to_id, id_to_tag
def entity_mapping(sentences):
    """
    Build the entity dictionary and the entity <-> id mappings.

    Tags are collected per token (field 1 only when the token has fewer
    than three fields, otherwise every field from index 1 to the end) and
    counted with ``create_dico``. Only tags whose first ``'-'``-separated
    piece is ``'B'`` are kept, keyed by their second piece
    (e.g. ``'B-PER'`` -> ``'PER'``).

    :param sentences: iterable of sentences, each an iterable of indexable
        tokens with at least two fields; tag fields are strings.
    :return: tuple ``(dico, entity_to_id, id_to_entity)``.
    :raises IndexError: if a token has fewer than two fields, or if a tag
        is exactly ``'B'`` with no ``'-'`` suffix.

    NOTE(review): frequency ordering of the mappings is provided by
    ``create_mapping``, which is defined elsewhere -- confirm there.
    """
    def _tag_fields(token):
        width = len(token)
        if width >= 3:
            return [token[i] for i in range(1, width)]
        return [token[1]]

    tag_rows = [
        _tag_fields(token)
        for sentence in sentences
        for token in sentence
    ]
    tag_counts = create_dico(tag_rows)

    entity_counts = {}
    for tag, count in tag_counts.items():
        pieces = tag.split('-')
        if pieces[0] != 'B':
            continue
        # If two B- tags share the same second piece, the later one
        # overwrites the earlier count (counts are not summed).
        entity_counts[pieces[1]] = count

    print("Found %i unique named entites tags" % len(entity_counts))
    entity_to_id, id_to_entity = create_mapping(entity_counts)
    return entity_counts, entity_to_id, id_to_entity