Esempio n. 1
0
def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If 'words' is None, every word found in the embedding file is added to
    the dictionary. Otherwise only the entries of 'words' are considered
    (typically the words in the development and test sets): a word is added
    when its exact form, its lowercased form, or its lowercased form with
    every digit replaced by '0' appears in the embedding file.

    New entries are inserted with a count of 0; existing entries are left
    untouched. 'dictionary' is modified in place.

    Args:
        dictionary: mapping of word -> count to extend.
        ext_emb_path: path to a UTF-8 text embedding file. Its first line is
            printed as the embeddings' shape and is not parsed as a word;
            on every following line the text before the first space is
            taken as the word.
        words: iterable of candidate words, or None to add all of them.

    Returns:
        A tuple (dictionary, word_to_id, id_to_word, pretrained), where the
        two mappings are whatever create_mapping(dictionary) returns and
        'pretrained' is the list of words read from the file, in file order.
    """
    assert os.path.isfile(ext_emb_path)
    pretrained = []

    with open(ext_emb_path, 'r', encoding='utf-8') as f:
        # Consume the first line before iterating over the word lines.
        print("Pre-trained word embeddings shape: {}".format(
            f.readline().strip()))
        for line in f:
            pretrained.append(line.strip().split(" ")[0])

    if words is None:
        for word in pretrained:
            if word not in dictionary:
                dictionary[word] = 0
    else:
        # Membership is tested up to three times per word; a set keeps each
        # test O(1) instead of scanning the whole embedding vocabulary.
        known = set(pretrained)
        for word in words:
            if word in dictionary:
                continue
            lowered = word.lower()
            candidates = (word, lowered, re.sub(r'\d', '0', lowered))
            if any(form in known for form in candidates):
                dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)

    return dictionary, word_to_id, id_to_word, pretrained
Esempio n. 2
0
def char_mapping(sentences):
    """
    Build the character dictionary and the character <-> id mappings.

    For every sentence, the first field of each token is concatenated into
    one string; those strings are handed to create_dico, and the resulting
    dictionary is passed to create_mapping. The number of dictionary
    entries is printed.

    Returns the tuple (dico, char_to_id, id_to_char).
    """
    joined_sentences = []
    for sentence in sentences:
        joined_sentences.append("".join(token[0] for token in sentence))

    char_counts = create_dico(joined_sentences)
    mappings = create_mapping(char_counts)
    char_to_id, id_to_char = mappings

    print("Found %i unique characters" % len(char_counts))

    return char_counts, char_to_id, id_to_char
Esempio n. 3
0
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the word <-> id mappings.

    The first field of each token is taken as the word and lowercased when
    'lower' is truthy. The per-sentence word lists go through create_dico,
    an '<UNK>' entry with a count of 10000000 is set, and the dictionary is
    passed to create_mapping. A summary line with the number of dictionary
    entries and the total number of tokens is printed.

    Returns the tuple (dico, word_to_id, id_to_word).
    """
    words = []
    for sentence in sentences:
        if lower:
            words.append([token[0].lower() for token in sentence])
        else:
            words.append([token[0] for token in sentence])

    word_counts = create_dico(words)
    word_counts['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(word_counts)

    token_total = sum(map(len, words))
    print("Found %i unique words in (%i in total)" %
          (len(word_counts), token_total))

    return word_counts, word_to_id, id_to_word
Esempio n. 4
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag <-> id mappings.

    Each token contributes one list of tags: only field 1 when the token
    has fewer than three fields, otherwise every field from index 1 to the
    end. The lists are handed to create_dico and the resulting dictionary
    to create_mapping.

    Returns the tuple (dico, tag_to_id, id_to_tag).
    """
    def tag_columns(token):
        # Indices of the token fields that hold tags.
        if len(token) < 3:
            return [1]
        return range(1, len(token))

    tags = [
        [token[col] for col in tag_columns(token)]
        for sentence in sentences
        for token in sentence
    ]

    tag_counts = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(tag_counts)

    return tag_counts, tag_to_id, id_to_tag
Esempio n. 5
0
def entity_mapping(sentences):
    """
    Build the entity dictionary and the entity <-> id mappings.

    Tags are collected per token (only field 1 when the token has fewer
    than three fields, otherwise every field from index 1 to the end) and
    handed to create_dico. Only keys whose part before the first '-' is
    'B' are kept, re-keyed by the part that follows that '-'. The number
    of remaining entries is printed and the dictionary is passed to
    create_mapping.

    Returns the tuple (dico, entity_to_id, id_to_entity).
    """
    def tag_columns(token):
        # Indices of the token fields that hold tags.
        if len(token) < 3:
            return [1]
        return range(1, len(token))

    tags = [
        [token[col] for col in tag_columns(token)]
        for sentence in sentences
        for token in sentence
    ]

    tag_counts = create_dico(tags)

    # Keep only the 'B-...' keys, stored under the piece after the first '-'.
    entity_counts = {}
    for tag, count in tag_counts.items():
        pieces = tag.split('-')
        if pieces[0] == 'B':
            entity_counts[pieces[1]] = count

    print("Found %i unique named entites tags" % len(entity_counts))
    entity_to_id, id_to_entity = create_mapping(entity_counts)

    return entity_counts, entity_to_id, id_to_entity