def char_mapping(sentences):
    """
    Build the character dictionary and the char <-> id mappings.

    Each sentence is a sequence of tokens; the first field of every token
    is concatenated into one string per sentence, and those strings are
    handed to `create_dico`. A '<PAD>' entry is then inserted with a very
    large value (presumably so it ranks first when `create_mapping` orders
    entries by frequency -- confirm against `create_mapping`).

    Returns the tuple (dico, char_to_id, id_to_char).
    """
    # One string per sentence, made of the first field of each token.
    sentence_strings = []
    for sentence in sentences:
        sentence_strings.append("".join(token[0] for token in sentence))

    char_counts = create_dico(sentence_strings)
    char_counts['<PAD>'] = 10000000

    mappings = create_mapping(char_counts)
    char_to_id = mappings[0]
    id_to_char = mappings[1]

    print("Found %i unique characters" % len(char_counts))
    return char_counts, char_to_id, id_to_char
def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.

    Only the words given by `words` (typically the words in the development
    and test sets) are considered. A word is added with frequency 0 when it
    is not already in `dictionary` and at least one of the following appears
    as the first whitespace-separated token of a line in the embedding file:
    the word itself, its lowercased form, or its lowercased form with every
    digit replaced by '0'.

    Adding *every* pretrained word when `words` is None is intentionally not
    supported: that branch was disabled by the original author because
    adding all embedding words caused CUDA runtime errors. `words=None` is
    therefore treated as "no candidate words".

    `dictionary` is modified in place. Returns the tuple
    (dictionary, word_to_id, id_to_word), the mappings coming from
    `create_mapping`.
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Collect the vocabulary of the embedding file in a single pass.
    # Blank lines carry no token and are skipped; the file handle is
    # closed as soon as the vocabulary has been read.
    pretrained = set()
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            fields = line.split()
            if fields:
                pretrained.add(fields[0])

    if words is None:
        words = ()

    for word in words:
        if word in dictionary:
            continue
        lowered = word.lower()
        if (word in pretrained
                or lowered in pretrained
                or re.sub(r'\d', '0', lowered) in pretrained):
            # Known to the pretrained embeddings but unseen in training:
            # register it with a frequency of 0.
            dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag <-> id mappings.

    The tag of a token is its last field. The per-sentence tag lists are
    passed to `create_dico`, after which `model.START_TAG` and
    `model.STOP_TAG` are inserted with the values -1 and -2 respectively.

    Returns the tuple (dico, tag_to_id, id_to_tag).
    """
    tags_per_sentence = []
    for sentence in sentences:
        tags_per_sentence.append([token[-1] for token in sentence])

    tag_counts = create_dico(tags_per_sentence)
    # Special transition tags get negative values.
    tag_counts[model.START_TAG] = -1
    tag_counts[model.STOP_TAG] = -2

    mappings = create_mapping(tag_counts)
    tag_to_id = mappings[0]
    id_to_tag = mappings[1]

    print("Found %i unique named entity tags" % len(tag_counts))
    return tag_counts, tag_to_id, id_to_tag
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the word <-> id mappings.

    The word of a token is its first field, lowercased when `lower` is
    truthy. After `create_dico` has processed the per-sentence word lists,
    '<PAD>' and '<UNK>' are inserted with very large values and every
    entry whose value is below 3 is dropped.

    Returns the tuple (dico, word_to_id, id_to_word), where `dico` is the
    pruned dictionary.
    """
    if lower:
        words = [[token[0].lower() for token in sentence]
                 for sentence in sentences]
    else:
        words = [[token[0] for token in sentence] for sentence in sentences]

    raw_counts = create_dico(words)
    raw_counts['<PAD>'] = 10000001
    raw_counts['<UNK>'] = 10000000

    # Keep only entries seen at least 3 times; the two special tokens
    # survive because of the large values assigned above.
    kept = {}
    for word, count in raw_counts.items():
        if count >= 3:
            kept[word] = count

    mappings = create_mapping(kept)
    word_to_id = mappings[0]
    id_to_word = mappings[1]

    total_tokens = sum(map(len, words))
    print("Found %i unique words (%i in total)" % (len(kept), total_tokens))
    return kept, word_to_id, id_to_word