# Imports assumed by these excerpts (module names follow the rest of this
# codebase; dynet provides the dy.* API used throughout):
from collections import defaultdict
import codecs
import random

import dynet as dy

import utils
from bilstm import BiLSTM


class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs=1):
        self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab
        self.model = model
        self.nnvecs = nnvecs

        # Load ELMo if the option is set
        if options.elmo is not None:
            from elmo import ELMo
            self.elmo = ELMo(options.elmo, options.elmo_gamma,
                             options.elmo_learn_gamma)
            self.elmo.init_weights(model)
        else:
            self.elmo = None

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        self.irels = rels
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        # This part got ugly - TODO: refactor
        if not options.predict:
            self.external_embedding = defaultdict(lambda: {})

            if options.ext_word_emb_file and options.word_emb_size > 0:
                # Load pre-trained word embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=self.words.viewkeys())
                    self.external_embedding["words"].update(embeddings)

            if options.ext_char_emb_file and options.char_emb_size > 0:
                # Load pre-trained character embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)

            if options.ext_emb_dir:
                # For every language, load the data for the word and character
                # embeddings from a directory.
                for lang in langs:
                    if options.word_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.words.viewkeys())
                        self.external_embedding["words"].update(embeddings)

                    if options.char_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.chars,
                            chars=True)
                        self.external_embedding["chars"].update(embeddings)

            self.init_lookups(options)

        elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
        self.lstm_input_size = (
            options.word_emb_size + elmo_emb_size + options.pos_emb_size +
            options.tbank_emb_size +
            2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        )
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            # used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[1] if options.word_emb_size > 0 else None
        paddingElmoVec = dy.zeros(self.elmo.emb_dim) if self.elmo else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr() if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                  dy.concatenate(filter(None, [paddingWordVec,
                                                               paddingElmoVec,
                                                               paddingPosVec,
                                                               paddingCharVec,
                                                               paddingTbankVec])) +
                                  self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else \
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, options,
                          test_embeddings=defaultdict(lambda: {})):
        if self.elmo:
            # Get the full text of the sentence, excluding the root, which is
            # stored differently for transition- and graph-based parsers.
            if options.graph_based:
                sentence_text = " ".join([entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join([entry.form for entry in sentence[:-1]])

            elmo_sentence_representation = \
                self.elmo.get_sentence_representation(sentence_text)

        for i, root in enumerate(sentence):
            root.vecs = defaultdict(
                lambda: None)  # all vecs are None by default (possibly a little risky?)

            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 + word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]

            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]

            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])

            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if treebank_id not in self.treebanks and \
                        treebank_id in utils.reverse_iso_dict and \
                        utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]

            if self.elmo:
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["elmo"] = elmo_sentence_representation[i]
                else:
                    # TODO
                    root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["elmo"], root.vecs["pos"],
                    root.vecs["char"], root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):
        if root.char_rep == "*root*":
            # no point running a character analysis over this placeholder token
            return self.charPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):
        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"][word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"][char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
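
# A minimal standalone sketch (not part of the original code) of the
# frequency-based word dropout used in getWordEmbeddings() above: a training
# token is replaced by the OOV row (index 0) with probability
# 0.25 / (0.25 + count), so rare words are dropped more often and the model
# learns a useful OOV embedding. The function name and the alpha default are
# illustrative only.
import random

def drop_to_oov(norm, word_counts, alpha=0.25):
    """Return True if this token should be looked up as OOV during training."""
    count = float(word_counts.get(norm, 0))
    return random.random() > count / (alpha + count)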
class FeatureExtractor(object):
    def __init__(self, model, wordsCount, rels, langs, words, ch, nnvecs, options):
        """Options handling"""
        self.model = model
        if langs:
            self.langs = {lang: ind + 1 for ind, lang in enumerate(langs)}  # +1 for padding vector
        else:
            self.langs = None
        self.nnvecs = nnvecs
        self.multiling = options.multiling  # and options.use_lembed
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding, model, wordsCount)
        self.disable_bilstm = options.disable_bilstm
        self.disable_second_bilstm = options.disable_second_bilstm

        """sharing"""
        self.shareBiLSTM = options.shareBiLSTM
        self.shareWordLookup = options.shareWordLookup
        self.shareCharLookup = options.shareCharLookup
        self.shareCharBiLSTM = options.shareCharBiLSTM
        self.word_lembed = options.lembed_word
        self.char_lembed = options.lembed_char

        """dims"""
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.lang_emb_size = options.lang_emb_size
        lstm_input_size = self.word_emb_size \
            + (self.edim if self.external_embedding is not None else 0) \
            + (self.lang_emb_size if self.word_lembed else 0) \
            + 2 * self.char_lstm_output_size

        """UTILS"""
        self.wordsCount = wordsCount
        self.irels = rels

        if self.multiling and not self.shareWordLookup:
            w2i = {}
            for lang in self.langs:
                w2i[lang] = {w: i for i, w in enumerate(words[lang])}
            self.vocab = {}
            for lang in self.langs:
                self.vocab[lang] = {word: ind + 2 for word, ind in w2i[lang].iteritems()}
        else:
            w2i = {w: i for i, w in enumerate(words)}
            self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()}  # +2 for MLP padding vector and OOV vector

        if not self.multiling or self.shareCharLookup:
            self.chars = {char: ind + 1 for ind, char in enumerate(ch)}  # +1 for OOV vector
        else:
            self.chars = {}
            for lang in self.langs:
                self.chars[lang] = {char: ind + 1 for ind, char in enumerate(ch[lang])}

        self.rels = {word: ind for ind, word in enumerate(rels)}

        """BILSTMS"""
        # word
        if not self.multiling or self.shareBiLSTM:
            if not self.disable_bilstm:
                self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size,
                                      model, dropout_rate=0.33)
                if not self.disable_second_bilstm:
                    self.bilstm2 = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size,
                                          model, dropout_rate=0.33)
            else:
                self.lstm_output_size = int(lstm_input_size * 0.5)
        else:
            self.bilstm1 = {}
            self.bilstm2 = {}
            for lang in self.langs:
                self.bilstm1[lang] = BiLSTM(lstm_input_size, self.lstm_output_size,
                                            model, dropout_rate=0.33)
                self.bilstm2[lang] = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size,
                                            model, dropout_rate=0.33)

        # char
        if self.char_lembed:
            char_in_dims = self.char_emb_size + self.lang_emb_size
        else:
            char_in_dims = self.char_emb_size
        if not self.multiling or self.shareCharBiLSTM:
            self.char_bilstm = BiLSTM(char_in_dims, self.char_lstm_output_size,
                                      self.model, dropout_rate=0.33)
        else:
            self.char_bilstms = {}
            for lang in self.langs:
                self.char_bilstms[lang] = BiLSTM(char_in_dims, self.char_lstm_output_size,
                                                 self.model, dropout_rate=0.33)

        """LOOKUPS"""
        if not self.multiling or self.shareCharLookup:
            self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        else:
            self.clookups = {}
            for lang in self.langs:
                self.clookups[lang] = self.model.add_lookup_parameters(
                    (len(ch[lang]) + 1, self.char_emb_size))

        if not self.multiling or self.shareWordLookup:
            self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        else:
            self.wlookups = {}
            for lang in self.langs:
                self.wlookups[lang] = self.model.add_lookup_parameters(
                    (len(words[lang]) + 2, self.word_emb_size))

        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))

        """Padding"""
        self.word2lstm = model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = model.add_parameters((self.lstm_output_size * 2))
        self.chPadding = model.add_parameters((self.char_lstm_output_size * 2))

    def get_char_vec(self, word, dropout, lang=None, langvec=None):
        if word.form == "*root*":
            word.chVec = self.chPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in word.form:
                if lang:
                    cvec = self.clookups[lang][self.chars[lang].get(char, 0)]
                else:
                    cvec = self.clookup[self.chars.get(char, 0)]
                if langvec:
                    char_vecs.append(dy.concatenate([langvec, cvec]))
                else:
                    char_vecs.append(cvec)
            if lang:
                word.chVec = self.char_bilstms[lang].get_sequence_vector(char_vecs, dropout)
            else:
                word.chVec = self.char_bilstm.get_sequence_vector(char_vecs, dropout)

    def Init(self):
        # TODO: This function makes me cry
        # I'm not sure how necessary it is to get different padding vecs
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None

        if not self.multiling or self.shareWordLookup:
            paddingWordVec = self.wlookup[1]
            self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                      dy.concatenate(filter(None, [
                                          paddingWordVec,
                                          evec,
                                          self.chPadding.expr(),
                                          paddingLangVec if self.word_lembed else None])) +
                                      self.word2lstmbias.expr())
            self.empty = self.paddingVec if self.nnvecs == 1 else \
                dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])
        else:
            paddingWordVecs = {}
            self.paddingVecs = {}
            self.emptyVecs = {}
            for lang in self.langs:
                paddingWordVecs[lang] = self.wlookups[lang][1]
                self.paddingVecs[lang] = dy.tanh(self.word2lstm.expr() *
                                                 dy.concatenate(filter(None, [
                                                     paddingWordVecs[lang],
                                                     evec,
                                                     self.chPadding.expr(),
                                                     paddingLangVec if self.word_lembed else None])) +
                                                 self.word2lstmbias.expr())
                self.emptyVecs[lang] = self.paddingVecs[lang] if self.nnvecs == 1 else \
                    dy.concatenate([self.paddingVecs[lang] for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, get_vectors=False):
        lang = sentence[0].language_id
        for root in sentence:
            # word
            if not self.multiling or self.shareWordLookup:
                wordcount = float(self.wordsCount.get(root.norm, 0))
            else:
                wordcount = float(self.wordsCount[lang].get(root.norm, 0))
            noDropFlag = not train or (random.random() < (wordcount / (0.25 + wordcount)))
            if not self.multiling or self.shareWordLookup:
                root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            else:
                root.wordvec = self.wlookups[lang][
                    int(self.vocab[lang].get(root.norm, 0)) if noDropFlag else 0]

            if self.multiling and self.word_lembed:
                root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            # char
            if not self.multiling or self.shareCharBiLSTM:
                if self.char_lembed:
                    langvec = self.langslookup[self.langs[lang]]
                    self.get_char_vec(root, train, langvec=langvec)
                else:
                    self.get_char_vec(root, train)
            else:
                self.get_char_vec(root, train, lang=lang)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            root.vec = dy.concatenate(filter(None, [root.wordvec, root.evec, root.chVec, root.langvec]))

        if not self.multiling or self.shareBiLSTM:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)
        else:
            self.bilstm1[lang].set_token_vecs(sentence, train)
            self.bilstm2[lang].set_token_vecs(sentence, train)

        if get_vectors:
            data_vec = list()
            for i, token in enumerate(sentence):
                if token.form != '*root*':
                    wordvec = token.wordvec.value()
                    if self.external_embedding is not None:
                        wordvec += token.evec.value()
                    data_tuple = (i + 1, token.form, token.cpos, token.feats,
                                  token.chVec.value(), wordvec, token.vec.value())
                    data_vec.append(data_tuple)
            return data_vec

    def get_external_embeddings(self, external_embedding_file, model, wordsCount):
        # NOTE: this is modified to load fastText embeddings!
        self.external_embedding = {}
        external_embedding_fp = codecs.open(external_embedding_file, 'r', encoding='utf-8')
        # read first line --- number of tokens and embedding dimension
        self.edim = int(external_embedding_fp.readline().split()[1])
        num_tokens = 0
        for line in external_embedding_fp:
            line = line.strip().split()
            if len(line) != self.edim + 1:
                continue
            elif line[0] in wordsCount:
                self.external_embedding[line[0]] = [float(f) for f in line[1:]]
                num_tokens += 1
        external_embedding_fp.close()

        # self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)]  # ???
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print '-' * 100
        print 'Load external embedding. Vector dimensions:', self.edim, ', number of tokens:', num_tokens
        print '-' * 100
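
# Illustrative standalone reader (a hypothetical helper, not from the original
# code) for the fastText ".vec" format consumed by get_external_embeddings()
# above: a header line "<num_tokens> <dim>", then one "<word> <floats>" row
# per token; rows are kept only when the word appears in the training
# vocabulary, mirroring what the method does.
import codecs

def read_fasttext_vectors(path, vocab):
    vectors = {}
    with codecs.open(path, 'r', encoding='utf-8') as fp:
        edim = int(fp.readline().split()[1])  # header: token count, dimension
        for line in fp:
            fields = line.strip().split()
            if len(fields) == edim + 1 and fields[0] in vocab:
                vectors[fields[0]] = [float(f) for f in fields[1:]]
    return edim, vectors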
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs):
        self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab
        self.model = model
        self.nnvecs = nnvecs

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        if (options.ext_emb_dir or options.ext_emb_file) and not options.predict:
            self.external_embedding = defaultdict(lambda: {})
            for lang in langs:
                if options.word_emb_size > 0:
                    self.external_embedding["words"].update(
                        utils.get_external_embeddings(options, lang, self.words.viewkeys()))
                if options.char_emb_size > 0:
                    self.external_embedding["chars"].update(
                        utils.get_external_embeddings(options, lang, self.chars, chars=True))
            self.init_lookups(options)

        self.lstm_input_size = options.word_emb_size + options.pos_emb_size + \
            options.tbank_emb_size + \
            2 * (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            # used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[1] if options.word_emb_size > 0 else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr() if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                  dy.concatenate(filter(None, [paddingWordVec,
                                                               paddingPosVec,
                                                               paddingCharVec,
                                                               paddingTbankVec])) +
                                  self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else \
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, options,
                          test_embeddings=defaultdict(lambda: {})):
        for root in sentence:
            root.vecs = defaultdict(
                lambda: None)  # all vecs are None by default (possibly a little risky?)

            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 + word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]

            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]

            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])

            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if treebank_id not in self.treebanks and \
                        treebank_id in utils.reverse_iso_dict and \
                        utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[self.treebanks[treebank_id]]

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["pos"], root.vecs["char"],
                    root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):
        if root.char_rep == "*root*":
            # no point running a character analysis over this placeholder token
            return self.charPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):
        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"][word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"][char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
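
# Illustrative sketch (hypothetical dimensions, and DyNet's built-in
# BiRNNBuilder rather than this project's BiLSTM wrapper): the word-level
# BiLSTM consumes one concatenated vector per token and returns one
# contextualised vector per token, which set_token_vecs() then stores back
# on the token objects.
def _word_bilstm_sketch():
    pc = dy.ParameterCollection()
    input_size, output_size = 146, 125  # hypothetical: concatenated token dim, per-direction size
    word_bilstm = dy.BiRNNBuilder(1, input_size, 2 * output_size, pc, dy.VanillaLSTMBuilder)
    dy.renew_cg()
    token_vecs = [dy.random_normal((input_size,)) for _ in range(5)]  # 5 dummy tokens
    return word_bilstm.transduce(token_vecs)  # 5 vectors, each of size 2 * output_size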
class FeatureExtractor(object):
    def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
        self.model = model
        self.disableBilstm = options.disable_bilstm
        self.multiling = options.use_lembed and options.multiling
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lang_emb_size = options.lang_emb_size
        self.wordsCount = words
        self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()}  # +2 for MLP padding vector and OOV vector
        self.chars = {char: ind + 1 for ind, char in enumerate(ch)}  # +1 for OOV vector
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.nnvecs = nnvecs

        if langs:
            self.langs = {lang: ind + 1 for ind, lang in enumerate(langs)}  # +1 for padding vector
        else:
            self.langs = None
        self.irels = rels

        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding)

        lstm_input_size = self.word_emb_size \
            + (self.edim if self.external_embedding is not None else 0) \
            + (self.lang_emb_size if self.multiling else 0) \
            + 2 * self.char_lstm_output_size

        if not self.disableBilstm:
            self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size,
                                  self.model, dropout_rate=0.33)
            self.bilstm2 = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size,
                                  self.model, dropout_rate=0.33)
        else:
            self.lstm_output_size = int(lstm_input_size * 0.5)

        self.char_bilstm = BiLSTM(self.char_emb_size, self.char_lstm_output_size,
                                  self.model, dropout_rate=0.33)

        self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = self.model.add_lookup_parameters(
                (len(langs) + 1, self.lang_emb_size))

        # used in the PaddingVec
        self.word2lstm = self.model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = self.model.add_parameters((self.lstm_output_size * 2))
        self.chPadding = self.model.add_parameters((self.char_lstm_output_size * 2))

    def Init(self):
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingWordVec = self.wlookup[1]
        paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                  dy.concatenate(filter(None, [paddingWordVec,
                                                               evec,
                                                               self.chPadding.expr(),
                                                               paddingLangVec])) +
                                  self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate(
            [self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train):
        for root in sentence:
            wordcount = float(self.wordsCount.get(root.norm, 0))
            noDropFlag = not train or (random.random() < (wordcount / (0.25 + wordcount)))
            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]

            self.get_char_vector(root, train)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            if self.multiling:
                root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            root.vec = dy.concatenate(filter(None, [root.wordvec, root.evec,
                                                    root.chVec, root.langvec]))

        if not self.disableBilstm:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train):
        if root.form == "*root*":
            # no point running a character analysis over this placeholder token
            root.chVec = self.chPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.form:
                char_vecs.append(self.clookup[self.chars.get(char, 0)])
            root.chVec = self.char_bilstm.get_sequence_vector(char_vecs, train)

    def get_external_embeddings(self, external_embedding_file):
        external_embedding_fp = codecs.open(external_embedding_file, 'r', encoding='utf-8')
        external_embedding_fp.readline()
        self.external_embedding = {}
        for line in external_embedding_fp:
            line = line.strip().split()
            self.external_embedding[line[0]] = [float(f) for f in line[1:]]
        external_embedding_fp.close()

        self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)]  # ???
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = self.model.add_lookup_parameters(
            (len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print 'Load external embedding. Vector dimensions', self.edim
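
# Illustrative sketch (hypothetical helper name and data) of the DyNet calls
# used by get_external_embeddings() above to copy pre-trained vectors into a
# lookup table: rows 0-2 stay randomly initialised for the OOV, *PAD* and
# *INITIAL* entries, and each known word is written into its own row with
# init_row().
def _init_external_lookup(pretrained, model):
    edim = len(next(iter(pretrained.values())))
    extrnd = {word: i + 3 for i, word in enumerate(pretrained)}
    elookup = model.add_lookup_parameters((len(pretrained) + 3, edim))
    for word, i in extrnd.items():
        elookup.init_row(i, pretrained[word])
    return extrnd, elookup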