def flush_entity_universe(self):
    print("len(self.entities_universe) =", len(self.entities_universe))
    entities_folder = config.base_folder + "data/entities/extension_entities/"
    _, wiki_id_name_map = load_wiki_name_id_map()
    if not os.path.exists(entities_folder):
        os.makedirs(entities_folder)

    def dump_entities(entity_set, name):
        with open(entities_folder + name + ".pickle", 'wb') as handle:
            pickle.dump(entity_set, handle)
        with open(entities_folder + name + ".txt", "w") as fout:
            for ent_id in entity_set:
                fout.write(ent_id + "\t" + wiki_id_name_map[ent_id].replace(' ', '_') + "\n")

    dump_entities(self.entities_universe, "entities_universe")

    # now calculate the expansion i.e. from this universe omit the ones that we have already trained
    extension_entity_set = set()
    wikiid2nnid = load_wikiid2nnid()
    for wikiid in self.entities_universe:
        if wikiid not in wikiid2nnid:
            extension_entity_set.add(wikiid)
    print("len(extension_entity_set) =", len(extension_entity_set))
    dump_entities(extension_entity_set, "extension_entities")
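# Minimal sketch (not part of the original file): reading back the files written by
# dump_entities() above. The folder layout and the "extension_entities" name simply
# mirror what flush_entity_universe() writes; adjust the path if config.base_folder differs.
import pickle

def load_dumped_entities(entities_folder, name="extension_entities"):
    # the pickle holds the set of wiki ids exactly as dumped above
    with open(entities_folder + name + ".pickle", 'rb') as handle:
        entity_set = pickle.load(handle)
    # the txt file holds "wiki_id<TAB>entity_name" lines, with underscores instead of spaces
    id_to_name = {}
    with open(entities_folder + name + ".txt") as fin:
        for line in fin:
            ent_id, ent_name = line.rstrip("\n").split("\t")
            id_to_name[ent_id] = ent_name
    return entity_set, id_to_name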
def add_embeddings_op(self):
    """Defines self.word_embeddings"""
    b_size = tf.shape(self.cand_entities_ids)[0]
    cand_spans = tf.shape(self.cand_entities_ids)[1]
    cand_ents = tf.shape(self.cand_entities_ids)[2]
    # each candidate entity is described by a fixed chunk of 22 token ids;
    # split the flat candidate axis into (candidates, 22 tokens)
    entities = tf.reshape(self.cand_entities_ids, [b_size, cand_spans, cand_ents // 22, 22])
    entities = tf.reshape(entities, [-1, 22])
    # 0 is the padding id: the effective length is 20 minus the number of padding tokens, floored at 0
    zeros_count = tf.reduce_sum(tf.cast(tf.equal(entities, 0), tf.int32), axis=1)
    lengths = tf.math.maximum(0, 20 - zeros_count)

    with tf.variable_scope('bilm_1'):
        entities_embeddings_op = self.entity_bilm(entities)  # [batch_size, max_token]
    with tf.variable_scope('bilm_2'):
        words_embeddings_op = self.bilm(self.words)

    with tf.variable_scope("words"):
        self.word_embeddings = weight_layers('words', words_embeddings_op, l2_coef=0.0)['weighted_op']
        print("word_embeddings (after lookup) ", self.word_embeddings)

    with tf.variable_scope("entities"):
        from preprocessing.util import load_wikiid2nnid
        self.nentities = len(load_wikiid2nnid(extension_name=self.args.entity_extension))
        self.entity_embeddings = tf.reshape(
            weight_layers('entities', entities_embeddings_op, l2_coef=0.0)['weighted_op'],
            [b_size, cand_spans, cand_ents // 22, 20, -1])  # [batch_size, max_token, vdim]

        #cell_fw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm // 2)
        #cell_bw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm // 2)
        #(output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        #    cell_fw, cell_bw, output,
        #    sequence_length=lengths, dtype=tf.float32)
        #output = tf.concat([output_fw, output_bw], axis=-1)
        #output = tf.concat([output[:, 0, :], output[:, -1, :]], axis=-1)

        # coeffs = tf.nn.softmax(tf.squeeze(tf.layers.dense(output, 1)))
        # output = tf.reduce_sum(output * coeffs[..., None], 1)
        # self.entity_embeddings = tf.layers.dense(tf.reshape(output, [b_size, cand_spans, cand_ents // 22, 256]), 300)

        #mask = tf.math.logical_not(tf.equal(entities, 0)[:, 1:-1])
        #Q = tf.layers.dense(output, self.args.hidden_size_lstm)  # [batch_size, sequence_length, hidden_dim]
        #K = tf.layers.dense(output, self.args.hidden_size_lstm)  # [batch_size, sequence_length, hidden_dim]
        #V = tf.layers.dense(output, 300)  # [batch_size, sequence_length, n_classes]
        #query_value_attention_seq = tf.keras.layers.Attention()([Q, V, K], [mask, mask])
        #query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(query_value_attention_seq)
        #self.entity_embeddings = tf.reshape(query_value_attention, [b_size, cand_spans, cand_ents // 22, -1])

        # self.entity_embeddings = util.ffnn(self.entity_embeddings, 1, 300, 300, dropout=None)
        self.pure_entity_embeddings = self.entity_embeddings
        if self.args.ent_vecs_regularization.startswith("l2"):  # 'l2' or 'l2dropout'
            self.entity_embeddings = tf.nn.l2_normalize(self.entity_embeddings, dim=3)
            # not necessary since I do normalization in the entity embed creation as well, just for safety
        if self.args.ent_vecs_regularization == "dropout" or \
                self.args.ent_vecs_regularization == "l2dropout":
            self.entity_embeddings = tf.nn.dropout(self.entity_embeddings, self.dropout)
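# Minimal numpy illustration (not from the original file) of the length computation in
# add_embeddings_op() above: each candidate is a chunk of 22 token ids, 0 is treated as
# padding, and the effective length is 20 minus the number of padding ids, floored at 0.
import numpy as np

chunk = np.array([1, 17, 250, 42, 0, 0, 0, 0, 0, 0,
                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])   # 22 ids, 18 of them padding zeros
zeros_count = int((chunk == 0).sum())    # 18, mirroring the tf.equal(entities, 0) reduction
length = max(0, 20 - zeros_count)        # 2, mirroring tf.math.maximum(0, 20 - zeros_count)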
def __init__(self, output_folder, predictions_folder, entity_extension=None):
    self.thr = None
    self.output_folder = output_folder
    self.predictions_folder = predictions_folder
    with open(output_folder + "word_char_maps.pickle", 'rb') as handle:
        _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)
    self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
    _, self.wiki_id_name_map = load_wiki_name_id_map()
    self.extra_info = ""
def __init__(self, output_folder, predictions_folder, entity_extension=None,
             gm_bucketing_pempos=None, print_global_voters=False,
             print_global_pairwise_scores=False):
    self.thr = None
    self.output_folder = output_folder
    self.predictions_folder = predictions_folder
    with open(output_folder + "word_char_maps.pickle", 'rb') as handle:
        _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)
    self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
    _, self.wiki_id_name_map = load_wiki_name_id_map()
    self.extra_info = ""
    self.gm_bucketing = GMBucketingResults(gm_bucketing_pempos) if gm_bucketing_pempos else None
    self.print_global_pairwise_scores = print_global_pairwise_scores
    self.print_global_voters = print_global_voters
def add_embeddings_op(self):
    """Defines self.word_embeddings"""
    with tf.variable_scope("words"):
        self.word_embeddings = self.bert(
            inputs=self.bert_inputs, as_dict=True,
            signature="tokens")["sequence_output"][:, 1:-1, ...]
        print("word_embeddings (after lookup) ", self.word_embeddings)

    with tf.variable_scope("entities"):
        from preprocessing.util import load_wikiid2nnid
        self.nentities = len(load_wikiid2nnid(extension_name=self.args.entity_extension))
        _entity_embeddings = tf.Variable(
            tf.constant(0.0, shape=[self.nentities, 300]),
            name="_entity_embeddings",
            dtype=tf.float32,
            trainable=True)
        self.entity_embeddings_placeholder = tf.placeholder(tf.float32, [self.nentities, 300])
        self.entity_embedding_init = _entity_embeddings.assign(self.entity_embeddings_placeholder)

        self.entity_embeddings = tf.nn.embedding_lookup(_entity_embeddings, self.cand_entities,
                                                        name="entity_embeddings")
        # self.entity_embeddings = util.ffnn(self.entity_embeddings, 1, 300, 300, dropout=None)
        self.pure_entity_embeddings = self.entity_embeddings
        if self.args.ent_vecs_regularization.startswith("l2"):  # 'l2' or 'l2dropout'
            self.entity_embeddings = tf.nn.l2_normalize(self.entity_embeddings, dim=3)
            # not necessary since I do normalization in the entity embed creation as well, just for safety
        if self.args.ent_vecs_regularization == "dropout" or \
                self.args.ent_vecs_regularization == "l2dropout":
            self.entity_embeddings = tf.nn.dropout(self.entity_embeddings, self.dropout)
def add_embeddings_op(self):
    """Defines self.word_embeddings"""
    with tf.variable_scope("words"):
        _word_embeddings = tf.Variable(
            tf.constant(0.0, shape=[self.nwords, 300]),
            name="_word_embeddings",
            dtype=tf.float32,
            trainable=False)
        self.word_embeddings_placeholder = tf.placeholder(tf.float32, [self.nwords, 300])
        self.word_embedding_init = _word_embeddings.assign(self.word_embeddings_placeholder)

        word_embeddings = tf.nn.embedding_lookup(_word_embeddings, self.words,
                                                 name="word_embeddings")
        self.pure_word_embeddings = word_embeddings
        #print("word_embeddings (after lookup) ", word_embeddings)

    with tf.variable_scope("chars"):
        if self.args.use_chars:
            # get char embeddings matrix
            _char_embeddings = tf.get_variable(
                name="_char_embeddings",
                dtype=tf.float32,
                shape=[self.nchars, self.args.dim_char],
                trainable=True)
            char_embeddings = tf.nn.embedding_lookup(_char_embeddings, self.chars,
                                                     name="char_embeddings")
            # char_embeddings: tf.float32, shape=[None, None, None, dim_char],
            # shape = (batch size, max length of sentence, max length of word, dim_char)

            # put the time dimension on axis=1
            s = tf.shape(char_embeddings)
            char_embeddings = tf.reshape(char_embeddings,
                                         shape=[s[0] * s[1], s[-2], self.args.dim_char])
            # (batch*sent_length, characters of word, dim_char)
            char_lengths = tf.reshape(self.chars_len, shape=[s[0] * s[1]])
            # shape = (batch_size*max_length of sentence)

            # bi lstm on chars
            cell_fw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_char, state_is_tuple=True)
            cell_bw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_char, state_is_tuple=True)
            _output = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, char_embeddings,
                sequence_length=char_lengths, dtype=tf.float32)

            # read and concat output
            _, ((_, output_fw), (_, output_bw)) = _output
            output = tf.concat([output_fw, output_bw], axis=-1)
            # shape = (batch size, max sentence length, char hidden size)
            output = tf.reshape(output, shape=[s[0], s[1], 2 * self.args.hidden_size_char])
            #print("output after char lstm ", output)

            # concatenate word and char embeddings
            word_embeddings = tf.concat([word_embeddings, output], axis=-1)
            #print("word_embeddings with char after concatenation ", word_embeddings)
            # (batch, words, 300+2*100)

    self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)

    with tf.variable_scope("entities"):
        from preprocessing.util import load_wikiid2nnid
        self.nentities = len(load_wikiid2nnid(extension_name=self.args.entity_extension))
        _entity_embeddings = tf.Variable(
            tf.constant(0.0, shape=[self.nentities, 300]),
            name="_entity_embeddings",
            dtype=tf.float32,
            trainable=self.args.train_ent_vecs)
        self.entity_embeddings_placeholder = tf.placeholder(tf.float32, [self.nentities, 300])
        self.entity_embedding_init = _entity_embeddings.assign(self.entity_embeddings_placeholder)

        self.entity_embeddings = tf.nn.embedding_lookup(_entity_embeddings, self.cand_entities,
                                                        name="entity_embeddings")
        self.pure_entity_embeddings = self.entity_embeddings
        if self.args.ent_vecs_regularization.startswith("l2"):  # 'l2' or 'l2dropout'
            self.entity_embeddings = tf.nn.l2_normalize(self.entity_embeddings, dim=3)
            # not necessary since I do normalization in the entity embed creation as well, just for safety
        if self.args.ent_vecs_regularization == "dropout" or \
                self.args.ent_vecs_regularization == "l2dropout":
            self.entity_embeddings = tf.nn.dropout(self.entity_embeddings, self.dropout)
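# Minimal usage sketch (not from the original file): feeding pretrained entity vectors
# into the assign op defined in add_embeddings_op() above. `model` stands for an instance
# whose graph is already built, and `ent_vecs` is a hypothetical numpy array of shape
# [nentities, 300] loaded elsewhere; both names are placeholders for illustration only.
import numpy as np
import tensorflow as tf

def init_entity_embeddings(sess, model, ent_vecs):
    # one-time assignment through the placeholder; afterwards the variable holds the
    # pretrained vectors and is fine-tuned only if train_ent_vecs / trainable=True
    sess.run(model.entity_embedding_init,
             feed_dict={model.entity_embeddings_placeholder: ent_vecs})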
def __init__(self):
    self._generator = SamplesGenerator()
    self._word2id, self._char2id = build_word_char_maps()
    #self._word2id, self._char2id = build_word_char_maps_restore()   # alternative
    self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
def __init__(self, train_args, args):
    self.args = args
    # input pipeline
    self.streaming_samples = StreamingSamples()
    ds = tf.data.Dataset.from_generator(
        self.streaming_samples.gen,
        (tf.int64, tf.int64, tf.int64, tf.int64,   # words, words_len, chars, chars_len
         tf.int64, tf.int64, tf.int64,             # begin_span, end_span, span_len
         tf.int64, tf.float32, tf.int64),          # cand_entities, cand_entities_scores, cand_entities_len
        (tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([None, None]),
         tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([None]),
         tf.TensorShape([]), tf.TensorShape([None, None]), tf.TensorShape([None, None]),
         tf.TensorShape([None])))
    next_element = ds.make_one_shot_iterator().get_next()
    # batch size = 1, so expand the dims here to match the training input that has a batch dimension
    next_element = [tf.expand_dims(t, 0) for t in next_element]
    next_element = [None, *next_element[:-1], None, next_element[-1], None, None, None, None]

    # restore model
    print("loading Model:", train_args.output_folder)
    model = Model(train_args, next_element)
    model.build()
    checkpoint_path = model.restore_session("el" if args.el_mode else "ed")
    self.model = model
    if args.hardcoded_thr:
        self.thr = args.hardcoded_thr
        print("threshold used:", self.thr)
    else:
        # optimal threshold recovery from log files:
        # based on the selected checkpoint, look up the threshold in the log file (otherwise recompute it)
        self.thr = retrieve_optimal_threshold_from_logfile(train_args.output_folder,
                                                           checkpoint_path, args.el_mode)
        print("optimal threshold selected = ", self.thr)

    if args.running_mode == "el_mode":
        args.el_mode = True
    elif args.running_mode == "ed_mode":
        args.el_mode = False

    # convert text to tensors for the NN
    with open(args.experiment_folder + "word_char_maps.pickle", 'rb') as handle:
        self.word2id, _, self.char2id, _, _, _ = pickle.load(handle)

    self.wikiid2nnid = load_wikiid2nnid(extension_name=args.entity_extension)
    self.nnid2wikiid = reverse_dict(self.wikiid2nnid, unique_values=True)
    _, self.wiki_id_name_map = load_wiki_name_id_map()

    with open(args.experiment_folder + "prepro_args.pickle", 'rb') as handle:
        self.prepro_args = pickle.load(handle)
    if args.lowercase_spans_pem:
        self.prepro_args.lowercase_p_e_m = True
        self.prepro_args.lowercase_spans = True
    print("prepro_args:", self.prepro_args)
    self.prepro_args.persons_coreference = args.persons_coreference
    self.prepro_args.persons_coreference_merge = args.persons_coreference_merge
    self.fetchFilteredCoreferencedCandEntities = FetchFilteredCoreferencedCandEntities(self.prepro_args)
    prepro_util.args = self.prepro_args

    self.special_tokenized_words = {"``", '"', "''"}
    self.special_words_assertion_errors = 0
    self.gm_idx_errors = 0
    if self.args.el_with_stanfordner_and_our_ed:
        from nltk.tag import StanfordNERTagger
        self.st = StanfordNERTagger(
            '../data/stanford_core_nlp/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../data/stanford_core_nlp/stanford-ner-2018-02-27/stanford-ner.jar',
            encoding='utf-8')
    self.from_myspans_to_given_spans_map_errors = 0
def __init__(self):
    self._generator = SamplesGenerator()
    self._batcher = TokenBatcher(config.base_folder + "data/vocabulary/" + "vocab_2.txt")
    self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
    self._wikii2summary = util.load_entity_summary_map()
def __init__(self):
    self._generator = SamplesGenerator()
    self._batcher = create_tokenizer_from_hub_module()
    self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
    self._wikii2summary = util.load_entity_summary_map()