Example #1
    def flush_entity_universe(self):
        print("len(self.entities_universe) =", len(self.entities_universe))
        entities_folder = config.base_folder + "data/entities/extension_entities/"
        _, wiki_id_name_map = load_wiki_name_id_map()
        if not os.path.exists(entities_folder):
            os.makedirs(entities_folder)

        def dump_entities(entity_set, name):
            with open(entities_folder + name + ".pickle", 'wb') as handle:
                pickle.dump(entity_set, handle)
            with open(entities_folder + name + ".txt", "w") as fout:
                for ent_id in entity_set:
                    fout.write(ent_id + "\t" +
                               wiki_id_name_map[ent_id].replace(' ', '_') +
                               "\n")

        dump_entities(self.entities_universe, "entities_universe")
        # now compute the extension, i.e. omit from this universe the entities we have already trained
        extension_entity_set = set()
        wikiid2nnid = load_wikiid2nnid()
        for wikiid in self.entities_universe:
            if wikiid not in wikiid2nnid:
                extension_entity_set.add(wikiid)

        print("len(extension_entity_set) =", len(extension_entity_set))
        dump_entities(extension_entity_set, "extension_entities")
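The extension set computed above is just the set difference between the harvested entity universe and the entities that already have trained embeddings. A minimal standalone sketch of that step, assuming load_wikiid2nnid() returns a dict mapping Wikipedia page ids (strings) to neural-network ids (compute_extension_entities and the toy data below are illustrative, not part of the project):

    def compute_extension_entities(entities_universe, wikiid2nnid):
        """Return the entity ids that do not yet have a trained embedding."""
        return {wikiid for wikiid in entities_universe if wikiid not in wikiid2nnid}

    # toy usage: entity "2" is in the universe but has no trained embedding
    assert compute_extension_entities({"1", "2", "3"}, {"1": 0, "3": 1}) == {"2"}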
Example #2
    def add_embeddings_op(self):
        """Defines self.word_embeddings"""
        b_size = tf.shape(self.cand_entities_ids)[0]
        cand_spans = tf.shape(self.cand_entities_ids)[1]
        cand_ents = tf.shape(self.cand_entities_ids)[2]
        entities = tf.reshape(self.cand_entities_ids, [b_size, cand_spans, cand_ents // 22, 22])
        entities = tf.reshape(entities, [-1, 22])
        zeros_count = tf.reduce_sum(tf.cast(tf.equal(entities, 0), tf.int32), axis=1)
        lengths = tf.math.maximum(0, 20 - zeros_count)

        with tf.variable_scope('bilm_1'):
            entitites_embeddings_op = self.entity_bilm(entities) # [batch_size, max_token]
        with tf.variable_scope('bilm_2'):
            words_embeddings_op = self.bilm(self.words)

        with tf.variable_scope("words"):
            self.word_embeddings = weight_layers('words', words_embeddings_op, l2_coef=0.0)['weighted_op']
            print("word_embeddings (after lookup) ", self.word_embeddings)

        with tf.variable_scope("entities"):
            from preprocessing.util import load_wikiid2nnid
            self.nentities = len(load_wikiid2nnid(extension_name=self.args.entity_extension))
            self.entity_embeddings = tf.reshape(weight_layers(
                'entities',
                 entitites_embeddings_op,
                 l2_coef=0.0
            )['weighted_op'], [b_size, cand_spans, cand_ents // 22, 20, -1]) # [batch_size, max_token, vdim]

            #cell_fw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm // 2)
            #cell_bw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm // 2)
            #(output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
            #        cell_fw, cell_bw, output,
            #        sequence_length=lengths, dtype=tf.float32)
            #output = tf.concat([output_fw, output_bw], axis=-1)
            #output = tf.concat([output[:, 0, :], output[:, -1, :]], axis=-1)

            # coeffs = tf.nn.softmax(tf.squeeze(tf.layers.dense(output, 1)))
            # output = tf.reduce_sum(output * coeffs[..., None], 1)
            # self.entity_embeddings = tf.layers.dense(tf.reshape(output, [b_size, cand_spans, cand_ents // 22, 256]), 300)

            #mask = tf.math.logical_not(tf.equal(entities, 0)[:, 1:-1])
            #Q = tf.layers.dense(output, self.args.hidden_size_lstm)  # [batch_size, sequence_length, hidden_dim]
            #K = tf.layers.dense(output, self.args.hidden_size_lstm)  # [batch_size, sequence_length, hidden_dim]
            #V = tf.layers.dense(output, 300)  # [batch_size, sequence_length, n_classes]
            #query_value_attention_seq = tf.keras.layers.Attention()([Q, V, K], [mask, mask])
            #query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(query_value_attention_seq)
            #self.entity_embeddings = tf.reshape(query_value_attention, [b_size, cand_spans, cand_ents // 22, -1])


            # self.entity_embeddings = util.ffnn(self.entity_embeddings, 1, 300, 300, dropout=None)
            self.pure_entity_embeddings = self.entity_embeddings
            if self.args.ent_vecs_regularization.startswith("l2"):  # 'l2' or 'l2dropout'
                self.entity_embeddings = tf.nn.l2_normalize(self.entity_embeddings, dim=3)
                # not necessary since i do normalization in the entity embed creation as well, just for safety
            if self.args.ent_vecs_regularization == "dropout" or \
                            self.args.ent_vecs_regularization == "l2dropout":
                self.entity_embeddings = tf.nn.dropout(self.entity_embeddings, self.dropout)
Example #3
    def __init__(self, output_folder, predictions_folder, entity_extension=None):
        self.thr = None
        self.output_folder = output_folder
        self.predictions_folder = predictions_folder
        with open(output_folder+"word_char_maps.pickle", 'rb') as handle:
            _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)

        self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
        _, self.wiki_id_name_map = load_wiki_name_id_map()
        self.extra_info = ""
Example #4
    def __init__(self,
                 output_folder,
                 predictions_folder,
                 entity_extension=None,
                 gm_bucketing_pempos=None,
                 print_global_voters=False,
                 print_global_pairwise_scores=False):
        self.thr = None
        self.output_folder = output_folder
        self.predictions_folder = predictions_folder
        with open(output_folder + "word_char_maps.pickle", 'rb') as handle:
            _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)

        self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension),
                                        unique_values=True)
        _, self.wiki_id_name_map = load_wiki_name_id_map()
        self.extra_info = ""
        self.gm_bucketing = GMBucketingResults(
            gm_bucketing_pempos) if gm_bucketing_pempos else None
        self.print_global_pairwise_scores = print_global_pairwise_scores
        self.print_global_voters = print_global_voters
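Examples #3 and #4 invert the wikiid2nnid mapping with reverse_dict(..., unique_values=True) so that network ids can be translated back to Wikipedia ids. A minimal sketch of the assumed behaviour (reverse_dict_sketch is illustrative, not the project's implementation):

    def reverse_dict_sketch(d, unique_values=False):
        """Invert {wiki_id: nn_id} into {nn_id: wiki_id}. With unique_values=True
        we assume every value occurs exactly once, so the inversion is lossless."""
        inverted = {}
        for key, value in d.items():
            if unique_values and value in inverted:
                raise ValueError("duplicate value: {}".format(value))
            inverted[value] = key
        return inverted

    # toy usage
    nnid2wikiid = reverse_dict_sketch({"wiki_42": 0, "wiki_7": 1}, unique_values=True)
    assert nnid2wikiid == {0: "wiki_42", 1: "wiki_7"}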
Example #5
    def add_embeddings_op(self):
        """Defines self.word_embeddings"""
        with tf.variable_scope("words"):
            self.word_embeddings = self.bert(
                inputs=self.bert_inputs, as_dict=True,
                signature="tokens")["sequence_output"][:, 1:-1, ...]
            print("word_embeddings (after lookup) ", self.word_embeddings)

        with tf.variable_scope("entities"):
            from preprocessing.util import load_wikiid2nnid
            self.nentities = len(
                load_wikiid2nnid(extension_name=self.args.entity_extension))
            _entity_embeddings = tf.Variable(tf.constant(
                0.0, shape=[self.nentities, 300]),
                                             name="_entity_embeddings",
                                             dtype=tf.float32,
                                             trainable=True)

            self.entity_embeddings_placeholder = tf.placeholder(
                tf.float32, [self.nentities, 300])
            self.entity_embedding_init = _entity_embeddings.assign(
                self.entity_embeddings_placeholder)

            self.entity_embeddings = tf.nn.embedding_lookup(
                _entity_embeddings,
                self.cand_entities,
                name="entity_embeddings")
            # self.entity_embeddings = util.ffnn(self.entity_embeddings, 1, 300, 300, dropout=None)
            self.pure_entity_embeddings = self.entity_embeddings
            if self.args.ent_vecs_regularization.startswith(
                    "l2"):  # 'l2' or 'l2dropout'
                self.entity_embeddings = tf.nn.l2_normalize(
                    self.entity_embeddings, dim=3)
                # not necessary since i do normalization in the entity embed creation as well, just for safety
            if self.args.ent_vecs_regularization == "dropout" or \
                            self.args.ent_vecs_regularization == "l2dropout":
                self.entity_embeddings = tf.nn.dropout(self.entity_embeddings,
                                                       self.dropout)
Example #6
    def add_embeddings_op(self):
        """Defines self.word_embeddings"""
        with tf.variable_scope("words"):
            _word_embeddings = tf.Variable(tf.constant(
                0.0, shape=[self.nwords, 300]),
                                           name="_word_embeddings",
                                           dtype=tf.float32,
                                           trainable=False)

            self.word_embeddings_placeholder = tf.placeholder(
                tf.float32, [self.nwords, 300])
            self.word_embedding_init = _word_embeddings.assign(
                self.word_embeddings_placeholder)

            word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
                                                     self.words,
                                                     name="word_embeddings")
            self.pure_word_embeddings = word_embeddings
            #print("word_embeddings (after lookup) ", word_embeddings)

        with tf.variable_scope("chars"):
            if self.args.use_chars:
                # get char embeddings matrix
                _char_embeddings = tf.get_variable(
                    name="_char_embeddings",
                    dtype=tf.float32,
                    shape=[self.nchars, self.args.dim_char],
                    trainable=True)
                char_embeddings = tf.nn.embedding_lookup(
                    _char_embeddings, self.chars, name="char_embeddings")

                # char_embeddings: tf.float32, shape=[None, None, None, dim_char],
                # shape = (batch size, max length of sentence, max length of word, dim_char)
                # put the time dimension on axis=1
                s = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(
                    char_embeddings,
                    shape=[s[0] * s[1], s[-2], self.args.dim_char])
                # (batch*sent_length, characters of word, dim_char)

                char_lengths = tf.reshape(self.chars_len, shape=[s[0] * s[1]])
                # shape = (batch_size*max_length of sentence)

                # bi lstm on chars
                cell_fw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_char,
                                                  state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_char,
                                                  state_is_tuple=True)
                _output = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw,
                    cell_bw,
                    char_embeddings,
                    sequence_length=char_lengths,
                    dtype=tf.float32)

                # read and concat output
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # shape = (batch size, max sentence length, char hidden size)
                output = tf.reshape(
                    output, shape=[s[0], s[1], 2 * self.args.hidden_size_char])
                #print("output after char lstm ", output)
                word_embeddings = tf.concat(
                    [word_embeddings, output],
                    axis=-1)  # concatenate word and char embeddings
                #print("word_embeddings with char after concatenation ", word_embeddings)
                # (batch, words, 300+2*100)
        self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)

        with tf.variable_scope("entities"):
            from preprocessing.util import load_wikiid2nnid
            self.nentities = len(
                load_wikiid2nnid(extension_name=self.args.entity_extension))
            _entity_embeddings = tf.Variable(
                tf.constant(0.0, shape=[self.nentities, 300]),
                name="_entity_embeddings",
                dtype=tf.float32,
                trainable=self.args.train_ent_vecs)

            self.entity_embeddings_placeholder = tf.placeholder(
                tf.float32, [self.nentities, 300])
            self.entity_embedding_init = _entity_embeddings.assign(
                self.entity_embeddings_placeholder)

            self.entity_embeddings = tf.nn.embedding_lookup(
                _entity_embeddings,
                self.cand_entities,
                name="entity_embeddings")
            self.pure_entity_embeddings = self.entity_embeddings
            if self.args.ent_vecs_regularization.startswith(
                    "l2"):  # 'l2' or 'l2dropout'
                self.entity_embeddings = tf.nn.l2_normalize(
                    self.entity_embeddings, dim=3)
                # not necessary since i do normalization in the entity embed creation as well, just for safety
            if self.args.ent_vecs_regularization == "dropout" or \
                            self.args.ent_vecs_regularization == "l2dropout":
                self.entity_embeddings = tf.nn.dropout(self.entity_embeddings,
                                                       self.dropout)
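Examples #5 and #6 load pretrained entity vectors through a placeholder/assign pair instead of embedding a huge constant in the graph definition. A minimal, self-contained sketch of feeding such an init op (TF 1.x graph mode; the random array is a stand-in for the project's pretrained vectors):

    import numpy as np
    import tensorflow as tf  # TF 1.x style API, as in the examples above

    nentities, dim = 1000, 300
    _entity_embeddings = tf.Variable(tf.constant(0.0, shape=[nentities, dim]),
                                     name="_entity_embeddings", trainable=False)
    entity_embeddings_placeholder = tf.placeholder(tf.float32, [nentities, dim])
    entity_embedding_init = _entity_embeddings.assign(entity_embeddings_placeholder)

    pretrained = np.random.rand(nentities, dim).astype(np.float32)  # stand-in data
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # feed the pretrained matrix once, before training/inference starts
        sess.run(entity_embedding_init,
                 feed_dict={entity_embeddings_placeholder: pretrained})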
Example #7
    def __init__(self):
        self._generator = SamplesGenerator()
        self._word2id, self._char2id = build_word_char_maps()
        # self._word2id, self._char2id = build_word_char_maps_restore()  # alternative
        self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
Example #8
    def __init__(self, train_args, args):
        self.args = args
        # input pipeline
        self.streaming_samples = StreamingSamples()
        ds = tf.data.Dataset.from_generator(
            self.streaming_samples.gen,
            (
                tf.int64,    # words
                tf.int64,    # words_len
                tf.int64,    # chars
                tf.int64,    # chars_len
                tf.int64,    # begin_span
                tf.int64,    # end_span
                tf.int64,    # span_len
                tf.int64,    # cand_entities
                tf.float32,  # cand_entities_scores
                tf.int64     # cand_entities_len
            ),
            (
                tf.TensorShape([None]),        # words
                tf.TensorShape([]),            # words_len
                tf.TensorShape([None, None]),  # chars
                tf.TensorShape([None]),        # chars_len
                tf.TensorShape([None]),        # begin_span
                tf.TensorShape([None]),        # end_span
                tf.TensorShape([]),            # span_len
                tf.TensorShape([None, None]),  # cand_entities
                tf.TensorShape([None, None]),  # cand_entities_scores
                tf.TensorShape([None])         # cand_entities_len
            ))
        next_element = ds.make_one_shot_iterator().get_next()
        # batch size = 1; expand the dims now to match the training setup, which has a batch dimension
        next_element = [tf.expand_dims(t, 0) for t in next_element]
        next_element = [
            None, *next_element[:-1], None, next_element[-1], None, None, None,
            None
        ]

        # restore model
        print("loading Model:", train_args.output_folder)
        model = Model(train_args, next_element)
        model.build()
        checkpoint_path = model.restore_session("el" if args.el_mode else "ed")
        self.model = model
        if args.hardcoded_thr:
            self.thr = args.hardcoded_thr
            print("threshold used:", self.thr)
        else:
            # optimal threshold recovery from log files:
            # based on the selected checkpoint, look up the threshold in the log file (otherwise recompute it)
            self.thr = retrieve_optimal_threshold_from_logfile(
                train_args.output_folder, checkpoint_path, args.el_mode)
            print("optimal threshold selected = ", self.thr)

        if args.running_mode == "el_mode":
            args.el_mode = True
        elif args.running_mode == "ed_mode":
            args.el_mode = False

        # convert text to tensors for the NN
        with open(args.experiment_folder + "word_char_maps.pickle",
                  'rb') as handle:
            self.word2id, _, self.char2id, _, _, _ = pickle.load(handle)

        self.wikiid2nnid = load_wikiid2nnid(
            extension_name=args.entity_extension)
        self.nnid2wikiid = reverse_dict(self.wikiid2nnid, unique_values=True)
        _, self.wiki_id_name_map = load_wiki_name_id_map()

        with open(args.experiment_folder + "prepro_args.pickle",
                  'rb') as handle:
            self.prepro_args = pickle.load(handle)
            if args.lowercase_spans_pem:
                self.prepro_args.lowercase_p_e_m = True
                self.prepro_args.lowercase_spans = True
        print("prepro_args:", self.prepro_args)
        self.prepro_args.persons_coreference = args.persons_coreference
        self.prepro_args.persons_coreference_merge = args.persons_coreference_merge
        self.fetchFilteredCoreferencedCandEntities = FetchFilteredCoreferencedCandEntities(
            self.prepro_args)
        prepro_util.args = self.prepro_args

        self.special_tokenized_words = {"``", '"', "''"}
        self.special_words_assertion_errors = 0
        self.gm_idx_errors = 0
        if self.args.el_with_stanfordner_and_our_ed:
            from nltk.tag import StanfordNERTagger
            self.st = StanfordNERTagger(
                '../data/stanford_core_nlp/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
                '../data/stanford_core_nlp/stanford-ner-2018-02-27/stanford-ner.jar',
                encoding='utf-8')
        self.from_myspans_to_given_spans_map_errors = 0
Example #9
    def __init__(self):
        self._generator = SamplesGenerator()
        self._batcher = TokenBatcher(config.base_folder + "data/vocabulary/" + "vocab_2.txt")
        self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
        self._wikii2summary = util.load_entity_summary_map()

    def __init__(self):
        self._generator = SamplesGenerator()
        self._batcher = create_tokenizer_from_hub_module()
        self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
        self._wikii2summary = util.load_entity_summary_map()