Example #1
def load_ELMo_data(filename, seq_len, entity_len):
    vocab_file = "./ELMo_file/vocab.txt"
    batcher = TokenBatcher(vocab_file)
    entity_list, token_list, _ = read_data(filename)

    entity_id_list, token_id_list = [], []
    real_chars_list, seq_lens_list = [], []
    for index in range(len(token_list)):
        token_id_list.append(token_list[index][:seq_len])
        entity_id_list.append(entity_list[index][:entity_len])

        real_seq_len = min(len(token_list[index]), seq_len)
        tmp = [1] * real_seq_len
        tmp.extend([0] * (seq_len - real_seq_len))
        seq_lens_list.append(real_seq_len)
        real_chars_list.append(tmp)

    entity_pad = batcher.batch_sentences(entity_id_list)
    token_pad = batcher.batch_sentences(token_id_list)

    print("The shape of tokens after loading vocab:", token_pad.shape)

    # Pack the features of each example
    features = []
    for index in range(len(token_list)):
        curr_features = [
            entity_pad[index],
            token_pad[index],
            real_chars_list[index],
            seq_lens_list[index],
        ]
        features.append(curr_features)

    return np.array(features)
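A minimal usage sketch of the function above (the data path and lengths are placeholders, and the vocab file from the function body is assumed to exist):

features = load_ELMo_data("train.txt", seq_len=128, entity_len=8)  # hypothetical path and lengths
entity_ids, token_ids, real_chars, seq_length = features[0]        # one packed example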
class elmo():
    def __init__(self):
        self.vocab_file = 'vocab_small.txt'
        # Location of pretrained LM.  Here we use the test fixtures.
        datadir = os.path.join('pretrained')
        options_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
        weight_file = os.path.join(
            datadir, 'elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')

        # Dump the token embeddings to a file. Run this once for your dataset.
        token_embedding_file = 'elmo_token_embeddings.hdf5'
        dump_token_embeddings(self.vocab_file, options_file, weight_file,
                              token_embedding_file)

        self.batcher = TokenBatcher(self.vocab_file)
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file)
        # Get ops to compute the LM embeddings.
        context_embeddings_op = bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input',
                                                context_embeddings_op,
                                                l2_coef=0.0)
        self.elmo_context_output = weight_layers('output',
                                                 context_embeddings_op,
                                                 l2_coef=0.0)

    def get_emb(self, tokenized_context):
        all_tokens = set(['<S>', '</S>'])
        for context_sentence in tokenized_context:
            for token in context_sentence:
                all_tokens.add(token)
        with open(self.vocab_file, 'w') as fout:
            fout.write('\n'.join(all_tokens))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # Create batches of data.
            context_ids = self.batcher.batch_sentences(tokenized_context)
            # Input
            elmo_context_input_ = sess.run(
                [self.elmo_context_input['weighted_op']],
                feed_dict={self.context_token_ids: context_ids})
            # For output
            elmo_context_output_ = sess.run(
                [self.elmo_context_output['weighted_op']],
                feed_dict={self.context_token_ids: context_ids})
        return elmo_context_input_, elmo_context_output_
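A rough usage sketch of the class above, assuming the pretrained option/weight files referenced in __init__ are present:

encoder = elmo()
tokenized = [['Pretrained', 'biLMs', 'compute', 'representations', '.']]
emb_in, emb_out = encoder.get_emb(tokenized)
# each is a list holding one array of shape (batch, n_tokens, dim)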
Example #3
def contextualize(sequences):
    batcher = TokenBatcher(vocab_file)

    with tf.Session() as sess:
        # It is necessary to initialize variables once before running inference.
        sess.run(tf.global_variables_initializer())

        # Create batches of data.
        context_ids = batcher.batch_sentences(sequences)

        # Compute ELMo representations (here for the input only, for simplicity).
        elmo_context_output_ = sess.run(
            [elmo_context_output['weighted_op']],
            feed_dict={context_token_ids: context_ids})
    # print(np.array(elmo_context_output_).shape)
    # print(elmo_context_output_) #contextualized embedding vector sequences
    return elmo_context_output_
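contextualize relies on graph objects built at module level; a sketch of the setup it assumes (same pattern as Example #15, with placeholder file paths):

vocab_file = 'vocab.txt'                       # placeholder paths
options_file = 'options.json'
weight_file = 'weights.hdf5'
token_embedding_file = 'vocab_embedding.hdf5'

context_token_ids = tf.placeholder('int32', shape=(None, None))
bilm = BidirectionalLanguageModel(options_file,
                                  weight_file,
                                  use_character_inputs=False,
                                  embedding_weight_file=token_embedding_file)
context_embeddings_op = bilm(context_token_ids)
elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)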
Example #4
class elmo_encoder(object):
    def __init__(self):
        self.max_batch = 120000
        print ("WARNING: Currently max_batch_size of elmo encoder is set to", self.max_batch)
        pass
    
    def build(self, options_file, weight_file, vocab_file, token_embedding_file):
        self._bilm = BidirectionalLanguageModel(
            options_file,
            weight_file,
            use_character_inputs=False,
            embedding_weight_file=token_embedding_file,
            max_batch_size = self.max_batch)
        self._token_batcher = TokenBatcher(vocab_file)
        #self.length = length
    
    # `sentences` must be a list of word lists, e.g. [['You', 'see', '?'], ['That', 'is', 'very', 'interesting', '.']]
    def embed_sent_batch(self, sentences, length):
        sentences_tokenid = self._token_batcher.batch_sentences(sentences)
        # s_tokenid = s_tokenid[1:][:-1]
        tf.reset_default_graph()
        processed_sentences_tokenid = []
        length += 2 # Take into account <s> and </s>
        for s_tokenid in sentences_tokenid:
            if (len(s_tokenid) >= length):
                s_tokenid = s_tokenid[:length]
            else:
                s_tokenid = np.pad(s_tokenid, (0, length - len(s_tokenid)), 'constant', constant_values=(0))
            #s_tokenid = np.expand_dims(s_tokenid, axis=0)
            processed_sentences_tokenid.append(s_tokenid)
        batch_size = len(processed_sentences_tokenid)
        processed_sentences_tokenid = np.array(processed_sentences_tokenid)
        # tf
        with tf.device("/cpu:0"):
            context_token_ids = tf.placeholder('int32', shape=(batch_size, length))
            context_embeddings_op = self._bilm(context_token_ids)
            elmo_context_output = weight_layers('output', context_embeddings_op, l2_coef=0.0)['weighted_op']
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            print ('++++++Check_point_1\n')
            with tf.Session(config=config) as sess:
                sess.run([tf.global_variables_initializer()])
                elmo_context_output_ = sess.run([elmo_context_output],feed_dict={context_token_ids: processed_sentences_tokenid})[0]
        #print (elmo_context_output_.shape)
        return elmo_context_output_
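A usage sketch under hypothetical file paths; `length` is the number of word tokens kept per sentence (two extra positions are reserved internally for <S> and </S>):

enc = elmo_encoder()
enc.build('options.json', 'weights.hdf5', 'vocab.txt', 'token_embeddings.hdf5')  # hypothetical paths
vecs = enc.embed_sent_batch([['You', 'see', '?'],
                             ['That', 'is', 'very', 'interesting', '.']], length=10)
# vecs is roughly (2, length, dim); the biLM ops strip the special begin/end tokens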
Example #5
            def elmo(reviews, inputData):
                """
                对每个输入的batcher都动态的生成词向量表示
                """
                # TokenBatcher是生成词表示的batch类
                batcher = TokenBatcher(config.vocabFile)
                with tf.Session() as sess:
                    sess.run(tf.global_variables_initializer())

                    # Create batches of data
                    inputDataIndex = batcher.batch_sentences(reviews)
                    #print("inputDataIndex:{}".format(inputDataIndex))

                    # Compute the ELMo representations
                    elmoInputVec = sess.run(
                        [elmoInput["weighted_op"]],
                        feed_dict={inputData: inputDataIndex})
                    return elmoInputVec
Example #6
        def elmo(reviews):
            """
            对每一个输入的batch都动态的生成词向量表示
            """

            #           tf.reset_default_graph()
            # TokenBatcher maps tokenized sentences to batches of token ids
            batcher = TokenBatcher(config.vocabFile)
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())

                # Create batches of data
                inputDataIndex = batcher.batch_sentences(reviews)

                # Compute the ELMo representations
                elmoInputVec = sess.run([elmoInput['weighted_op']],
                                        feed_dict={inputData: inputDataIndex})

            return elmoInputVec
def dump_token_bilm_embeddings(vocab_file, dataset_file, options_file,
                               weight_file, embedding_weight_file, outfile):

    batcher = TokenBatcher(vocab_file)

    ids_placeholder = tf.placeholder('int32', shape=(None, None))

    model = BidirectionalLanguageModel(
        options_file,
        weight_file,
        use_character_inputs=False,
        embedding_weight_file=embedding_weight_file)
    ops = model(ids_placeholder)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, \
                h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                token_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: token_ids})
                embedding = embeddings[0, :, :, :]
                ds = fout.create_dataset('{}'.format(sentence_id),
                                         embedding.shape,
                                         dtype='float32',
                                         data=embedding)
                # static_token_emb = embedding[0, :, :]
                # first_layer_emb = embedding[1, :, :]
                # final_layer_emb = embedding[2, :, :]
                # avg_emb = np.mean(embedding, axis=0)  # average embedding of the three layers
                sentence_id += 1
                if sentence_id % 500 == 0:
                    print('%.2f%% finished!' %
                          (sentence_id / float(EXAMPLE_COUNT) * 100))
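Each sentence is stored under its line index; a small sketch of reading the dump back (hypothetical output path):

import h5py
import numpy as np

with h5py.File('bilm_embeddings.hdf5', 'r') as fin:    # hypothetical path
    emb = fin['0'][...]                 # shape (3, n_tokens, dim): one row per biLM layer
    avg_emb = np.mean(emb, axis=0)      # layer-averaged embedding, as in the comments above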
Example #8
class Tokenizer(object):
    def __init__(self,
                 vocab_file,
                 max_seq_length,
                 max_token_length=None,
                 stroke_vocab_file=None,
                 tran2sim=False,
                 sim2tran=False):
        self.vocab_file = vocab_file
        self.max_seq_length = max_seq_length
        self.max_token_length = max_token_length

        max_seq_length = self.max_seq_length - 2  # subtract 2 for the added <bos> and <eos> tokens
        self.token_batcher = TokenBatcher(self.vocab_file, max_seq_length)
        if max_token_length:
            self.batcher = Batcher(self.vocab_file, self.max_token_length,
                                   max_seq_length, stroke_vocab_file)

        self.convert_config = None
        if tran2sim and sim2tran:
            assert tran2sim != sim2tran
        elif tran2sim:
            self.convert_config = "t2s.json"
        elif sim2tran:
            self.convert_config = "s2t.json"

    def convert(self, text):
        """
    未轉簡繁、轉簡體、轉繁體
    很慢,不建議使用
    """
        if self.convert_config is None:
            return text
        return opencc.convert(text, config=self.convert_config)

    def tokenize(self, text):
        """
    text to token, for example:
    text='Pretrained biLMs compute representations useful for NLP tasks.'
    token=['Pretrained', 'biLMs', 'compute', 'representations', 'useful', 'for', 'NLP', 'tasks', '.']
    """
        text = self.convert(text)
        text = tokenize_chinese_chars(text)
        text = text.strip()
        tokens = []
        for word in text.split():
            tokens.extend(self._run_split_on_punc(word))
        return tokens

    def convert_tokens_to_ids(self, tokens):
        return self.token_batcher.batch_sentences([tokens])[0]

    def convert_tokens_to_char_ids(self, tokens):
        """
    tokens: tokenize(text)
    return: shape [max_seq_length * max_token_length]
    """
        # char_ids [max_seq_length, max_token_length]
        char_ids = self.batcher.batch_sentences([tokens])[0]
        # flat_char_ids [max_seq_length * max_token_length]
        flat_char_ids = [
            char_id for sublist in char_ids for char_id in sublist
        ]
        return flat_char_ids

    def _is_punctuation(self, char):
        """Checks whether `chars` is a punctuation character."""
        cp = ord(char)
        # We treat all non-letter/number ASCII as punctuation.
        # Characters such as "^", "$", and "`" are not in the Unicode
        # Punctuation class but we treat them as punctuation anyways, for
        # consistency.
        if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64)
                or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
            return True
        cat = unicodedata.category(char)
        if cat.startswith("P"):
            return True
        return False

    def _run_split_on_punc(self, text):
        """Splits punctuation on a piece of text."""
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if self._is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1
        return ["".join(x) for x in output]
batcher = TokenBatcher(vocab_file)
context_token_ids = tf.placeholder('int32', shape=(None, None))
bilm = BidirectionalLanguageModel(options_file,
                                  weight_file,
                                  use_character_inputs=False,
                                  embedding_weight_file=token_embedding_file)

# Get ops to compute the LM embeddings.
context_embeddings_op = bilm(context_token_ids)

elmo_context_top = weight_layers('output_top_only',
                                 context_embeddings_op,
                                 l2_coef=0.0,
                                 use_top_only=True)

elmo_context_output = weight_layers('output',
                                    context_embeddings_op,
                                    l2_coef=0.0)

with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())
    # Create batches of data.
    context_ids = batcher.batch_sentences(sequences)
    # Input token representations.
    elmo_context_top_ = sess.run([elmo_context_top['weighted_op']],
                                 feed_dict={context_token_ids: context_ids})
    # Output token representations.
    elmo_context_output_ = sess.run([elmo_context_output['weighted_op']],
                                    feed_dict={context_token_ids: context_ids})

print(elmo_context_output_)  # contextualized embedding vector sequences (all layers)
# Get an op to compute ELMo (a weighted average of the internal biLM layers).
# The layer weights and gamma are trainable parameters, but they are typically
# only trained when ELMo is used as part of a downstream model (e.g. InferSent).
# TODO: to use the top layer only, pass use_top_only=True
elmo_emb = weight_layers('input',
                         input_embeddings_op,
                         l2_coef=0.0,
                         use_top_only=top_layer)

batch_size = 32
elmo_size = 1024
with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    # Create batches of data.
    input_ids = batcher.batch_sentences(tokenized_txt)
    #context_ids, weights = batcher.batch_sentences(tokenized_context)
    final_res = np.zeros((len(tokenized_txt), elmo_size), dtype=np.float32)
    # run in batches of size batch_size
    for i in range(0, len(tokenized_txt), batch_size):
        j = min(i + batch_size, len(tokenized_txt))
        elmo_emb_ = sess.run(elmo_emb['weighted_op'],
                             feed_dict={input_token_ids: input_ids[i:j, :]})
        #perform averaging here ...
        res = np.array(elmo_emb_)
        idx = i
        for x, sen in zip(res, tokenized_txt[i:j]):
            avg = np.sum(x, axis=0) / len(sen)
            final_res[idx] = avg
            idx += 1
Example #11
class EncoderGenerator(object):
    """receives samples Train or Test samples and encodes everything to numbers ready to
    be transformed to tfrecords. Also filters out candidate entities that are not in the
    entity universe."""
    def __init__(self):
        self._generator = SamplesGenerator()
        self._batcher = TokenBatcher(config.base_folder+"data/vocabulary/"+"vocab_2.txt")
        self._wikiid2nnid = util.load_wikiid2nnid(args.entity_extension)
        self._wikii2summary = util.load_entity_summary_map()

    def set_gmonly_mode(self):
        self._generator.set_gmonly_mode()

    def set_allspans_mode(self):
        self._generator.set_allspans_mode()

    def is_gmonly_mode(self):
        return self._generator.is_gmonly_mode()

    def is_allspans_mode(self):
        return self._generator.is_allspans_mode()

    def process(self, filepath):
        ground_truth_errors_cnt = 0
        cand_entities_not_in_universe_cnt = 0
        samples_with_errors = 0
        for sample in self._generator.process(filepath):
            words = self._batcher.batch_sentences([sample.chunk_words]).tolist()[0]

            ground_truth_enc = [self._wikiid2nnid[gt] if gt in self._wikiid2nnid else self._wikiid2nnid["<u>"]
                            for gt in sample.ground_truth]
            ground_truth_errors_cnt += ground_truth_enc.count(self._wikiid2nnid["<u>"])   # it is always zero

            #print(colored("New sample", 'red'))
            #print(sample)
            if len(sample.begin_gm) != len(sample.end_gm) or \
                len(sample.begin_gm) != len(ground_truth_enc):
                samples_with_errors += 1
                continue
            if isinstance(sample, GmonlySample):
                cand_entities, cand_entities_ids, cand_entities_scores, cand_entities_labels, not_in_universe_cnt = \
                    self._encode_cand_entities_and_labels(
                        sample.cand_entities, sample.cand_entities_scores, sample.ground_truth)

                yield SampleEncoded(chunk_id=sample.chunk_id,
                                    words=words, words_len=len(words) - 2,
                                    begin_spans=sample.begin_gm, end_spans=sample.end_gm, spans_len=len(sample.begin_gm),
                                    cand_entities=cand_entities, cand_entities_ids=cand_entities_ids, cand_entities_scores=cand_entities_scores,
                                    cand_entities_labels=cand_entities_labels,
                                    cand_entities_len=[len(t) // 22 for t in cand_entities],
                                    ground_truth=ground_truth_enc, ground_truth_len=len(sample.ground_truth),
                                    begin_gm=[], end_gm=[])

            elif isinstance(sample, AllspansSample):
                if len(sample.begin_spans) != len(sample.end_spans):
                    samples_with_errors += 1
                    continue
                # for each span i have the gt or the value -1 if this span is not a gm
                # and then i work in the same way as above
                span_ground_truth = []
                gm_spans = list(zip(sample.begin_gm, sample.end_gm))   # [(3, 5), (10, 11), (15, 18)]
                for left, right in zip(sample.begin_spans, sample.end_spans):
                    if (left, right) in gm_spans:
                        span_ground_truth.append(sample.ground_truth[gm_spans.index((left, right))])
                    else:
                        span_ground_truth.append(-1)   # this span is not a gm
                cand_entities, cand_entities_ids, cand_entities_scores, cand_entities_labels, not_in_universe_cnt = \
                    self._encode_cand_entities_and_labels(
                        sample.cand_entities, sample.cand_entities_scores, span_ground_truth)

                yield SampleEncoded(chunk_id=sample.chunk_id,
                                    words=words, words_len=len(words) - 2,
                                    begin_spans=sample.begin_spans, end_spans=sample.end_spans, spans_len=len(sample.begin_spans),
                                    cand_entities=cand_entities, cand_entities_ids=cand_entities_ids, cand_entities_scores=cand_entities_scores,
                                    cand_entities_labels=cand_entities_labels,
                                    cand_entities_len=[len(t) // 22 for t in cand_entities],
                                    ground_truth=ground_truth_enc, ground_truth_len=len(sample.ground_truth),
                                    begin_gm=sample.begin_gm, end_gm=sample.end_gm)

            cand_entities_not_in_universe_cnt += not_in_universe_cnt
        print("ground_truth_errors_cnt =", ground_truth_errors_cnt)
        print("cand_entities_not_in_universe_cnt =", cand_entities_not_in_universe_cnt)
        print("encoder samples_with_errors =", samples_with_errors)



    def _encode_cand_entities_and_labels(self, cand_entities_p, cand_entities_scores_p,
                                        ground_truth_p):
        """receives cand_entities (list of lists), and ground_truth (list) and does the following:
        1) removes cand ent that are not in our universe
        2) creates a label 0, 1 if this candidate is correct or not (i.e. if the span is indeed a
         gold mention (row of candidate entities array) and this specific candidate entity (column
         of candidate entities array) is correct. Returns the filtered cand_entities
        and the corresponding label (they have the same shape)"""
        cand_entities = []
        cand_entities_ids = []
        cand_entities_scores = []
        cand_entities_labels = []
        not_in_universe_cnt = 0
        for cand_ent_l, cand_scores_l, gt in zip(cand_entities_p, cand_entities_scores_p, ground_truth_p):
            ent_l = []
            ids_l = []
            score_l = []
            label_l = []
            for cand_ent, score in zip(cand_ent_l, cand_scores_l):
                if cand_ent in self._wikiid2nnid:  # else continue, this entity not in our universe
                    summary = self._wikii2summary[cand_ent]
                    tokens = list(filter(None, self._batcher.batch_sentences([summary]).tolist()[0]))
                    while len(tokens) < 22:
                        tokens.append(0)
                    ent_l.append(self._wikiid2nnid[cand_ent])
                    ids_l.extend(tokens)
                    score_l.append(score)
                    label_l.append(1 if cand_ent == gt else 0)
                else:
                    not_in_universe_cnt += 1
            cand_entities.append(ent_l)
            cand_entities_ids.append(ids_l)
            cand_entities_scores.append(score_l)
            cand_entities_labels.append(label_l)
        return cand_entities, cand_entities_ids, cand_entities_scores, cand_entities_labels, not_in_universe_cnt
Example #12
def train_svm():

    print("Loading texts and labels...")
    # load training set
    train_labels, train_text_ids = \
                  load_labels(train_label_path, label_index, num_classes, one_hot=False)
    train_texts = load_texts(train_data_path, train_text_ids)
    train_size = len(train_texts)
    tokenized_train_texts = tokenize(train_texts)

    # Create a TokenBatcher to map text to token ids
    batcher = TokenBatcher(train_vocab_file)

    # restore the TextCNN model
    print("Restoring TextCNN model...")
    tf.reset_default_graph()
    cnn_path_dir = os.path.dirname(cnn_path)
    meta_file_name = os.listdir(cnn_path_dir)[-1]
    meta_path = os.path.join(cnn_path_dir, meta_file_name)

    sess = tf.InteractiveSession()
    saver = tf.train.import_meta_graph(meta_path)
    saver.restore(sess, tf.train.latest_checkpoint(os.path.dirname(cnn_path)))

    graph = tf.get_default_graph()
    input_tensors = {
        't_real_text': graph.get_tensor_by_name('real_text_input:0'),
        'input_y': graph.get_tensor_by_name('input_y:0')
    }
    feedback = {
        'loss': graph.get_tensor_by_name('loss/add:0'),
        'accuracy': graph.get_tensor_by_name('accuracy/accuracy:0')
    }
    features = graph.get_tensor_by_name('g_conv/Squeeze:0')

    # train svm
    # extract features of texts in training set first
    print(
        "Extracting features of texts in training set with trained text cnn model..."
    )
    all_features = []
    batch_num = train_size // batch_size
    for batch_no in range(batch_num):
        try:
            batch_texts = tokenized_train_texts[batch_no *
                                                batch_size:(batch_no + 1) *
                                                batch_size]
        except IndexError:
            batch_texts = tokenized_train_texts[batch_no * batch_size:]
        batch_texts = batcher.batch_sentences(batch_texts)
        batch_texts = pad_and_cut(batch_texts, MAX_LEN)

        batch_features = sess.run(
            features, feed_dict={input_tensors['t_real_text']: batch_texts})
        all_features.append(batch_features)
    train_features = np.vstack(all_features)

    sess.close()

    # train svm with training set features and labels
    print("Training svm with training set features and labels...")
    start = datetime.now()
    clf = svm.SVC(kernel='linear', C=1, class_weight='balanced')
    clf.fit(train_features, train_labels)

    # save svm model
    if not os.path.exists(os.path.dirname(svm_path)):
        os.mkdir(os.path.dirname(svm_path))

    joblib.dump(clf, svm_path)

    end = datetime.now()
    print("SVM training complete!")
    print("Train report: train time: %f" % (end - start).seconds)
Example #13
class ELMoEmbeddings(object):
    def __init__(self, hparams):
        self.hparams = hparams
        self.vocab_path = self.hparams.word_vocab_path
        self.elmo_options_file = self.hparams.elmo_options_file
        self.elmo_weight_file = self.hparams.elmo_weight_file
        self.token_embedding_file = self.hparams.elmo_token_embedding_file

        self.batcher = TokenBatcher(self.vocab_path)
        if not os.path.exists(self.token_embedding_file):
            print("making dump token embeddings")
            self._make_dump_token_embeddings()
            print("finished making dump_token_embeddings")

    def build_embeddings_op(self, context_ids_ph, utterances_ids_ph,
                            context_sentence_ids_ph):

        bilm = BidirectionalLanguageModel(
            self.elmo_options_file,
            self.elmo_weight_file,
            use_character_inputs=False,
            embedding_weight_file=self.token_embedding_file)

        context_emb_op = bilm(context_ids_ph)
        utterances_emb_op = bilm(utterances_ids_ph)
        context_sentence_emb_op = bilm(context_sentence_ids_ph)

        elmo_context_input = weight_layers('input',
                                           context_emb_op,
                                           l2_coef=0.0)
        with tf.variable_scope('', reuse=True):
            elmo_utterances_input = weight_layers('input',
                                                  utterances_emb_op,
                                                  l2_coef=0.0)
            elmo_context_sentence_input = weight_layers(
                'input', context_sentence_emb_op, l2_coef=0.0)

        return (elmo_context_input, elmo_utterances_input,
                elmo_context_sentence_input)

    def get_toknized_data(self, context_batch, utterances_batch,
                          context_sentence_batch):
        # get nltk tokenized data
        # context, utterances, context_sentence

        #context [None, None] -> okay
        #utterances [None, None, None] -> batch_size * num_candidates, max_utterances_len
        #context_sentence [None, None, None] -> batch_size * max_context_len, max_context_sentence_len

        # batch_size
        context_list = []
        for context in context_batch:
            context_list.append(context[0])

        # batch_size * num_candidates
        utterances_list = []
        for utterances in utterances_batch:
            for response in utterances:
                utterances_list.append(response)

        context_sentence_list = []
        for context_sentences in context_sentence_batch:
            for sentence in context_sentences:
                context_sentence_list.append(sentence)

        context_ids = self.batcher.batch_sentences(context_list)
        utterances_ids = self.batcher.batch_sentences(utterances_list)
        context_sentence_ids = self.batcher.batch_sentences(
            context_sentence_list)

        return np.array(context_ids), np.array(utterances_ids), np.array(
            context_sentence_ids)

    def context_sentence_padding(self, elmo_context_sentence_inputs,
                                 tot_context_len):
        #elmo_context_sentence_input_val : 39, max_sentence_len, 256
        # [17, 5, 3, 11, 3] -> 17
        max_sentence_len = np.shape(elmo_context_sentence_inputs)[1]
        max_context_len = max(tot_context_len)

        current_index = 0
        length_index = 0

        batch_context_sentence = []
        each_context_sentence = []

        for i in range(len(elmo_context_sentence_inputs)):
            each_context_sentence.append(elmo_context_sentence_inputs[i])
            current_index += 1
            if current_index == tot_context_len[length_index]:
                length_index += 1
                current_index = 0
                batch_context_sentence.append(each_context_sentence)
                each_context_sentence = []
                continue

        pad_context_sentence = []
        for context_sentences in batch_context_sentence:
            if len(context_sentences) < max_context_len:
                padding_value = np.zeros([
                    max_context_len - len(context_sentences), max_sentence_len,
                    256
                ], np.float32)
                context_sentences = np.concatenate(
                    (context_sentences, padding_value), axis=0)

            pad_context_sentence.append(context_sentences)

        return pad_context_sentence

    def _make_dump_token_embeddings(self):

        dump_token_embeddings(self.vocab_path, self.elmo_options_file,
                              self.elmo_weight_file, self.token_embedding_file)

    def make_placeholders(self):
        context_ids_ph = tf.placeholder(tf.int32, shape=[None, None])
        utterances_ids_ph = tf.placeholder(tf.int32, shape=[None, None])
        context_sentence_ids_ph = tf.placeholder(tf.int32, shape=[None, None])

        return context_ids_ph, utterances_ids_ph, context_sentence_ids_ph
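A rough sketch of wiring the class above into a graph (hparams is assumed to carry the ELMo file paths; session handling omitted):

elmo = ELMoEmbeddings(hparams)
context_ph, utterances_ph, context_sentence_ph = elmo.make_placeholders()
ctx_emb, utt_emb, ctx_sent_emb = elmo.build_embeddings_op(
    context_ph, utterances_ph, context_sentence_ph)
# at run time, feed the ids from get_toknized_data(...) into the placeholders
# and fetch e.g. ctx_emb['weighted_op'] from the session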
Example #14
elmo_context_input = weight_layers(
    'input', context_embeddings_op, l2_coef=0.0
)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_input = weight_layers(
        'input', question_embeddings_op, l2_coef=0.0
    )

elmo_context_output = weight_layers(
    'output', context_embeddings_op, l2_coef=0.0
)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_output = weight_layers(
        'output', question_embeddings_op, l2_coef=0.0
    )


with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    # Create batches of data.
    context_ids = batcher.batch_sentences(tokenized_context)
    question_ids = batcher.batch_sentences(tokenized_question)

    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_context_input_, elmo_question_input_ = sess.run(
        [elmo_context_input['weighted_op'], elmo_question_input['weighted_op']],
        feed_dict={context_token_ids: context_ids,
                   question_token_ids: question_ids}
    )

Example #15
vocab_file = './data/vocab.txt'
options_file = './try/options.json'
weight_file = './try/weights.hdf5'
token_embedding_file = './data/vocab_embedding.hdf5'

batcher = TokenBatcher(vocab_file)
context_token_ids = tf.placeholder('int32', shape=(None, None))
bilm = BidirectionalLanguageModel(options_file,
                                  weight_file,
                                  use_character_inputs=False,
                                  embedding_weight_file=token_embedding_file)

context_embeddings_op = bilm(context_token_ids)
elmo_context_input = weight_layers('input', context_embeddings_op, l2_coef=0.0)

elmo_context_output = weight_layers('output',
                                    context_embeddings_op,
                                    l2_coef=0.0)
with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    # Create batches of data.
    context_ids = batcher.batch_sentences(tokenized_context)

    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_context_input_ = sess.run([elmo_context_input['weighted_op']],
                                   feed_dict={context_token_ids:
                                              context_ids})[0][0]
print(elmo_context_input_.shape, elmo_context_input_)
Example #16
elmo_context_input = weight_layers('input',
                                   context_embeddings_op,
                                   l2_coef=0.0)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_input = weight_layers('input',
                                        question_embeddings_op,
                                        l2_coef=0.0)

elmo_context_output = weight_layers('output',
                                    context_embeddings_op,
                                    l2_coef=0.0)
with tf.variable_scope('', reuse=True):
    # the reuse=True scope reuses weights from the context for the question
    elmo_question_output = weight_layers('output',
                                         question_embeddings_op,
                                         l2_coef=0.0)

with tf.Session() as sess:
    # It is necessary to initialize variables once before running inference.
    sess.run(tf.global_variables_initializer())

    # Create batches of data.
    context_ids = batcher.batch_sentences(tokenized_context)
    question_ids = batcher.batch_sentences(tokenized_question)

    # Compute ELMo representations (here for the input only, for simplicity).
    elmo_context_input_, elmo_question_input_ = sess.run(
        [
            elmo_context_input['weighted_op'],
            elmo_question_input['weighted_op']
        ],
        feed_dict={
            context_token_ids: context_ids,
            question_token_ids: question_ids
        })
def deep_data_prepare(config):
    print('Preparing data for the deep learning models')
    train_df = pd.read_csv(config.TRAIN_X)
    train_jp = pd.read_csv(config.TRAIN_JP)
    train_en = pd.read_csv(config.TRAIN_EN)
    test_df = pd.read_csv(config.TEST_X)

    char_sw_list = pickle.load(open('../data/char_stopword.pkl', 'rb'))
    word_sw_list = pickle.load(open('../data/word_stopword.pkl', 'rb'))
    # use word vectors
    # use char vectors
    train_x_char = train_df['char']
    train_x_word = train_df['word']
    train_x_sent_word = [w for w in open('../data/sentiment_word.txt')]
    train_x_sent_char = [w for w in open('../data/sentiment_word.txt')]
    train_jp_char = train_jp['char']
    train_jp_word = train_jp['word']
    train_en_char = train_en['char']
    train_en_word = train_en['word']

    train_char = pd.concat((train_x_char, train_jp_char, train_en_char))
    train_word = pd.concat((train_x_word, train_jp_word, train_en_word))
    test_char = test_df['char']
    test_word = test_df['word']

    if config.data_type == 0:
        train_y = train_df['sub_numerical'].values
        train_y = np_utils.to_categorical(train_y,
                                          num_classes=config.n_classes)

    elif config.data_type == 1:
        train_y = train_df['sentiment_value'].values
        train_y = np_utils.to_categorical(train_y,
                                          num_classes=config.n_classes)

    elif config.data_type == 2:
        train_y = np.array(train_df.iloc[:, 6:].values)
    elif config.data_type == 3:
        train_y = train_df.iloc[:, 6:].values
        targets = train_y.reshape(-1)
        one_hot_targets = np.eye(config.n_classes)[targets]
        train_y = one_hot_targets.reshape(-1, 10, config.n_classes)
    elif config.data_type == 4:
        train_y = (train_df['sentiment_value'] + 1).values
        train_y = np_utils.to_categorical(train_y,
                                          num_classes=config.n_classes)
    elif config.data_type == 5:
        train_y = train_df.iloc[:, 4:].values
    else:
        exit('invalid data_type')

    UNK_CHAR = len(char_stoi)
    PAD_CHAR = len(char_stoi) + 1

    UNK_WORD = len(word_stoi)
    PAD_WORD = len(word_stoi) + 1

    def generate_hann_data(df):
        import re
        hann_train_word = np.full(shape=(len(df['word']), config.HANN_SENT,
                                         config.HANN_WORD_LEN),
                                  fill_value=PAD_WORD)
        hann_train_char = np.full(shape=(len(df['char']), config.HANN_SENT,
                                         config.HANN_CHAR_LEN),
                                  fill_value=PAD_CHAR)

        for i, sentences in enumerate(df['word']):
            sentences = re.split(r" 。 | , ", sentences)
            for j, sent in enumerate(sentences):
                if j < config.HANN_SENT:
                    k = 0
                    word_tokens = sent.split()
                    for _, word in enumerate(word_tokens):
                        if k < config.HANN_WORD_LEN and word not in word_sw_list and word in word_stoi:
                            hann_train_word[i, j, k] = word_stoi[word]
                            k += 1

        for i, sentences in enumerate(df['char']):
            sentences = re.split(r" 。 | , ", sentences)
            for j, sent in enumerate(sentences):
                if j < config.HANN_SENT:
                    k = 0
                    word_tokens = sent.split()
                    for _, word in enumerate(word_tokens):
                        if k < config.HANN_CHAR_LEN and word not in char_sw_list and word in char_stoi:
                            hann_train_char[i, j, k] = char_stoi[word]
                            k += 1
        return hann_train_word, hann_train_char

    hann_train_word, hann_train_char = generate_hann_data(train_df)
    hann_test_word, hann_test_char = generate_hann_data(test_df)

    def word2id(train_dialogs, type='char'):
        if type == 'char':
            stoi = char_stoi
            max_len = config.CHAR_MAXLEN
            UNK = UNK_CHAR
            sw_list = set(char_sw_list)
        elif type == 'word':
            stoi = word_stoi
            max_len = config.WORD_MAXLEN
            UNK = UNK_WORD
            sw_list = set(word_sw_list)
        else:
            exit('invalid type')

        train_x = []
        for d in tqdm(train_dialogs):
            d = str(d).split()
            line = []
            for token in d:
                if token in sw_list\
                        or token == ''\
                        or token == ' ':
                    continue
                if token in stoi:
                    line.append(stoi[token])
                else:
                    line.append(UNK)

            train_x.append(line[:max_len])
        return train_x

    # Data for the standard models
    train_x_word = word2id(train_word, type='word')
    train_x_char = word2id(train_char, type='char')
    test_x_char = word2id(test_char, type='char')
    test_x_word = word2id(test_word, type='word')

    train_x_sent_word = word2id(train_x_sent_word, type='word')
    train_x_sent_char = word2id(train_x_sent_char, type='char')
    # Data preparation for the RCNN model
    UNK_CHAR = PAD_CHAR
    UNK_WORD = PAD_WORD

    train_word_left = [[UNK_WORD] + w[:-1] for w in train_x_word]
    train_word_right = [w[1:] + [UNK_WORD] for w in train_x_word]
    train_char_left = [[UNK_CHAR] + w[:-1] for w in train_x_char]
    train_char_right = [w[1:] + [UNK_CHAR] for w in train_x_char]

    test_word_left = [[UNK_WORD] + w[:-1] for w in test_x_word]
    test_word_right = [w[1:] + [UNK_WORD] for w in test_x_word]
    test_char_left = [[UNK_CHAR] + w[:-1] for w in test_x_char]
    test_char_right = [w[1:] + [UNK_CHAR] for w in test_x_char]

    train_x_char = sequence.pad_sequences(train_x_char,
                                          maxlen=config.CHAR_MAXLEN,
                                          dtype='int32',
                                          padding='post',
                                          truncating='post',
                                          value=UNK_CHAR)
    train_x_word = sequence.pad_sequences(train_x_word,
                                          maxlen=config.WORD_MAXLEN,
                                          dtype='int32',
                                          padding='post',
                                          truncating='post',
                                          value=UNK_WORD)
    train_x_char_left = sequence.pad_sequences(train_char_left,
                                               maxlen=config.CHAR_MAXLEN,
                                               dtype='int32',
                                               padding='post',
                                               truncating='post',
                                               value=UNK_CHAR)
    train_x_word_left = sequence.pad_sequences(train_word_left,
                                               maxlen=config.WORD_MAXLEN,
                                               dtype='int32',
                                               padding='post',
                                               truncating='post',
                                               value=UNK_WORD)
    train_x_char_right = sequence.pad_sequences(train_char_right,
                                                maxlen=config.CHAR_MAXLEN,
                                                dtype='int32',
                                                padding='post',
                                                truncating='post',
                                                value=UNK_CHAR)
    train_x_word_right = sequence.pad_sequences(train_word_right,
                                                maxlen=config.WORD_MAXLEN,
                                                dtype='int32',
                                                padding='post',
                                                truncating='post',
                                                value=UNK_WORD)

    test_x_char = sequence.pad_sequences(test_x_char,
                                         maxlen=config.CHAR_MAXLEN,
                                         dtype='int32',
                                         padding='post',
                                         truncating='post',
                                         value=UNK_CHAR)
    test_x_word = sequence.pad_sequences(test_x_word,
                                         maxlen=config.WORD_MAXLEN,
                                         dtype='int32',
                                         padding='post',
                                         truncating='post',
                                         value=UNK_WORD)
    test_x_char_left = sequence.pad_sequences(test_char_left,
                                              maxlen=config.CHAR_MAXLEN,
                                              dtype='int32',
                                              padding='post',
                                              truncating='post',
                                              value=UNK_CHAR)
    test_x_word_left = sequence.pad_sequences(test_word_left,
                                              maxlen=config.WORD_MAXLEN,
                                              dtype='int32',
                                              padding='post',
                                              truncating='post',
                                              value=UNK_WORD)
    test_x_char_right = sequence.pad_sequences(test_char_right,
                                               maxlen=config.CHAR_MAXLEN,
                                               dtype='int32',
                                               padding='post',
                                               truncating='post',
                                               value=UNK_CHAR)
    test_x_word_right = sequence.pad_sequences(test_word_right,
                                               maxlen=config.WORD_MAXLEN,
                                               dtype='int32',
                                               padding='post',
                                               truncating='post',
                                               value=UNK_WORD)

    print('train_x char shape is: ', train_x_char.shape)
    print('train_x word shape is: ', train_x_word.shape)
    print('test_x char shape is: ', test_x_char.shape)
    print('test_x word shape is: ', test_x_word.shape)

    train = {}
    test = {}
    tokenizer = tokenization.FullTokenizer(vocab_file=config.BERT_VOCAB_FILES,
                                           do_lower_case=False)

    def get_bert_data(corpus):
        input_ids = []
        input_mask = []
        input_segment_ids = []

        for sent in corpus:
            sent = ''.join(sent.strip().split())
            tmp_token_ids = tokenizer.convert_tokens_to_ids(
                ['[CLS]'] + tokenizer.tokenize(sent)[:188] + ['[SEP]'])
            tmp_mask = [1] * len(tmp_token_ids)
            tmp_segment_ids = [0] * len(tmp_token_ids)
            if len(tmp_token_ids) < 190:
                tmp_segment_ids.extend([0] * (190 - len(tmp_token_ids)))
                tmp_mask.extend([0] * (190 - len(tmp_token_ids)))
                tmp_token_ids.extend([0] * (190 - len(tmp_token_ids)))
            input_ids.append(tmp_token_ids)
            input_mask.append(tmp_mask)
            input_segment_ids.append(tmp_segment_ids)
        return np.array(input_ids, dtype='int32'), np.array(
            input_mask, dtype='int32'), np.array(input_segment_ids,
                                                 dtype='int32')

    train['token_id'], train['mask_id'], train['type_id'] = get_bert_data(
        train_df['word'].values)
    test['token_id'], test['mask_id'], test['type_id'] = get_bert_data(
        test_df['word'].values)

    train['word'] = train_x_word
    train['char'] = train_x_char
    train['word_sent'] = train_x_sent_word
    train['char_sent'] = train_x_sent_char
    # rcnn
    train['word_left'] = train_x_word_left
    train['word_right'] = train_x_word_right
    train['char_left'] = train_x_char_left
    train['char_right'] = train_x_char_right
    # han
    train['hann_word'] = hann_train_word
    train['hann_char'] = hann_train_char

    test['word'] = test_x_word
    test['char'] = test_x_char
    test['word_left'] = test_x_word_left
    test['word_right'] = test_x_word_right
    test['char_left'] = test_x_char_left
    test['char_right'] = test_x_char_right
    test['hann_word'] = hann_test_word
    test['hann_char'] = hann_test_char

    assert train['word_left'].shape == train['word_right'].shape == train[
        'word'].shape
    assert train['char_left'].shape == train['char_right'].shape == train[
        'char'].shape
    assert test['word_left'].shape == test['word_right'].shape == test[
        'word'].shape
    assert test['char_left'].shape == test['char_right'].shape == test[
        'char'].shape

    batcher = TokenBatcher(config.elmo_word_vocab_file)
    train['elmo_word'] = batcher.batch_sentences(
        [str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
    test['elmo_word'] = batcher.batch_sentences(
        [str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])

    batcher = TokenBatcher(config.elmo_char_vocab_file)
    train['elmo_char'] = batcher.batch_sentences(
        [str(w).split()[:config.CHAR_MAXLEN] for w in train_df['char']])
    test['elmo_char'] = batcher.batch_sentences(
        [str(w).split()[:config.CHAR_MAXLEN] for w in test_df['char']])

    batcher = TokenBatcher(config.elmo_qiuqiu_vocab_file)
    train['elmo_qiuqiu'] = batcher.batch_sentences(
        [str(w).split()[:config.WORD_MAXLEN] for w in train_df['word']])
    test['elmo_qiuqiu'] = batcher.batch_sentences(
        [str(w).split()[:config.WORD_MAXLEN] for w in test_df['word']])

    return train, train_y, test
Example #18
def train_cnn(first_use=True):

    print("Loading texts and labels...")
    # load training set
    train_labels, train_text_ids = \
                  load_labels(train_label_path, label_index, num_classes, one_hot=True)
    train_texts = load_texts(train_data_path, train_text_ids)
    train_size = len(train_texts)
    tokenized_train_texts = tokenize(train_texts)

    # Create a TokenBatcher to map text to token ids
    batcher = TokenBatcher(train_vocab_file)

    if first_use:

        # build TextCNN model
        tf.reset_default_graph()
        model_options = {
            'text_length': MAX_LEN,
            'emb_dim': emb_dim,
            'batch_size': batch_size,
            'num_classes': num_classes,
            'bilm_options_file': bilm_options_file,
            'bilm_weight_file': bilm_weight_file,
            'token_embedding_file': token_embedding_file,
            'l2_bilm_lambda': l2_bilm_lambda,
            'l2_cnn_lambda': l2_cnn_lambda
            }

        print("Building TextCNN model")
        cnn = tc.TextCNN_with_elmo(model_options)
        input_tensors, feedback, features = cnn.build_model()
        
        optim = tf.train.AdamOptimizer(learning_rate, beta1)\
                .minimize(feedback['loss'], name='optim')

        saver = tf.train.Saver(max_to_keep=1)

        sess = tf.InteractiveSession()
        sess.run(tf.global_variables_initializer())
        epoch_done = 0

    else:
        tf.reset_default_graph()
        cnn_path_dir = os.path.dirname(cnn_path)
        meta_file_name = os.listdir(cnn_path_dir)[-1]
        epoch_done = int(meta_file_name[18:-5])
        meta_path = os.path.join(cnn_path_dir, meta_file_name)

        sess = tf.InteractiveSession()
        saver = tf.train.import_meta_graph(meta_path)
        saver.restore(sess, tf.train.latest_checkpoint(os.path.dirname(cnn_path)))

        graph = tf.get_default_graph()

        input_tensors = {'t_real_text': graph.get_tensor_by_name('real_text_input:0'),
                     'input_y': graph.get_tensor_by_name('input_y:0')}
        feedback = {'loss': graph.get_tensor_by_name('loss/add:0'),
                    'accuracy': graph.get_tensor_by_name('accuracy/accuracy:0')}
        optim = graph.get_operation_by_name('optim')

        # Reset max_to_keep
        saver = tf.train.Saver(max_to_keep=1)

    # train
    for epoch in [(i + epoch_done + 1) for i in range(epoch_num)]:
        print("<Epoch no. %d>" % epoch)
        # train text cnn model
        print("Training text cnn model...")
        start = datetime.now()
        sum_accuracy = 0.0
        total_loss = 0.0
        
        batch_num = train_size // batch_size
        for batch_no in range(batch_num):
            if batch_no % (batch_num // 10) == 0:
                print("%d0%%" % (batch_no // (batch_num // 10)))
                
            batch_texts = tokenized_train_texts[batch_no * batch_size : (batch_no + 1) * batch_size]
            batch_texts = batcher.batch_sentences(batch_texts)
            batch_texts = pad_and_cut(batch_texts, MAX_LEN)

            batch_labels = train_labels[batch_no * batch_size : (batch_no + 1) * batch_size]

            _, loss, accuracy = \
               sess.run([optim, feedback['loss'], feedback['accuracy']],
                        feed_dict={input_tensors['t_real_text']: batch_texts,
                                   input_tensors['input_y']: batch_labels})

            sum_accuracy += accuracy
            total_loss += loss
            
        avg_accuracy = sum_accuracy / batch_num

        # save the cnn model
        if not os.path.exists(os.path.dirname(cnn_path)):
            os.mkdir(os.path.dirname(cnn_path))
        saver.save(sess, cnn_path, global_step=epoch)

        end = datetime.now()
        print("TextCNN training complete!")
        print("Train report: loss: %d, accuracy: %d, train time: %f" %
              (total_loss, avg_accuracy, (end - start).seconds))
    sess.close()
Example #19
# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)  # REQUIRED

# Build the Elmo with biLM and weight layers.
elmo = Elmo(
    options_file,
    weight_file,
    token_embedding_file=token_embedding_file,  # REQUIRED
    token_batcher=batcher,  # REQUIRED
    num_output_representations=1,
    requires_grad=False,
    do_layer_norm=False,
    dropout=0.)

# Create batches of data.
context_token_ids = batcher.batch_sentences(tokenized_context,
                                            add_bos_eos=False)
question_token_ids = batcher.batch_sentences(tokenized_question,
                                             add_bos_eos=False)
# numpy.ndarray or cupy.ndarray
# with shape (batchsize, max_length)

if gpu >= 0:
    # transfer the model to the gpu
    chainer.cuda.get_device_from_id(gpu).use()
    elmo.to_gpu()
    # transfer input data to the gpu
    context_token_ids = elmo.xp.asarray(context_token_ids)
    question_token_ids = elmo.xp.asarray(question_token_ids)

# Compute elmo outputs,
# i.e. weighted sum of multi-layer biLM's outputs.
Example #20
import bilm
from bilm import TokenBatcher
import model.config as config
import preprocessing.util as util

if __name__ == '__main__':
    entity_batcher = TokenBatcher(config.base_folder + "data/vocabulary/" +
                                  "wiki_vocab.txt")
    with open(
            '/Users/asntr/Projects/university/course_work/end2end_neural_el/data/entities/ent2toks.txt',
            'w+'
    ) as dst, open(
            '/Users/asntr/Projects/university/course_work/end2end_neural_el/data/entities/summary.txt_prep',
            'r') as src:
        entity2summary = util.load_entity_summary_map()
        for i, (k, v) in enumerate(entity2summary.items()):
            tokens = entity_batcher.batch_sentences([v]).tolist()[0]
            dst.write(k + '\t' + ' '.join([str(i) for i in tokens]) + '\n')
tfconfig.gpu_options.per_process_gpu_memory_fraction = 0.8  # allocate at most 80% of GPU memory
tfconfig.gpu_options.allow_growth = True  # allocate GPU memory dynamically
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # suppress TensorFlow warning messages

config = Config()

if __name__ == '__main__':
    datas = json.load(open('../data/test2.json', encoding='utf-8'))
    ndatas = [line.split()[1:] for line in datas[:10]]

    batcher = TokenBatcher(config.vocab_file)  # batcher that maps tokens to ids

    inputData = tf.placeholder('int32', shape=(None, None))

    abilm = BidirectionalLanguageModel(
        config.option_file,
        config.weight_file,
        use_character_inputs=False,
        embedding_weight_file=config.tokenEmbeddingFile)
    inputEmbeddingsOp = abilm(inputData)

    elmoInput = weight_layers('input', inputEmbeddingsOp, l2_coef=0.0)

    sess = tf.Session()
    with sess.as_default():
        sess.run(tf.global_variables_initializer())
        inputids = batcher.batch_sentences(ndatas)  # create batches of data
        inputvec = sess.run(elmoInput['weighted_op'],
                            feed_dict={inputData: inputids})
        print(inputvec)
    sess.close()
Example #22
class ELMo_Utils(object):
    """
    Implements ELMo functions used by the downstream task.
    Each tokenized sentence is a list of str; a batch of sentences is a list of tokenized sentences (List[List[str]]).

    The Batcher packs these into a numpy array of character ids with shape (n_sentences, max_sentence_length + 2, 50), padding on the right with 0 ids for sentences shorter than the maximum length. The first and last tokens of each sentence are special begin and end of sentence ids added by the Batcher.

    The input character id placeholder can be dimensioned (None, None, 50), with both the batch dimension (axis=0) and time dimension (axis=1) determined for each batch, up to the maximum batch size specified in the BidirectionalLanguageModel constructor.

    After running inference with the batch, the returned biLM embeddings are a numpy array with shape (n_sentences, 3, max_sentence_length, 1024), after removing the special begin/end tokens.
    """

    START_TOKEN = '<S>'
    END_TOKEN = '</S>'
    UNK_TOKEN = '<UNK>'
    PAD_SNT = '<S></S>'
    PAD_SNT_ID = 0

    def __init__(self,
                 elmo_vocab_file,
                 elmo_weight_file,
                 elmo_option_file,
                 use_character_elmo,
                 use_concat_p,
                 question_window,
                 utterance_cache_file='',
                 passage_cache_file='',
                 question_cache_file=''):
        self.logger = logging.getLogger("dial")
        self.utterance_cache = None
        self.passage_cache = None
        self.question_cache = None
        self.need_q_cache = (question_window > 1)
        self.need_p_cache = use_concat_p
        if os.path.exists(elmo_weight_file) and os.path.exists(
                elmo_option_file) and os.path.exists(elmo_vocab_file):
            # the vocab file exported from the corpus
            self.elmo_vocab_file = elmo_vocab_file
            # elmo weight file
            self.elmo_weight_file = elmo_weight_file
            # elmo option file
            self.elmo_option_file = elmo_option_file
            self.utterance_cache_file = utterance_cache_file
            self.passage_cache_file = passage_cache_file
            self.question_cache_file = question_cache_file
            self.use_character_elmo = use_character_elmo
            with open(self.elmo_option_file, 'r') as fin:
                options = json.load(fin)
            self.output_layers = options['lstm']['n_layers'] + 1
            self.output_dim = 2 * options['lstm']['projection_dim']
            self.logger.info("output_layers :{}, output_dim :{}".format(
                self.output_layers, self.output_dim))
            # by default, the bilm uses the character elmo
            if self.use_character_elmo:
                # maximum number of characters per token.
                self.elmo_max_num_char = options['char_cnn'][
                    'max_characters_per_token']
                # line 207 https://github.com/allenai/bilm-tf/blob/ebf52c6ec1012a3672247c2d14ff7bcad7fb812b/bilm/data.py
                # the mask for char id is 0
                self.PAD_TOKEN_CHAR_IDS = np.zeros((self.elmo_max_num_char),
                                                   dtype=np.int32).tolist()
                # use subword characters first, which give extra improvements besides the contextual information.
                self.elmo_char_batcher = Batcher(self.elmo_vocab_file,
                                                 self.elmo_max_num_char)
                # language model with use_character_inputs = True
                self.elmo_bilm = BidirectionalLanguageModel(
                    self.elmo_option_file, self.elmo_weight_file)
            else:
                # use token batcher
                self.elmo_token_batcher = TokenBatcher(self.elmo_vocab_file)
                # use elmo_bilm with use_character_inputs = False
                self.elmo_bilm = BidirectionalLanguageModel(
                    self.elmo_option_file, self.elmo_weight_file)

            self.chk_load_utterance_cache()
            self.chk_load_passage_cache()
            self.chk_load_question_cache()
        else:
            self.logger.warn(
                "ELMo files not found: elmo_weight_file={}, elmo_option_file={}, elmo_vocab_file={}"
                .format(elmo_weight_file, elmo_option_file, elmo_vocab_file))

    def chk_load_utterance_cache(self):
        if self.utterance_cache_file and os.path.exists(
                self.utterance_cache_file):
            self.utterance_cache = h5py.File(self.utterance_cache_file, 'r')
            #self.utterance_cache_in_mem = {}
            #self.utterance_cache_in_mem['lm_embeddings'] = self.load_h5(self.utterance_cache['lm_embeddings'])
            #self.utterance_cache_in_mem['lengths'] = self.load_h5_lengths(self.utterance_cache['lengths'])
            #self.utterance_cache_in_mem['mask'] = self.load_h5(self.utterance_cache['mask'])
            self.logger.info(
                "Utterance cache loaded from {}, size = {}".format(
                    self.utterance_cache_file,
                    len(self.utterance_cache['lm_embeddings'].keys())))
        else:
            self.utterance_cache = None

    def load_h5(self, h5group):
        x = []
        for index in range(len(h5group.keys())):
            # https://stackoverflow.com/questions/10274476/how-to-export-hdf5-file-to-numpy-using-h5py
            x.append(h5group['{}'.format(index)][...].tolist())
        return x

    def load_h5_lengths(self, h5group):
        x = []
        for index in range(len(h5group.keys())):
            x.extend(h5group['{}'.format(index)][...].tolist())
        return x

    def chk_load_passage_cache(self):
        if self.need_p_cache:
            if self.passage_cache_file and os.path.exists(
                    self.passage_cache_file):
                self.passage_cache = h5py.File(self.passage_cache_file, 'r')
                self.logger.info("Passage cache loaded from {}".format(
                    self.passage_cache_file))
            else:
                self.passage_cache = None
                self.logger.info(
                    "Passage cache needed at {}; it will be built.".format(
                        self.passage_cache_file))
        else:
            self.passage_cache = None
            self.logger.info("Passage cache not needed")

    def chk_load_question_cache(self):
        if self.need_q_cache:
            if self.question_cache_file and os.path.exists(
                    self.question_cache_file):
                self.question_cache = h5py.File(self.question_cache_file, 'r')
                self.logger.info("Question cache loaded from {}".format(
                    self.question_cache_file))
            else:
                self.question_cache = None
                self.logger.info(
                    "Question cache needed at {}; it will be built.".format(
                        self.question_cache_file))
        else:
            self.question_cache = None
            self.logger.info("Question cache not needed")

    def need_build_passage_cache(self):
        return self.need_p_cache and self.passage_cache_file != '' and self.passage_cache is None

    def need_build_question_cache(self):
        return self.need_q_cache and self.question_cache_file != '' and self.question_cache is None

    def cleanup(self):
        if self.utterance_cache:
            self.utterance_cache.close()
        if self.passage_cache:
            self.passage_cache.close()
        if self.question_cache:
            self.question_cache.close()
        self.logger.info("Clean up elmo cahce")

    def get_elmo_char_ids(self, sentences):
        '''
        Given a nested list of tokens (with start and end tokens), return the character ids
        Arguments:
            sentences: List[List[str]]

        Return: [sentence_num, token_num, max_char_num]
        '''
        return self.elmo_char_batcher.batch_sentences(sentences).tolist()

    def get_elmo_token_ids(self, sentences):
        '''
        Given a nested list of tokens (without start and end tokens), return the token ids

        Arguments:
           sentences : List[List[str]]

        Return : [sentence_num, token_num]
        '''
        return self.elmo_token_batcher.batch_sentences(sentences).tolist()

    def get_elmo_emb_op(self, input_ids_place_holder):
        '''
        Given the input ids placeholder, return the ops for computing the language model embeddings:
        {
         'lm_embeddings': embedding_op, (None, 3, None, 1024)
         'lengths': sequence_lengths_op, (None, )
         'mask': op to compute mask (None, None)
        }
        '''
        return self.elmo_bilm(input_ids_place_holder)

    def weight_layers(self,
                      name,
                      bilm_ops,
                      l2_coef=None,
                      use_top_only=False,
                      do_layer_norm=False):
        '''
        Weight the layers of a biLM with trainable scalar weights to compute ELMo representations.
        See more details on https://github.com/allenai/bilm-tf/blob/81a4b54937f4dfb93308f709c1cf34dbb37c553e/bilm/elmo.py
        {
           'weighted_op': op to compute weighted average for output,
           'regularization_op': op to compute regularization term
        }
        '''
        return weight_layers(name, bilm_ops, l2_coef, use_top_only,
                             do_layer_norm)

    @staticmethod
    def prepare_elmo_vocab_file(vocab, elmo_vocab_file):
        sorted_word = sorted(vocab.token_cnt,
                             key=vocab.token_cnt.get,
                             reverse=True)
        with open(elmo_vocab_file, 'w') as f:
            f.write('{}\n'.format(ELMo_Utils.START_TOKEN))
            f.write('{}\n'.format(ELMo_Utils.END_TOKEN))
            f.write('{}\n'.format(ELMo_Utils.UNK_TOKEN))
            for item in sorted_word:
                f.write('%s\n' % item)
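        # Layout of the resulting vocab file (a sketch): the three special tokens first,
        # then the corpus tokens sorted by frequency, one per line:
        #   <S>
        #   </S>
        #   <UNK>
        #   the
        #   ...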

    def build_elmo_char_cache(self, snt_dict_file, max_snt_length,
                              output_cache_file):
        """
        Go through all the snts in the dataset, save into the cache
        """
        self.logger.info(
            'Prepare ELMo character embeddings for {} with ELMo_Utils ...'.
            format(snt_dict_file))
        ids_placeholder = tf.placeholder('int32',
                                         shape=(None, max_snt_length,
                                                self.elmo_max_num_char))
        ops = self.elmo_bilm(ids_placeholder)
        config = tf.ConfigProto(allow_soft_placement=True)
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            with open(snt_dict_file,
                      'r') as fin, h5py.File(output_cache_file, 'w') as fout:
                lm_embeddings_h5 = fout.create_group('lm_embeddings')
                lengths_h5 = fout.create_group('lengths')
                mask_h5 = fout.create_group('mask')
                batch_snts = []
                start_snt_id_in_batch = 0
                SNT_BATCH_SIZE = 10
                for line in tqdm(fin, total=get_num_lines(snt_dict_file)):
                    sentence = line.strip().split()
                    batch_snts.append(sentence)
                    length = len(batch_snts)
                    if length >= SNT_BATCH_SIZE:
                        start_snt_id_in_batch += self.consume_batch_snts(
                            sess, ids_placeholder, ops, batch_snts,
                            max_snt_length, start_snt_id_in_batch,
                            lm_embeddings_h5, lengths_h5, mask_h5)
                        batch_snts = []
                if len(batch_snts) > 0:
                    start_snt_id_in_batch += self.consume_batch_snts(
                        sess, ids_placeholder, ops, batch_snts, max_snt_length,
                        start_snt_id_in_batch, lm_embeddings_h5, lengths_h5,
                        mask_h5)
                    batch_snts = []
                self.logger.info(
                    "Finished ELMo embeddings for {} senencesm in {}".format(
                        start_snt_id_in_batch, output_cache_file))

    def consume_batch_snts(self, sess, ids_placeholder, ops, batch_snts,
                           max_snt_length, start_snt_id_in_batch,
                           lm_embeddings_h5, lengths_h5, mask_h5):
        char_ids = self.get_elmo_char_ids(batch_snts)
        char_ids = [(ids + [self.PAD_TOKEN_CHAR_IDS] *
                     (max_snt_length - len(ids)))[:max_snt_length]
                    for ids in char_ids]
        elmo_ops = sess.run(ops, feed_dict={ids_placeholder: char_ids})
        batch_size = len(batch_snts)
        for i in range(batch_size):
            sentence_id = start_snt_id_in_batch + i
            # self.logger.info("create lm for snt {}".format(sentence_id))
            lm_embeddings_h5.create_dataset(
                '{}'.format(sentence_id),
                elmo_ops['lm_embeddings'].shape[1:],
                dtype='float32',
                data=elmo_ops['lm_embeddings'][i, :, :, :],
                compression="gzip")
            lengths_h5.create_dataset('{}'.format(sentence_id), (1, ),
                                      dtype='int32',
                                      data=elmo_ops['lengths'][i])
            mask_h5.create_dataset('{}'.format(sentence_id),
                                   elmo_ops['mask'].shape[1:],
                                   dtype='int32',
                                   data=elmo_ops['mask'][i],
                                   compression="gzip")
        return batch_size
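    # Reading one cached sentence back from the file written above (a sketch):
    #   cache = h5py.File(output_cache_file, 'r')
    #   emb = cache['lm_embeddings']['0'][...]   # (n_layers, time, dim)
    #   length = cache['lengths']['0'][0]
    #   mask = cache['mask']['0'][...]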

    # TODO for token level embedding.
    def build_elmo_token_cache(self, snt_dict_file, max_snt_length,
                               output_cache_file):
        pass

    def build_elmo_cache(self, snt_dict_file, max_snt_length,
                         output_cache_file):
        if self.use_character_elmo:
            self.build_elmo_char_cache(snt_dict_file, max_snt_length,
                                       output_cache_file)
        else:
            self.build_elmo_token_cache(snt_dict_file, max_snt_length,
                                        output_cache_file)

        self.logger.info(
            'Finished ELMo embeddings for utterance cache with ELMo_Utils')

    def build_elmo_cache_for_samples(self, dataset, max_p_len, max_q_len):
        if (not self.need_p_cache) and (not self.need_q_cache):
            self.logger.info(
                'No need for ELMo embeddings for concatenated passage and question with ELMo_Utils'
            )
        else:
            # build graph for getting forward elmo embedding.
            self.logger.info('Build ELMo embeddings for p = {}, q = {}'.format(
                self.need_p_cache, self.need_q_cache))
            self.build_pq_elmo_graph()
            if self.need_p_cache:
                p_out = h5py.File(self.passage_cache_file, 'w')
                p_lm_embeddings_h5 = p_out.create_group('lm_embeddings')
                p_lengths_h5 = p_out.create_group('lengths')
                p_mask_h5 = p_out.create_group('mask')

            if self.need_q_cache:
                q_out = h5py.File(self.question_cache_file, 'w')
                q_lm_embeddings_h5 = q_out.create_group('lm_embeddings')
                q_lengths_h5 = q_out.create_group('lengths')
                q_mask_h5 = q_out.create_group('mask')

            config = tf.ConfigProto(allow_soft_placement=True)
            with tf.Session(config=config) as sess:
                sess.run(tf.global_variables_initializer())
                for set_name in ['train', 'dev', 'test']:
                    for batch_data in tqdm(
                            dataset.gen_mini_batches(set_name,
                                                     20,
                                                     shuffle=False)):
                        samples = batch_data['raw_data']
                        # batch_data is filled with elmo feed_dict
                        self.run_pq_ops(sess, batch_data, max_p_len, max_q_len)
                        for i in range(len(samples)):
                            e_id = '{}'.format(samples[i]['example-id'])
                            try:
                                if self.need_p_cache:
                                    p_lm_embeddings_h5.create_dataset(
                                        e_id,
                                        self.p_ops['lm_embeddings'].shape[1:],
                                        dtype='float32',
                                        data=self.p_ops['lm_embeddings'][
                                            i, :, :, :],
                                        compression="gzip")
                                    p_lengths_h5.create_dataset(
                                        e_id, (1, ),
                                        dtype='int32',
                                        data=self.p_ops['lengths'][i])
                                    p_mask_h5.create_dataset(
                                        e_id,
                                        self.p_ops['mask'].shape[1:],
                                        dtype='int32',
                                        data=self.p_ops['mask'][i, :],
                                        compression="gzip")
                                if self.need_q_cache:
                                    q_lm_embeddings_h5.create_dataset(
                                        e_id,
                                        self.q_ops['lm_embeddings'].shape[1:],
                                        dtype='float32',
                                        data=self.q_ops['lm_embeddings'][
                                            i, :, :, :],
                                        compression="gzip")
                                    q_lengths_h5.create_dataset(
                                        e_id, (1, ),
                                        dtype='int32',
                                        data=self.q_ops['lengths'][i])
                                    q_mask_h5.create_dataset(
                                        e_id,
                                        self.q_ops['mask'].shape[1:],
                                        dtype='int32',
                                        data=self.q_ops['mask'][i, :],
                                        compression="gzip")
                            except Exception:
                                # e.g. the example-id was already written to the cache
                                continue

        self.logger.info(
            'Finished ELMo embeddings for concatenated passage and question with ELMo_Utils'
        )

    def run_pq_ops(self, sess, batch_data, max_p_len, max_q_len):
        self._static_pq_padding(batch_data, max_p_len, max_q_len)

        if self.need_p_cache and self.need_q_cache:
            self.p_ops, self.q_ops = sess.run(
                [self.p_emb_elmo_op, self.q_emb_elmo_op],
                feed_dict={
                    self.elmo_p: batch_data['elmo_passage_char_ids'],
                    self.elmo_q: batch_data['elmo_question_char_ids']
                })
        elif self.need_p_cache:
            # run the op dict directly, so self.p_ops is a dict of numpy arrays
            self.p_ops = sess.run(
                self.p_emb_elmo_op,
                feed_dict={self.elmo_p: batch_data['elmo_passage_char_ids']})
        else:
            self.q_ops = sess.run(
                self.q_emb_elmo_op,
                feed_dict={
                    self.elmo_q: batch_data['elmo_question_char_ids'],
                })

    def build_pq_elmo_graph(self):
        """
        Builds the graph for separately running TensorFlow to get the ELMo embeddings of each batch, which are then cached to a file.
        In particular, for a sample-level cache make sure that the first dimension of every tensor is batch_size.
        """
        start_t = time.time()
        self.logger.info(
            "Start building elmo graph for concatenated p and q ...")
        self.add_elmo_placeholders()
        with tf.device('/device:GPU:0'):
            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                # get all elmo ops from the language model
                # lm_embeddings : [batch_size, layers, max_length, hidden_dims * 2]
                # lengths : [batch_size]
                # mask : [batch_size, length]
                if self.need_p_cache:
                    self.p_emb_elmo_op = self.elmo_bilm(self.elmo_p)

                if self.need_q_cache:
                    # [batch_size, context_window, layers, max_u_length, hidden_dims * 2]
                    self.q_emb_elmo_op = self.elmo_bilm(self.elmo_q)

    def add_elmo_placeholders(self):
        """
        ELMo placeholders for the application-specific (business-logic) inputs.
        """
        # for ELMo with character embedding
        # elmo passage character ids for each token in each concatenated passage
        # [batch_size, passage_length, char_length]

        if self.need_p_cache:
            self.elmo_p = tf.placeholder(tf.int32,
                                         [None, None, self.elmo_max_num_char],
                                         'elmo_p')
        # elmo character ids for the whole concatenated question
        # [batch_size, question_length, char_length]
        self.elmo_q = tf.placeholder(tf.int32,
                                     [None, None, self.elmo_max_num_char],
                                     'elmo_q')

    def _static_pq_padding(self, batch_data, max_p_len, max_q_len):
        """
        Static padding, used when the deep contextual embedding is saved with a mask over the full static length.
        """
        # also pad the elmo matrix
        # In ELMo, the character ids from batch_sentences include the start and end tokens
        # (so the char-id length is +2), while the final embedding does not include those
        # special tokens. For compatibility we keep the ELMo length separate.
        pad_q_len_elmo = 2 + max_q_len
        padding(batch_data, 'elmo_question_char_ids', pad_q_len_elmo,
                self.PAD_TOKEN_CHAR_IDS)

        if self.need_p_cache:
            pad_p_len_elmo = 2 + max_p_len
            padding(batch_data, 'elmo_passage_char_ids', pad_p_len_elmo,
                    self.PAD_TOKEN_CHAR_IDS)
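        # Example (a sketch): with max_q_len = 30 the character-id rows are padded to
        # 32 positions, because batch_sentences adds the <S> and </S> tokens.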

    def _prepare_passage_elmo_feed_dict(self, sample, batch_data,
                                        context_window, token_key_to_use):
        """
        add elmo feed_dict for passage
        """
        e_id_str = '{}'.format(sample['example-id'])
        passage_utterance_tokens_elmo = []
        passage_utterance_length_elmo = []
        passage_tokens_elmo = [ELMo_Utils.START_TOKEN]
        passage_snt_ids = []
        pruned_context_utterances_elmo = sample['messages-so-far'][
            -context_window:]
        for i in range(context_window):
            if i >= len(pruned_context_utterances_elmo):
                current_utterance_tokens_elmo = [
                    ELMo_Utils.START_TOKEN, ELMo_Utils.END_TOKEN
                ]
                passage_snt_ids.append(ELMo_Utils.PAD_SNT_ID)
                passage_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                passage_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
            else:
                utterance = pruned_context_utterances_elmo[i]
                if 'snt_id' in utterance:
                    passage_snt_ids.append(utterance['snt_id'])
                # split version of passages
                current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN]
                current_utterance_tokens_elmo.extend(
                    utterance[token_key_to_use])
                current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
                passage_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                passage_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
                # concatenated version of passages
                # append passages utterance tokens
                passage_tokens_elmo.extend(utterance[token_key_to_use])

        passage_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
        if self.need_build_passage_cache():
            # add into batch_data; no other batch data is added here
            # [batch_size, passage_length, max_char_num]
            batch_data['elmo_passage_char_ids'].append(
                self.get_elmo_char_ids([passage_tokens_elmo])[0])
        else:
            #TODO add passage and question elmo retrieval here.
            if self.need_p_cache:
                self.assemble_elmo_batch_data('p', batch_data, e_id_str,
                                              self.passage_cache)
            for snt_id in passage_snt_ids:
                # self.assemble_elmo_with_snt_ids('pu', batch_data, snt_id)
                # self.assemble_elmo_batch_data_with_mem('pu', batch_data, snt_id, self.utterance_cache_in_mem)
                self.assemble_elmo_batch_data('pu', batch_data, snt_id,
                                              self.utterance_cache)

    def _prepare_question_elmo_feed_dict(self, sample, batch_data,
                                         question_window, token_key_to_use):
        """
        add the question elmo feed_dict in the same style as the regular question feed_dict
        """
        e_id_str = '{}'.format(sample['example-id'])
        # for each utterance in question
        question_utterance_tokens_elmo = []
        # for the concatenated question
        # for question utterance length
        question_utterance_length_elmo = []
        question_snt_ids = []
        # add the start token, which is also in the vocabulary
        # In the non-ELMo embedding we add self.vocab.sos and self.vocab.eos to the sentence,
        # which are encoded by the downstream LSTM. In ELMo, however, sos and eos are upper
        # case (<S>, </S>), and we must use the upper-case forms to get an ELMo embedding for them.
        question_tokens_elmo = [ELMo_Utils.START_TOKEN]
        pruned_question_utterance_elmo = sample['messages-so-far'][
            -question_window:]
        for i in range(question_window):
            if i >= len(pruned_question_utterance_elmo):
                current_utterance_tokens_elmo = [
                    ELMo_Utils.START_TOKEN, ELMo_Utils.END_TOKEN
                ]
                question_snt_ids.append(ELMo_Utils.PAD_SNT_ID)
                question_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                question_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
            else:
                utterance = pruned_question_utterance_elmo[i]
                # split version of question
                if 'snt_id' in utterance:
                    question_snt_ids.append(utterance['snt_id'])
                current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN]
                current_utterance_tokens_elmo.extend(
                    utterance[token_key_to_use])
                current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
                # add each utterance token_ids into a parental list
                question_utterance_tokens_elmo.append(
                    current_utterance_tokens_elmo)
                question_utterance_length_elmo.append(
                    len(current_utterance_tokens_elmo))
                # concatenated version of question
                # append question utterance tokens
                question_tokens_elmo.extend(utterance[token_key_to_use])

        question_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
        if question_window == 0:
            # question_window == 0 is not supported: the cache lookups below would fail,
            # so question_window must be at least 1.
            pass
        else:
            # add elmo question tokenids into batch_data
            if self.need_build_question_cache():
                # add into batch_data
                # [batch_size, question_length, max_char_num]
                batch_data['elmo_question_char_ids'].append(
                    self.get_elmo_char_ids([question_tokens_elmo])[0])
            else:
                # if question_window == 1, just use the utterance cache
                if question_window == 1:
                    # self.assemble_elmo_with_snt_ids('q', batch_data, question_snt_ids[0])
                    # self.assemble_elmo_batch_data_with_mem('q', batch_data, question_snt_ids[0], self.utterance_cache_in_mem)
                    self.assemble_elmo_batch_data('q', batch_data,
                                                  question_snt_ids[0],
                                                  self.utterance_cache)
                else:
                    self.assemble_elmo_batch_data('q', batch_data, e_id_str,
                                                  self.question_cache)

    def _prepare_response_elmo_feed_dict(self, sample, batch_data,
                                         token_key_to_use):
        """
        add the response elmo feed_dict in the same style as the regular response feed_dict
        """
        if 'options-for-correct-answers' in sample:
            e_id_str = '{}'.format(sample['example-id'])
            utterance = sample['options-for-correct-answers'][0]
            # split version of question
            current_utterance_tokens_elmo = [ELMo_Utils.START_TOKEN]
            current_utterance_tokens_elmo.extend(utterance[token_key_to_use])
            current_utterance_tokens_elmo.extend([ELMo_Utils.END_TOKEN])
            if 'snt_id' in utterance:
                snt_id = utterance['snt_id']
                self.assemble_elmo_batch_data('r', batch_data, snt_id,
                                              self.utterance_cache)

    def init_elmo_batch_data_sntids(self, batch_data):
        if self.need_p_cache:
            # use elmo cache to retrieve batch_data
            batch_data['elmo_p_lm_embeddings'] = []
            batch_data['elmo_p_lengths'] = []
            batch_data['elmo_p_mask'] = []
        batch_data['elmo_pu_snt_ids'] = []
        batch_data['elmo_q_snt_ids'] = []
        batch_data['elmo_r_snt_ids'] = []

    def init_elmo_batch_data_emb(self, batch_data):
        if self.need_p_cache:
            # use elmo cache to retrieve batch_data
            batch_data['elmo_p_lm_embeddings'] = []
            batch_data['elmo_p_lengths'] = []
            batch_data['elmo_p_mask'] = []

        # for passage_utterance
        batch_data['elmo_pu_lm_embeddings'] = []
        batch_data['elmo_pu_lengths'] = []
        batch_data['elmo_pu_mask'] = []
        # for question
        batch_data['elmo_q_lm_embeddings'] = []
        batch_data['elmo_q_lengths'] = []
        batch_data['elmo_q_mask'] = []
        # for res
        batch_data['elmo_r_lm_embeddings'] = []
        batch_data['elmo_r_lengths'] = []
        batch_data['elmo_r_mask'] = []

    def add_elmo_placeholder_with_cache_sntids(self):
        """
        add placeholders for elmo ops, which will be used in the weight_layers
        """
        if self.need_p_cache:
            self.elmo_p_lm_embeddings = tf.placeholder(
                tf.float32, [None, self.output_layers, None, self.output_dim],
                name='elmo_p_lm_embeddings')
            self.elmo_p_lengths = tf.placeholder(tf.int32, [None],
                                                 name='elmo_p_lengths')
            self.elmo_p_mask = tf.placeholder(tf.int32, [None, None],
                                              name='elmo_p_mask')

        self.elmo_pu_snt_ids = tf.placeholder(tf.int32, [None],
                                              name='elmo_pu_snt_ids')
        self.elmo_q_snt_ids = tf.placeholder(tf.int32, [None],
                                             name='elmo_q_snt_ids')
        self.elmo_r_snt_ids = tf.placeholder(tf.int32, [None],
                                             name='elmo_r_snt_ids')

    def add_elmo_placeholder_with_cache_emb(self):
        """
        add placeholders for elmo ops, which will be used in the weight_layers
        """
        if self.need_p_cache:
            self.elmo_p_lm_embeddings = tf.placeholder(
                tf.float32, [None, self.output_layers, None, self.output_dim],
                name='elmo_p_lm_embeddings')
            self.elmo_p_lengths = tf.placeholder(tf.int32, [None],
                                                 name='elmo_p_lengths')
            self.elmo_p_mask = tf.placeholder(tf.int32, [None, None],
                                              name='elmo_p_mask')

        self.elmo_pu_lm_embeddings = tf.placeholder(
            tf.float32, [None, self.output_layers, None, self.output_dim],
            name='elmo_pu_lm_embeddings')
        self.elmo_pu_lengths = tf.placeholder(tf.int32, [None],
                                              name='elmo_pu_lengths')
        self.elmo_pu_mask = tf.placeholder(tf.int32, [None, None],
                                           name='elmo_pu_mask')
        self.elmo_q_lm_embeddings = tf.placeholder(
            tf.float32, [None, self.output_layers, None, self.output_dim],
            name='elmo_q_lm_embeddings')
        self.elmo_q_lengths = tf.placeholder(tf.int32, [None],
                                             name='elmo_q_lengths')
        self.elmo_q_mask = tf.placeholder(tf.int32, [None, None],
                                          name='elmo_q_mask')
        self.elmo_r_lm_embeddings = tf.placeholder(
            tf.float32, [None, self.output_layers, None, self.output_dim],
            name='elmo_r_lm_embeddings')
        self.elmo_r_lengths = tf.placeholder(tf.int32, [None],
                                             name='elmo_r_lengths')
        self.elmo_r_mask = tf.placeholder(tf.int32, [None, None],
                                          name='elmo_r_mask')

    def prepare_elmo_cache_feed_dict_sntids(self, feed_dict, batch):
        """
        Consistently feed the batch_data prepared in prepare_passage_elmo, question_elmo and answer_elmo.
        """
        if self.need_p_cache:
            # for elmo_p
            feed_dict[
                self.elmo_p_lm_embeddings] = batch['elmo_p_lm_embeddings']
            feed_dict[self.elmo_p_lengths] = batch['elmo_p_lengths']
            feed_dict[self.elmo_p_mask] = batch['elmo_p_mask']

        # for elmo_q
        feed_dict[self.elmo_q_snt_ids] = batch['elmo_q_snt_ids']
        # for elmo_pu
        feed_dict[self.elmo_pu_snt_ids] = batch['elmo_pu_snt_ids']
        # for elmo_r
        feed_dict[self.elmo_r_snt_ids] = batch['elmo_r_snt_ids']

    def prepare_elmo_cache_feed_dict_emb(self, feed_dict, batch):
        """
        Consistently feed the batch_data prepared in prepare_passage_elmo, question_elmo and answer_elmo.
        """
        if self.need_p_cache:
            # for elmo_p
            feed_dict[
                self.elmo_p_lm_embeddings] = batch['elmo_p_lm_embeddings']
            feed_dict[self.elmo_p_lengths] = batch['elmo_p_lengths']
            feed_dict[self.elmo_p_mask] = batch['elmo_p_mask']

        # for elmo_q
        feed_dict[self.elmo_q_lm_embeddings] = batch['elmo_q_lm_embeddings']
        feed_dict[self.elmo_q_lengths] = batch['elmo_q_lengths']
        feed_dict[self.elmo_q_mask] = batch['elmo_q_mask']

        # for elmo_pu
        feed_dict[self.elmo_pu_lm_embeddings] = batch['elmo_pu_lm_embeddings']
        feed_dict[self.elmo_pu_lengths] = batch['elmo_pu_lengths']
        feed_dict[self.elmo_pu_mask] = batch['elmo_pu_mask']

        # for elmo_r
        feed_dict[self.elmo_r_lm_embeddings] = batch['elmo_r_lm_embeddings']
        feed_dict[self.elmo_r_lengths] = batch['elmo_r_lengths']
        feed_dict[self.elmo_r_mask] = batch['elmo_r_mask']

    def elmo_embedding_layer_emb(self, elmo_emb_output):
        """
        elmo embedding layers, which return embeddings for p, q, pu and r
        after projection, dim is elmo_emb_output;
        if elmo_emb_output == self.output_dim, then no projection will be done
        """
        self.logger.info('build elmo embedding layer')
        if self.need_p_cache:
            p_emb_elmo_op = {
                'lm_embeddings': self.elmo_p_lm_embeddings,
                'lengths': self.elmo_p_lengths,
                'mask': self.elmo_p_mask
            }

        q_emb_elmo_op = {
            'lm_embeddings': self.elmo_q_lm_embeddings,
            'lengths': self.elmo_q_lengths,
            'mask': self.elmo_q_mask
        }

        pu_emb_elmo_op = {
            'lm_embeddings': self.elmo_pu_lm_embeddings,
            'lengths': self.elmo_pu_lengths,
            'mask': self.elmo_pu_mask
        }

        r_emb_elmo_op = {
            'lm_embeddings': self.elmo_r_lm_embeddings,
            'lengths': self.elmo_r_lengths,
            'mask': self.elmo_r_mask
        }

        with tf.device('/device:GPU:1'):
            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                if self.need_p_cache:
                    self.p_elmo_emb = self.weight_layers(
                        'input', p_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.q_elmo_emb = self.weight_layers(
                    'input', q_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.pu_elmo_emb = self.weight_layers(
                    'input', pu_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.r_elmo_emb = self.weight_layers(
                    'input', r_emb_elmo_op, l2_coef=0.0)['weighted_op']
                # project the ELMo embedding down (e.g. to 128 dims) so it can be concatenated with the word embedding.
                if elmo_emb_output == self.output_dim:
                    self.logger.info(
                        "Elmo_emb_output={} is just equal to the output_dim={}, no need to project with fully connected layers for passage and questions"
                        .format(elmo_emb_output, self.output_dim))
                else:
                    self.logger.info(
                        "Elmo_emb_output={}, output_dim={}, project with fully connected layers for question and passage"
                        .format(elmo_emb_output, self.output_dim))
                    if self.need_p_cache:
                        self.p_elmo_emb = tf.contrib.layers.fully_connected(
                            inputs=self.p_elmo_emb,
                            num_outputs=elmo_emb_output,
                            activation_fn=tf.nn.softmax)

                    self.q_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.q_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.pu_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.pu_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.r_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.r_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)

    def elmo_embedding_layer_sntids(self, elmo_emb_output):
        """
        elmo embedding layers built from the cached sentence-id lookups, which return embeddings for p, q, pu and r
        after projection, dim is elmo_emb_output;
        if elmo_emb_output == self.output_dim, then no projection will be done
        """
        with tf.device('/cpu:0'), tf.variable_scope('elmo_embedding'):
            self.elmo_lm_embeddings_lookup = tf.get_variable(
                'lm_embeddings_lookup',
                shape=np.shape(self.utterance_cache_in_mem['lm_embeddings']),
                initializer=tf.constant_initializer(
                    self.utterance_cache_in_mem['lm_embeddings']),
                trainable=False)

            self.elmo_lengths_lookup = tf.get_variable(
                'lengths_lookup',
                shape=(np.shape(self.utterance_cache_in_mem['lengths'])),
                initializer=tf.constant_initializer(
                    self.utterance_cache_in_mem['lengths']),
                trainable=False)

            self.elmo_mask_lookup = tf.get_variable(
                'mask_lookup',
                shape=np.shape(self.utterance_cache_in_mem['mask']),
                initializer=tf.constant_initializer(
                    self.utterance_cache_in_mem['mask']),
                trainable=False)

        if self.need_p_cache:
            p_emb_elmo_op = {
                'lm_embeddings': self.elmo_p_lm_embeddings,
                'lengths': self.elmo_p_lengths,
                'mask': self.elmo_p_mask
            }

        q_emb_elmo_op = {
            'lm_embeddings':
            tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup,
                                   self.elmo_q_snt_ids),
            'lengths':
            tf.nn.embedding_lookup(self.elmo_lengths_lookup,
                                   self.elmo_q_snt_ids),
            'mask':
            tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_q_snt_ids)
        }

        pu_emb_elmo_op = {
            'lm_embeddings':
            tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup,
                                   self.elmo_pu_snt_ids),
            'lengths':
            tf.nn.embedding_lookup(self.elmo_lengths_lookup,
                                   self.elmo_pu_snt_ids),
            'mask':
            tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_pu_snt_ids)
        }

        r_emb_elmo_op = {
            'lm_embeddings':
            tf.nn.embedding_lookup(self.elmo_lm_embeddings_lookup,
                                   self.elmo_r_snt_ids),
            'lengths':
            tf.nn.embedding_lookup(self.elmo_lengths_lookup,
                                   self.elmo_r_snt_ids),
            'mask':
            tf.nn.embedding_lookup(self.elmo_mask_lookup, self.elmo_r_snt_ids)
        }

        with tf.device('/device:GPU:1'):
            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                if self.need_p_cache:
                    self.p_elmo_emb = self.weight_layers(
                        'input', p_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.q_elmo_emb = self.weight_layers(
                    'input', q_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.pu_elmo_emb = self.weight_layers(
                    'input', pu_emb_elmo_op, l2_coef=0.0)['weighted_op']
                self.r_elmo_emb = self.weight_layers(
                    'input', r_emb_elmo_op, l2_coef=0.0)['weighted_op']
                # project the ELMo embedding down (e.g. to 128 dims) so it can be concatenated with the word embedding.
                if elmo_emb_output == self.output_dim:
                    self.logger.info(
                        "Elmo_emb_output={} is just equal to the output_dim={}, no need to project with fully connected layers for question and passage"
                        .format(elmo_emb_output, self.output_dim))
                else:
                    self.logger.info(
                        "Elmo_emb_output={}, output_dim={}, project with fully connected layers for question and passage"
                        .format(elmo_emb_output, self.output_dim))
                    if self.need_p_cache:
                        self.p_elmo_emb = tf.contrib.layers.fully_connected(
                            inputs=self.p_elmo_emb,
                            num_outputs=elmo_emb_output,
                            activation_fn=tf.nn.softmax)

                    self.q_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.q_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.pu_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.pu_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)
                    self.r_elmo_emb = tf.contrib.layers.fully_connected(
                        inputs=self.r_elmo_emb,
                        num_outputs=elmo_emb_output,
                        activation_fn=tf.nn.softmax)

    def assemble_elmo_batch_data(self, name, batch_data, id_key, cache):
        lm_embeddings = cache['lm_embeddings']['{}'.format(id_key)][...]
        length = cache['lengths']['{}'.format(id_key)][0]
        mask = cache['mask']['{}'.format(id_key)][...]
        batch_data['elmo_{}_lm_embeddings'.format(name)].append(lm_embeddings)
        batch_data['elmo_{}_lengths'.format(name)].append(length)
        batch_data['elmo_{}_mask'.format(name)].append(mask)

    def assemble_elmo_batch_data_with_mem(self, name, batch_data, id_key,
                                          cache_in_mem):
        """
        id_key is int here, for the snt_id
        """
        lm_embeddings = cache_in_mem['lm_embeddings'][id_key]
        length = cache_in_mem['lengths'][id_key]
        mask = cache_in_mem['mask'][id_key]
        batch_data['elmo_{}_lm_embeddings'.format(name)].append(lm_embeddings)
        batch_data['elmo_{}_lengths'.format(name)].append(length)
        batch_data['elmo_{}_mask'.format(name)].append(mask)

    def assemble_elmo_with_snt_ids(self, name, batch_data, id_key):
        """
        id_key is int here, for the snt_id
        """
        batch_data['elmo_{}_snt_ids'.format(name)].append(id_key)
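A minimal usage sketch of the class above (file paths, the sentence-dictionary file and the cache location are placeholders, not from the original): build the utterance cache once, then reuse it when assembling batch data.

elmo_utils = ELMo_Utils(
    elmo_vocab_file='elmo/vocab.txt',
    elmo_weight_file='elmo/weights.hdf5',
    elmo_option_file='elmo/options.json',
    use_character_elmo=True,
    use_concat_p=False,
    question_window=1,
    utterance_cache_file='cache/utterances.hdf5')
if elmo_utils.utterance_cache is None:
    # first run: no cache on disk yet, so build it from the sentence dictionary
    elmo_utils.build_elmo_cache('data/snt_dict.txt',
                                max_snt_length=60,
                                output_cache_file='cache/utterances.hdf5')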
Example #23
0
def validate():
    print("Validating with validation set...")
    # load validation set
    print("Loading validation set...")
    valid_labels, valid_text_ids = load_labels(valid_label_path,
                                               label_index,
                                               num_classes,
                                               one_hot=False)
    valid_texts = load_texts(valid_data_path, valid_text_ids)
    valid_size = len(valid_texts)
    tokenized_valid_texts = tokenize(valid_texts)

    # Create a TokenBatcher to map text to token ids
    batcher = TokenBatcher(train_vocab_file)

    # restore the TextCNN model
    print("Restoring TextCNN model...")
    tf.reset_default_graph()
    cnn_path = rootdir + r"models\TextCNN_with_elmo\4-classifier\\" \
               + r"label" + str(label_index) \
               + r"\TextCNN_with_elmo"
    cnn_path_dir = os.path.dirname(cnn_path)
    meta_file_name = os.listdir(cnn_path_dir)[-1]
    meta_path = os.path.join(cnn_path_dir, meta_file_name)

    sess = tf.InteractiveSession()
    saver = tf.train.import_meta_graph(meta_path)
    saver.restore(sess, tf.train.latest_checkpoint(os.path.dirname(cnn_path)))

    graph = tf.get_default_graph()
    input_tensors = {
        't_real_text': graph.get_tensor_by_name('real_text_input:0'),
        'input_y': graph.get_tensor_by_name('input_y:0')
    }
    feedback = {
        'loss': graph.get_tensor_by_name('loss/add:0'),
        'accuracy': graph.get_tensor_by_name('accuracy/accuracy:0')
    }
    features = graph.get_tensor_by_name('g_conv/Squeeze:0')

    # Extracting features of texts in validation set
    print("Extracting features of texts in validation set(4 classes)...")
    all_features = []
    batch_num = valid_size // batch_size
    for batch_no in range(batch_num):
        # Python slicing clamps out-of-range indices, so no IndexError handling is needed here.
        batch_texts = tokenized_valid_texts[batch_no * batch_size:(batch_no + 1) *
                                            batch_size]
        batch_texts = batcher.batch_sentences(batch_texts)
        batch_texts = pad_and_cut(batch_texts, MAX_LEN)

        batch_features = sess.run(
            features, feed_dict={input_tensors['t_real_text']: batch_texts})
        all_features.append(batch_features)
    valid_features = np.vstack(all_features)

    sess.close()

    # restore the 4-class-svm model
    print("Restoring 4-class svm and predicting labels...")
    svm_path = rootdir + r"models\SVM\4-classifier\\" \
                 + r"\svm_label" + str(label_index) + r".m"
    clf = joblib.load(svm_path)

    # predict labels of validation set with svm
    print("Predicting labels of validation set with svm...")
    predicted = clf.predict(valid_features)
    print(classification_report(valid_labels, predicted))

    acc = accuracy_score(valid_labels, predicted)
    f11 = f1_score(valid_labels, predicted, average=None)
    f12 = f1_score(valid_labels, predicted, average='macro')
    print("f11=", f11)
    print("f12=", f12)
    print("acc=", acc)
Example #24
0
# Create a TokenBatcher to map text to token ids.
batcher = TokenBatcher(vocab_file)  # REQUIRED

# Build the Elmo with biLM and weight layers.
elmo = Elmo(
    options_file,
    weight_file,
    token_embedding_file=token_embedding_file,  # REQUIRED
    token_batcher=batcher,  # REQUIRED
    num_output_representations=1,
    requires_grad=False,
    do_layer_norm=False,
    dropout=0.)

# Create batches of data.
context_token_ids = batcher.batch_sentences(
    tokenized_context, add_bos_eos=False)
question_token_ids = batcher.batch_sentences(
    tokenized_question, add_bos_eos=False)
# numpy.ndarray or cupy.ndarray
# with shape (batchsize, max_length)

if gpu >= 0:
    # transfer the model to the gpu
    chainer.cuda.get_device_from_id(gpu).use()
    elmo.to_gpu()
    # transfer input data to the gpu
    context_token_ids = elmo.xp.asarray(context_token_ids)
    question_token_ids = elmo.xp.asarray(question_token_ids)

# Compute elmo outputs,
# i.e. weighted sum of multi-layer biLM's outputs.
class ProcessData:
    def __init__(self, params):

        self.data_path = params.data_path
        self.params = params

        if params.IS_DEBUG:
            print('debug mode')
            # load data for debugging
            self.train = self.load_data(self.data_path +
                                        self.params.DATA_DEBUG)
            self.dev = self.load_data(self.data_path + self.params.DATA_DEBUG)
            self.test = self.load_data(self.data_path + self.params.DATA_DEBUG)

        else:
            # load data
            self.train = self.load_data(self.data_path +
                                        self.params.DATA_TRAIN)
            self.dev = self.load_data(self.data_path + self.params.DATA_DEV)
            self.test = self.load_data(self.data_path + self.params.DATA_TEST)

        # batcher for ELMo
        if self.params.USE_CHAR_ELMO:
            print('[INFO] character-level ELMo')
            self.batcher = Batcher(self.data_path + self.params.DIC, 50)
        else:
            print('[INFO] cached-token-level ELMo')
            self.batcher = TokenBatcher(self.data_path + self.params.DIC)

        self.dic_size = 0
        with open(self.data_path + self.params.DIC, 'r') as f:
            self.dic = f.readlines()
            self.dic = [x.strip() for x in self.dic]
            self.dic_size = len(self.dic)

        print('[completed] load data, dic_size: ', self.dic_size)

    def load_data(self, file_path):

        with open(file_path, 'rb') as f:
            dataset = pickle.load(f)

        print('load data : ', file_path, len(dataset))

        return dataset

    def get_glove(self):

        print('[load glove] ' + self.params.GLOVE)
        return np.load(self.data_path + self.params.GLOVE)

    """
        inputs: 
            data         : data to be processed (train/dev/test)
            batch_size   : mini-batch size
            is_test      : True, inference stage (ordered input)  (default : False)
            start_index  : start index of mini-batch (will be used when is_test==True)

        return:
            list_q       : [batch, time_step(==MAX_LENGTH_Q)], questions
            list_s       : [batch, MAX_SENTENCES, time_step(==MAX_LENGTH_S)], sentences
            list_graph   : [batch, MAX_SENTENCES+1, MAX_SENTENCES+1], adjacency matrix of graph [question ; sentecens]
            list_l       : [batch], labels
            
            list_len_q   : [batch]. vaild sequecne length
            list_len_s   : [batch, MAX_SENTENCES]. vaild sequecne length
            list_num_s   : [batch], valid number of sentences
    """

    def get_batch(self, data, batch_size, is_test=False, start_index=0):

        list_q, list_s, list_graph, list_l = [], [], [], []
        list_len_q, list_len_s, list_num_s = [], [], []

        index = start_index

        # Get a random batch of encoder and encoderR inputs from data,
        # pad them if needed

        for _ in range(batch_size):

            tmp_list_s, tmp_list_len_s, tmp_list_l = [], [], []
            tmp_list_graph = np.zeros(
                [self.params.MAX_SENTENCES + 1, self.params.MAX_SENTENCES + 1],
                dtype=np.int32)

            if is_test is False:
                # train case -  random sampling
                q, s, i, l = random.choice(data)
                s = s[:self.params.MAX_SENTENCES]
                i = [x for x in i if x < self.params.MAX_SENTENCES]

            else:
                if index >= len(data):
                    # dummy data ( use index 0 data )
                    q, s, i, l = data[
                        0]  # dummy for batch - will not be evaluated
                    s = s[:self.params.MAX_SENTENCES]
                    i = [x for x in i if x < self.params.MAX_SENTENCES]
                    index += 1
                else:
                    # real data
                    q, s, i, l = data[index]
                    s = s[:self.params.MAX_SENTENCES]
                    i = [x for x in i if x < self.params.MAX_SENTENCES]
                    index += 1

            tmp_q = q.copy()
            tmp_q = tmp_q[:(
                self.params.MAX_LENGTH_Q - 3
            )]  # [make room] elmo will add <S>, 0 (last padding), we added <\S>
            tmp_q.append('<\\S>')

            list_q.append(tmp_q)
            list_len_q.append(
                min(len(tmp_q) - 1,
                    self.params.MAX_LENGTH_Q))  # ignore special token </S>

            # add data as many as MAX_ANSWERS
            for tmp_i in range(self.params.MAX_SENTENCES):

                # real data
                if tmp_i < len(s):
                    # Add pad to data & Calculate seq_length (for later use)
                    # negative case will not generate pad array

                    tmp_s = s[tmp_i].copy()
                    tmp_s = tmp_s[:(
                        self.params.MAX_LENGTH_S - 3
                    )]  # elmo will add <S>, 0 (last padding), we added <\S>
                    tmp_s.append('<\\S>')

                    tmp_list_s.append(tmp_s)
                    tmp_list_len_s.append(
                        min(len(tmp_s) - 1, self.params.MAX_LENGTH_S)
                    )  # ignore special token </S>

                    tmp_list_l.append(int(l[tmp_i]))

                else:
                    # Add dummy data (data from index 0)
                    tmp_s = s[0].copy()
                    tmp_s = tmp_s[:(
                        self.params.MAX_LENGTH_S - 3
                    )]  # elmo will add <S>, 0 (last padding), we added <\S>
                    tmp_s.append('<\\S>')

                    tmp_list_s.append(tmp_s)
                    #tmp_list_len_s.append( min(len(tmp_s)-1,self.params.MAX_LENGTH_S) )  # ignore special token </S>
                    tmp_list_len_s.append(0)  # dummy sentence: length 0 so it is ignored downstream
                    tmp_list_l.append(int(l[0]))

            # build graph adj matrix [question;sentences]

            # edge btw question and each sentence ( +1 for question )
            # [ max_sentence +1, max_sentence +1 ]
            tmp_list_graph[0][:len(s) + 1] = 1
            q_offset = 1

            i.append(
                len(s)
            )  # i holds the start index of each passage; append the total number of sentences as a final boundary
            start_s, end_s = -1, -1

            for sen_index in i:
                start_s = end_s
                end_s = sen_index

                # skipping initial condition
                if (start_s != -1):

                    tmp_same_passage = [
                    ]  # for checking the index of sentence in the same passage
                    # edge btw sentences in the same passage
                    for tmp_i in range(start_s, end_s):

                        if self.params.EDGE_SENTENCE_QUESTION:
                            tmp_list_graph[
                                tmp_i + q_offset][0] = 1  # edge with question

                        if self.params.EDGE_SELF:
                            tmp_list_graph[tmp_i +
                                           q_offset][tmp_i +
                                                     q_offset] = 1  # self edge

                        # edge with neighbor within passage
                        if self.params.EDGE_WITHIN_PASSAGE == 0:
                            if (tmp_i + 1 != end_s):
                                tmp_list_graph[tmp_i + q_offset][
                                    tmp_i + 1 +
                                    q_offset] = 1  # edge with neighbor
                                tmp_list_graph[tmp_i + 1 + q_offset][
                                    tmp_i + q_offset] = 1  # edge with neighbor

                        tmp_same_passage.append(tmp_i + q_offset)

                    # edge fully-connected within passage
                    if self.params.EDGE_WITHIN_PASSAGE == 1:
                        for sent_idx in tmp_same_passage:
                            copy_tmp_same_passage = list(tmp_same_passage)
                            copy_tmp_same_passage.remove(
                                sent_idx
                            )  # self-connection is defined from params.EDGE_SELF
                            tmp_list_graph[sent_idx][
                                copy_tmp_same_passage] = 1  # q_offset is already applied

                    # edge fully-connected among first sentence of the passage
                    if self.params.EDGE_PASSAGE_PASSAGE:
                        tmp_passage_index = list(i)[:-1]  # remove last index
                        tmp_passage_index = [
                            (x + q_offset) for x in tmp_passage_index
                        ]  # q offset

                        for passage_idx in tmp_passage_index:
                            copy_tmp_passage_index = list(tmp_passage_index)
                            copy_tmp_passage_index.remove(
                                passage_idx
                            )  # self-connection is defined from params.EDGE_SELF
                            tmp_list_graph[passage_idx][
                                copy_tmp_passage_index] = 1  # q_offset is already applied

            list_graph.append(tmp_list_graph)
            list_s.append(tmp_list_s)
            list_len_s.append(tmp_list_len_s)
            list_l.append(tmp_list_l)
            list_num_s.append(len(s))

        list_s_reshape = np.reshape(
            list_s, (self.params.batch_size * self.params.MAX_SENTENCES))

        elmo_list_q = self.batcher.batch_sentences(list_q)
        elmo_list_s = self.batcher.batch_sentences(list_s_reshape)

        return elmo_list_q, elmo_list_s, list_graph, list_l, list_len_q, list_len_s, list_num_s
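For orientation, the loop above fills a [MAX_SENTENCES + 1] x [MAX_SENTENCES + 1] adjacency matrix per question: row/column 0 is the question node and sentence i sits at index i + 1. Below is a hedged, self-contained sketch of the same layout with made-up sizes (plain numpy standing in for the snippet's params flags), not the original code:

import numpy as np

# hypothetical sizes: 4 sentence slots, two passages covering sentences [0, 1] and [2, 3]
MAX_SENTENCES = 4
q_offset = 1                                   # sentence i -> row/column i + 1
adj = np.zeros((MAX_SENTENCES + 1, MAX_SENTENCES + 1), dtype=np.int32)

adj[0, :MAX_SENTENCES + 1] = 1                 # question edge to itself and every sentence
for start, end in [(0, 2), (2, 4)]:            # (start, end) sentence index per passage
    for i in range(start, end):
        adj[i + q_offset, 0] = 1               # EDGE_SENTENCE_QUESTION: sentence -> question
        adj[i + q_offset, i + q_offset] = 1    # EDGE_SELF
        if i + 1 < end:                        # EDGE_WITHIN_PASSAGE == 0: chain neighbours
            adj[i + q_offset, i + 1 + q_offset] = 1
            adj[i + 1 + q_offset, i + q_offset] = 1

print(adj)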
Example #26
0
class Data(object):
    # member variables like dictionaries and lists goes here
    def __init__(self, length=0, use_synonym=False):
        self.para_tuples = [
        ]  # [(sent_id, sent_id, index_of_an_overlapping/synonym_token, index_of_an_overlapping/synonym_token), ... ]
        self.neg_tuples = [
        ]  # [(sent_id, sent_id, index_of_an_overlapping/synonym_token, index_of_an_overlapping/synonym_token), ... ]
        self.token_pair2neg_tuples = {
        }  # {(token_id, token_id) : set([neg_tuple_id, ...])}
        self.id2sent = [
        ]  # a list of arrays, where each array is a list of token ids (representing a sentence); eventually, make this a numpy array
        self.sent2id = {}
        self.paraphrases = set(
            []
        )  # a set of {(sent_id, sent_id), ...} to quickly check whether two sentences are paraphrases or not.
        self.token2sents = {
        }  # reverse index of sentences given tokens. This is a map { token_id : set([(sent_id, index_of_the_token_in_the_sentence), ...]) }.
        self.synonyms = {}  # {token_id : set([token_id, ... ])}
        self.use_synonym = use_synonym
        self.stop_word_ids = set([])
        self.length = length
        # self.batch_sizeK = None  # to be read by the tester

        # build token_batcher
        self.word2id = {}
        self.id2word = []

    def build(self, vocab_file, stop_word_file, synonym_file=None):
        # 1. build TokenBatcher
        self.token_batcher = TokenBatcher(vocab_file)
        self.word2id = self.token_batcher._lm_vocab._word_to_id
        self.id2word = self.token_batcher._lm_vocab._id_to_word
        # 2. if synonym_file is not None, populate synonyms (two directions).
        if synonym_file is not None:
            with open(synonym_file, "r") as f:
                for line in f:
                    line = line.strip().split("\t")
                    if (line[0] in self.word2id and line[2] in self.word2id):
                        id0 = self.word2id[line[0]]
                        id1 = self.word2id[line[2]]
                        if (id1 == id0):
                            continue
                        self.synonyms.setdefault(id0, set()).add(id1)
                        self.synonyms.setdefault(id1, set()).add(id0)

        # 3. if stop_word_file is not None, populate stop_word_ids
        with open(stop_word_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line in self.word2id:
                    self.stop_word_ids.add(self.word2id[line])

    # Each dataset line is tab-separated; the label and sentence columns depend on data_type (e.g. MRPC keeps the sentences in columns 3 and 4).
    def load_sentece_pairs(self, data_file_list, bad_words, data_type_list):
        # 1. populate sentence_tuples, update sentences (check stop_word_ids), paraphrases, token2sents.
        # 2. populate negative cases of sentence pairs into neg_tuples, and correspondingly update token2neg_tuples, sentences (check stop_word_ids), token2sents.
        s_len = []
        for data_file, data_type in zip(data_file_list, data_type_list):
            with open(data_file, "rt", encoding="utf-8") as f:
                count = 0
                for line in f:
                    count += 1
                    if (count >= 20000):
                        break
                    line = line.strip().split('\t')
                    label = line[0]
                    if (data_type == "mrpc"):
                        s1 = line[3].split()
                        s2 = line[4].split()
                    else:
                        s1 = line[1].split()
                        s2 = line[2].split()

                    exist_bad_word = False
                    for i in bad_words:
                        if (i in s1 or i in s2):
                            exist_bad_word = True
                    if (exist_bad_word == True):
                        continue

                    # s1_tokenid = self.token_batcher.batch_sentences([s1])[0][1:][:-1]
                    # s2_tokenid = self.token_batcher.batch_sentences([s2])[0][1:][:-1]

                    # 1
                    s1_tokenid = self.token_batcher.batch_sentences([s1])[0]
                    s2_tokenid = self.token_batcher.batch_sentences([s2])[0]

                    # skip sentences longer than self.length or shorter than 3 tokens,
                    # then zero-pad the remainder up to self.length
                    s_len.append(len(s1_tokenid))
                    s_len.append(len(s2_tokenid))
                    if (len(s1_tokenid) > self.length or len(s1_tokenid) < 3):
                        print(s1_tokenid, s1)
                        continue
                    if (len(s2_tokenid) > self.length or len(s2_tokenid) < 3):
                        print(s2_tokenid, s2)
                        continue

                    if len(s1_tokenid) > self.length:
                        s1_tokenid = s1_tokenid[:self.length]
                    else:
                        s1_tokenid = np.pad(s1_tokenid,
                                            (0, self.length - len(s1_tokenid)),
                                            'constant',
                                            constant_values=(0))
                    if len(s2_tokenid) > self.length:
                        s2_tokenid = s2_tokenid[:self.length]
                    else:
                        s2_tokenid = np.pad(s2_tokenid,
                                            (0, self.length - len(s2_tokenid)),
                                            'constant',
                                            constant_values=(0))

                    if not (tuple(s1_tokenid) in self.sent2id):
                        self.id2sent.append(s1_tokenid)
                        s1_id = len(self.id2sent) - 1
                        self.sent2id.update({tuple(s1_tokenid): s1_id})
                    else:
                        s1_id = self.sent2id[tuple(s1_tokenid)]
                    if not (tuple(s2_tokenid) in self.sent2id):
                        self.id2sent.append(s2_tokenid)
                        s2_id = len(self.id2sent) - 1
                        self.sent2id.update({tuple(s2_tokenid): s2_id})
                    else:
                        s2_id = self.sent2id[tuple(s2_tokenid)]

                    #update paraphrases, para_tuples, neg_tuples
                    overlap_index_pairs, synonym_index_pairs = self.overlap(
                        s1_tokenid, s2_tokenid)
                    # print(s1_tokenid)
                    # print(s2_tokenid)
                    # print("overlap", overlap_index_pairs)
                    # if synonym_index_pairs:
                    #     print("synonym_index_pairs", synonym_index_pairs)
                    total_index_pairs = overlap_index_pairs + synonym_index_pairs
                    if (label == "1"):
                        self.paraphrases.add((s1_id, s2_id))
                        self.paraphrases.add((s2_id, s1_id))
                        for p in total_index_pairs:
                            sent_tuple = (s1_id, s2_id, p[0], p[1])
                            self.para_tuples.append(sent_tuple)
                    else:
                        for p in total_index_pairs:
                            sent_tuple = (s1_id, s2_id, p[0], p[1])
                            self.neg_tuples.append(sent_tuple)
                            w1 = s1_tokenid[p[0]]
                            w2 = s2_tokenid[p[1]]
                            if w1 in self.stop_word_ids or w2 in self.stop_word_ids:
                                continue
                            self.token_pair2neg_tuples.setdefault(
                                (w1, w2), set()).add(len(self.neg_tuples) - 1)

                    # update token2sents
                    for index, token_id in enumerate(s1_tokenid):
                        if (token_id == 2 or token_id == 1):
                            continue
                        sid_index = (s1_id, index)
                        self.token2sents.setdefault(token_id,
                                                    set()).add(sid_index)
                    for index, token_id in enumerate(s2_tokenid):
                        if (token_id == 2 or token_id == 1):
                            continue
                        sid_index = (s2_id, index)
                        self.token2sents.setdefault(token_id,
                                                    set()).add(sid_index)
        self.neg_tuples, self.para_tuples, self.id2sent = np.array(
            self.neg_tuples), np.array(self.para_tuples), np.array(
                self.id2sent)
        s_len = np.array(s_len)
        print("s length", np.min(s_len), np.max(s_len), np.mean(s_len),
              np.median(s_len))

    def overlap(self, s1, s2):
        # check intersection
        s1_dict = dict((k, i) for i, k in enumerate(s1))
        s2_dict = dict((k, i) for i, k in enumerate(s2))
        word_pairs = []
        inter = set(s1_dict).intersection(set(s2_dict))
        # drop the padding id (0) and the <S>/</S> special-token ids (1, 2)
        inter.discard(0)
        inter.discard(1)
        inter.discard(2)
        inter.difference_update(self.stop_word_ids)
        # check digit
        for i in inter.copy():
            if (self.id2word[i].isdigit()):
                inter.remove(i)
            if (self.id2word[i].startswith('-')):
                inter.remove(i)
        for w in inter:
            w1_id = s1_dict[w]
            w2_id = s2_dict[w]
            word_pairs.append([w1_id, w2_id])

        synonym_pairs = []
        if self.use_synonym:
            for id in s1_dict.keys():
                if id in self.synonyms:
                    for s in self.synonyms[id]:
                        if s in s2_dict.keys():
                            synonym_pairs.append((s1_dict[id], s2_dict[s]))
            # print(synonym_pairs)
            for id in s2_dict.keys():
                if id in self.synonyms:
                    for s in self.synonyms[id]:
                        if s in s1_dict.keys():
                            synonym_pairs.append((s1_dict[s], s2_dict[id]))
            # print(synonym_pairs)
            # print("------")
        synonym_pairs = list(set(synonym_pairs))
        return word_pairs, synonym_pairs

    def corrupt(self, para_tuple, tar=None):
        # corrupt para tuple into a negative sample. Return (sent_id, sent_id, index_of_an_overlapping/synonym_token, index_of_an_overlapping/synonym_token) for a negative sample.
        if tar is None:
            tar = random.randint(0, 1)
        s1 = para_tuple[0]
        s1_index = para_tuple[2]
        s2 = para_tuple[1]
        s2_index = para_tuple[3]

        if (tar == 0):
            token = self.id2sent[s1][s1_index]
            # copy so we do not mutate the shared index stored in token2sents
            sents_list = set(self.token2sents[token])

            if ((s1, s1_index) in sents_list):
                sents_list.remove((s1, s1_index))
            if ((s2, s2_index) in sents_list):
                sents_list.remove((s2, s2_index))
            if (len(sents_list) == 0):
                return random.choice(self.neg_tuples)
            else:
                corrupt_s = random.choice(list(sents_list))
            ind = 0
            while self.is_paraphrase(corrupt_s[0], s1):
                corrupt_s = random.choice(list(sents_list))
                ind += 1
                if ind > 10:
                    # give up and fall back to a random negative tuple
                    return random.choice(self.neg_tuples)
            return (corrupt_s[0], s1, corrupt_s[1], s1_index)

        if (tar == 1):
            token = self.id2sent[s2][s2_index]
            # copy so we do not mutate the shared index stored in token2sents
            sents_list = set(self.token2sents[token])

            if ((s1, s1_index) in sents_list):
                sents_list.remove((s1, s1_index))
            if ((s2, s2_index) in sents_list):
                sents_list.remove((s2, s2_index))
            if (len(sents_list) < 2):
                return random.choice(self.neg_tuples)
            else:
                corrupt_s = random.choice(list(sents_list))
            ind = 0
            while self.is_paraphrase(corrupt_s[0], s2):
                corrupt_s = random.choice(list(sents_list))
                ind += 1
                if ind > 10:
                    # give up and fall back to a random negative tuple
                    return random.choice(self.neg_tuples)
            c_tuple = (corrupt_s[0], s2, corrupt_s[1], s2_index)
            return c_tuple

    def neg(self, para_tuple):
        s1 = para_tuple[0]
        s1_index = para_tuple[2]
        s2 = para_tuple[1]
        s2_index = para_tuple[3]
        s1_token = self.id2sent[s1][s1_index]
        s2_token = self.id2sent[s2][s2_index]
        if ((s1_token, s2_token) in self.token_pair2neg_tuples):
            neg_tuple_id = random.choice(
                list(self.token_pair2neg_tuples[(s1_token, s2_token)]))
            neg_tuple = self.neg_tuples[neg_tuple_id]
            return neg_tuple
        else:
            return None

    def corrupt_n(self, para_tuple, n=2):
        # in case we use logistic loss, use the corrupt function n times to generate and return n negative samples. Before each corruption, the random seed needs to be reset.
        corrupt_tuples = []
        for i in range(n):
            random.seed(datetime.now())
            corrupt_tuple = self.corrupt(para_tuple)
            if not corrupt_tuple:
                return None
            else:
                corrupt_tuples.append(corrupt_tuple)
        return corrupt_tuples

    def is_synonym(self, token_id1, token_id2):
        # self.synonyms maps token_id -> set of synonym token_ids
        return token_id1 in self.synonyms.get(token_id2, set())

    def is_paraphrase(self, sent_id1, sent_id2):
        if ((sent_id1, sent_id2) in self.paraphrases):
            return True
        else:
            return False

    def save(self, filename):
        f = open(filename, 'wb')
        #self.desc_embed = self.desc_embed_padded = None
        pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        print("Save data object as", filename)

    def load(self, filename):
        with open(filename, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)
        print("Loaded data object from", filename)
        print(
            "===============\nCaution: need to reload desc embeddings.\n====================="
        )
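A hedged usage sketch for the Data class above; the file names are hypothetical placeholders, and it relies on the same module-level imports the class already uses (random, pickle, numpy, TokenBatcher):

# hypothetical files: vocab.txt, stop_words.txt, synonyms.tsv, train.tsv (MRPC-style columns)
data = Data(length=60, use_synonym=True)
data.build("vocab.txt", "stop_words.txt", synonym_file="synonyms.tsv")
data.load_sentece_pairs(["train.tsv"], bad_words=[], data_type_list=["mrpc"])

# sample a paraphrase tuple, then derive corrupted / negative counterparts from it
para = random.choice(data.para_tuples)
print("corrupted:", data.corrupt(para))
print("shared-token negative:", data.neg(para))

data.save("paraphrase_data.pkl")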
Example #27
0
class PreTrainElmoProcess(object):
    def __init__(self, path=embedding_path, embedding_dim=512,
                 sentence_len=max_sentence_len, pair_mode=False):
        embeddings = dict()

        self.embedding_path = path
        self.embedding_dim = embedding_dim
        self.sentence_len = sentence_len
        self.pair_mode = pair_mode
        self.embedding_dict = embeddings

        g_elmo = tf.Graph()
        vocab_file = './bilmelmo/data/vocab.txt'
        options_file = './bilmelmo/try/options.json'
        weight_file = './bilmelmo/try/weights.hdf5'
        token_embedding_file = './bilmelmo/data/vocab_embedding.hdf5'

        with tf.Graph().as_default() as g_elmo:
            self.batcher = TokenBatcher(vocab_file)
            self.context_token_ids = tf.placeholder('int32', shape=(None, None))
            self.bilm = BidirectionalLanguageModel(
                options_file,
                weight_file,
                use_character_inputs=False,
                embedding_weight_file=token_embedding_file
            )

            self.context_embeddings_op = self.bilm(self.context_token_ids)
            self.elmo_context_input = weight_layers('input', self.context_embeddings_op, l2_coef=0.0)

            self.elmo_context_output = weight_layers(
                'output', self.context_embeddings_op, l2_coef=0.0
            )
            init = tf.global_variables_initializer()
        sess_elmo = tf.Session(graph=g_elmo)
        sess_elmo.run(init)
        self.sess_elmo = sess_elmo

    def encode(self, sentence, **kwargs):
        if 'pair_mode' in kwargs.keys():
            if not isinstance(kwargs['pair_mode'], bool):
                raise TypeError("mode type must bool!")

        if 'pair_mode' in kwargs.keys() and kwargs['pair_mode']:
            try:
                assert isinstance(sentence, list)
            except AssertionError:
                print("sentence must be list!")
        else:
            try:
                assert isinstance(sentence, list)
                embedding_unk = [0.0 for _ in range(self.embedding_dim)]
                out_put = []

                for sentence_idx, _sentence in enumerate(sentence):

                    context_ids = self.batcher.batch_sentences(list(_sentence))
                    out_put_tmp = self.sess_elmo.run(
                        [self.elmo_context_input['weighted_op']],
                        feed_dict={self.context_token_ids: context_ids}
                    )[0][0].tolist()

                    for i in range(self.sentence_len - len(out_put_tmp)):
                        out_put_tmp.append(embedding_unk)

                    out_put_tmp = np.stack(out_put_tmp, axis=0)
                    out_put.append(out_put_tmp)

                return np.stack(out_put, axis=0)
            except AssertionError:
                print("sentence must be list!")
Example #28
0
apply_ops = optimizer.apply_gradients(gvs)

tvars = tf.trainable_variables()

acc = tf.metrics.accuracy(labels=tf.argmax(y_label, axis=2),
                          predictions=idx_output)

#output model
data_path = './NER_models'
model_save_name = 'NERModel'
final_model = os.path.join(data_path, model_save_name)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())  #for tf.metrics
    ids = batcher.batch_sentences(tokenized_sentences)
    ids_val = batcher.batch_sentences(tokenized_sentences_val)
    batch_size = ids.shape[0] // 128  # number of full batches (128 is the max batch size)
    batch_size_val = ids_val.shape[0] // 128
    saver = tf.train.Saver()
    for step in range(1000):
        for i in range(batch_size + 1):
            if (i < batch_size):
                s_index = i * 128
                e_index = (i + 1) * 128
                ids_i = ids[s_index:e_index]
                y_i = y[s_index:e_index]
                elmo_input_ = sess.run(elmo_input['weighted_op'],
                                       feed_dict={token_ids: ids_i})
                loss_, _ = sess.run([loss, apply_ops],
                                    feed_dict={
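Example #28 is cut off inside the training feed_dict above. Purely as a hedged sketch (x_input is a hypothetical placeholder for the pre-computed ELMo vectors; y_label, loss, apply_ops, saver, and final_model appear in the visible part), such a loop typically finishes along these lines:

                # hypothetical continuation -- not the original snippet
                loss_, _ = sess.run([loss, apply_ops],
                                    feed_dict={x_input: elmo_input_,
                                               y_label: y_i})
        if step % 100 == 0:
            print("step", step, "loss", loss_)
            saver.save(sess, final_model, global_step=step)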
Example #29
0
class Model(object):
    def __init__(self, config):
        self.lr = config["lr"]
        self.input_dropout = config["dropout"]
        self.lstm_dim = config["lstm_dim"]
        self.layer_type = config["layer_type"]
        self.use_attention = config["attention"]
        self.num_attention_heads = config['num_attention_heads']
        self.size_per_head = config['size_per_head']
        self.num_tags = 7
        self.char_dim = 300
        self.global_step = tf.Variable(0, trainable=False)
        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
        self.initializer = initializers.xavier_initializer()

        # elmo
        self.batcher = TokenBatcher(config['vocab_file'])
        # Input placeholders to the biLM.
        self.context_token_ids = tf.placeholder('int32', shape=(None, None))
        # Build the biLM graph.
        self.bilm = BidirectionalLanguageModel(
            config['options_file'],
            config['weight_file'],
            use_character_inputs=False,
            embedding_weight_file=config['token_embedding_file'])
        self.context_embeddings_op = self.bilm(self.context_token_ids)
        self.elmo_context_input = weight_layers('input',
                                                self.context_embeddings_op,
                                                l2_coef=0.0)['weighted_op']

        # add placeholders for the model
        self.mask_inputs = tf.placeholder(dtype=tf.int32,
                                          shape=[None, None],
                                          name="ChatInputs")
        self.targets = tf.placeholder(dtype=tf.int32,
                                      shape=[None, None],
                                      name="Targets")

        # dropout keep prob
        self.dropout = tf.placeholder(dtype=tf.float32, name="Dropout")
        used = tf.sign(tf.abs(self.mask_inputs))
        length = tf.reduce_sum(used, reduction_indices=1)
        self.lengths = tf.cast(length, tf.int32)
        self.batch_size = tf.shape(self.mask_inputs)[0]
        self.num_steps = tf.shape(self.mask_inputs)[-1]

        self.logits = self.inference(self.elmo_context_input)
        # loss of the model
        self.loss = self.loss_layer(self.logits, self.lengths)
        self.train_op = self.train(self.loss)
        # saver of the model
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    def loss(self, embedding):
        # Note: unused helper; __init__ overwrites self.loss with the loss tensor.
        logits = self.inference(embedding)
        loss = self.loss_layer(logits, self.lengths)
        return loss

    def train(self, loss):
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(self.lr)
            # apply grad clip to avoid gradient explosion
            grads_vars = opt.compute_gradients(loss)
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v]
                                 for g, v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars, self.global_step)
            return train_op

    def single_biLSTM_layer(self, model_inputs, lstm_dim, lengths):
        """
        :param lstm_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        with tf.variable_scope("first_layer"):
            first_fw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                         state_is_tuple=True)
            # add dropout to the hidden layer to prevent overfitting
            first_fw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                first_fw_lstm_cell, output_keep_prob=self.dropout)
            first_bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                         state_is_tuple=True)
            first_bw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                first_bw_lstm_cell, output_keep_prob=self.dropout)
            first_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                first_fw_lstm_cell,
                first_bw_lstm_cell,
                model_inputs,
                sequence_length=lengths,
                dtype=tf.float32)
            output = tf.concat(first_outputs, -1)
        return output

    def concat_biLSTM_layer(self, model_inputs, lstm_dim, lengths):
        """
        :param lstm_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        with tf.variable_scope("first_layer"):
            first_fw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                         state_is_tuple=True)
            # add dropout to the hidden layer to prevent overfitting
            first_fw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                first_fw_lstm_cell, output_keep_prob=self.dropout)
            first_bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                         state_is_tuple=True)
            first_bw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                first_bw_lstm_cell, output_keep_prob=self.dropout)
            first_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                first_fw_lstm_cell,
                first_bw_lstm_cell,
                model_inputs,
                sequence_length=lengths,
                dtype=tf.float32)
            first_layer_output = tf.concat(first_outputs, -1)

        with tf.variable_scope("second_layer"):
            second_fw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                          state_is_tuple=True)
            # add dropout to the hidden layer to prevent overfitting
            second_fw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                second_fw_lstm_cell, output_keep_prob=self.dropout)
            second_bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                          state_is_tuple=True)
            second_bw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                second_bw_lstm_cell, output_keep_prob=self.dropout)
            second_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                second_fw_lstm_cell,
                second_bw_lstm_cell,
                first_layer_output,
                sequence_length=lengths,
                dtype=tf.float32)
            second_layer_output = tf.concat(second_outputs, -1)

        return tf.concat([first_layer_output, second_layer_output], axis=-1)

    def stack_biLSTM_layer(self, model_inputs, lstm_dim, lengths):
        """
        :param lstm_inputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, 2*lstm_dim]
        """
        fw_lstms, bw_lstms = [], []
        for _ in range(2):
            fw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                   state_is_tuple=True)
            # add dropout to the hidden layer to prevent overfitting
            fw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                fw_lstm_cell, output_keep_prob=self.dropout)
            fw_lstms.append(fw_lstm_cell)

            bw_lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_dim,
                                                   state_is_tuple=True)
            bw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
                bw_lstm_cell, output_keep_prob=self.dropout)
            bw_lstms.append(bw_lstm_cell)
        outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
            fw_lstms,
            bw_lstms,
            model_inputs,
            sequence_length=lengths,
            dtype=tf.float32)
        return outputs

    def project_layer_bilstm(self, lstm_outputs, num):
        """
        hidden layer between lstm layer and logits
        :param lstm_outputs: [batch_size, num_steps, emb_size]
        :return: [batch_size, num_steps, num_tags]
        """
        with tf.variable_scope("project"):
            with tf.variable_scope("attention"):
                if self.use_attention:
                    attention_outputs = attention_layer(
                        lstm_outputs, lstm_outputs, self.mask_inputs,
                        self.num_attention_heads, self.size_per_head)
                else:
                    attention_outputs = lstm_outputs
            with tf.variable_scope("hidden"):
                if self.use_attention:
                    w_shape = [
                        self.num_attention_heads * self.size_per_head,
                        self.lstm_dim
                    ]
                    output_shape = [
                        -1, self.num_attention_heads * self.size_per_head
                    ]
                else:
                    w_shape = [self.lstm_dim * num, self.lstm_dim]
                    output_shape = [-1, self.lstm_dim * num]
                W = tf.get_variable("W",
                                    shape=w_shape,
                                    dtype=tf.float32,
                                    initializer=self.initializer)
                b = tf.get_variable("b",
                                    shape=[self.lstm_dim],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                output = tf.reshape(attention_outputs, shape=output_shape)
                hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))

            # project to score of tags
            with tf.variable_scope("logits"):
                W = tf.get_variable("W",
                                    shape=[self.lstm_dim, self.num_tags],
                                    dtype=tf.float32,
                                    initializer=self.initializer)
                b = tf.get_variable("b",
                                    shape=[self.num_tags],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, W, b)
            return tf.reshape(pred, [-1, self.num_steps, self.num_tags])

    def inference(self, embedding):
        model_inputs = tf.nn.dropout(embedding, self.dropout)
        if self.layer_type == 'single':
            model_outputs = self.single_biLSTM_layer(model_inputs,
                                                     self.lstm_dim,
                                                     self.lengths)
            logits = self.project_layer_bilstm(model_outputs, 2)
        elif self.layer_type == 'stack':
            model_outputs = self.stack_biLSTM_layer(model_inputs,
                                                    self.lstm_dim,
                                                    self.lengths)
            logits = self.project_layer_bilstm(model_outputs, 2)
        else:
            model_outputs = self.concat_biLSTM_layer(model_inputs,
                                                     self.lstm_dim,
                                                     self.lengths)
            logits = self.project_layer_bilstm(model_outputs, 4)
        return logits

    def loss_layer(self, project_logits, lengths):
        """
        calculate crf loss
        :param project_logits: [batch_size, num_steps, num_tags]
        :return: scalar loss
        """
        with tf.variable_scope("crf_loss"):
            small = -1000.0
            # pad logits for crf loss
            start_logits = tf.concat([
                small * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
                tf.zeros(shape=[self.batch_size, 1, 1])
            ],
                                     axis=-1)
            pad_logits = tf.cast(
                small * tf.ones([self.batch_size, self.num_steps, 1]),
                tf.float32)
            logits = tf.concat([project_logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            targets = tf.concat([
                tf.cast(self.num_tags * tf.ones([self.batch_size, 1]),
                        tf.int32), self.targets
            ],
                                axis=-1)

            self.trans = tf.get_variable(
                "transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=self.initializer)
            log_likelihood, self.trans = crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=lengths + 1)
            loss = tf.reduce_mean(-log_likelihood)
            return loss

    def create_feed_dict(self, is_train, batch):
        str_input, masks, tags = batch
        token_ids = self.batcher.batch_sentences(str_input)
        feed_dict = {
            self.context_token_ids: np.asarray(token_ids),
            self.mask_inputs: np.asarray(masks),
            self.dropout: 1.0
        }
        if is_train:
            feed_dict[self.targets] = np.asarray(tags)
            feed_dict[self.dropout] = self.input_dropout
        return feed_dict

    def run_step(self, sess, is_train, batch):
        """
        :param sess: session to run the batch
        :param is_train: a flag indicate if it is a train batch
        :param batch: a dict containing batch data
        :return: batch result, loss of the batch or logits
        """
        feed_dict = self.create_feed_dict(is_train, batch)
        if is_train:
            global_step, loss, _ = sess.run(
                [self.global_step, self.loss, self.train_op], feed_dict)
            return global_step, loss
        else:
            lengths, logits = sess.run([self.lengths, self.logits], feed_dict)
            return lengths, logits

    def decode(self, logits, lengths, matrix):
        """
        :param logits: [batch_size, num_steps, num_tags]float32, logits
        :param lengths: [batch_size]int32, real length of each sequence
        :param matrix: transition matrix for inference
        :return:
        """
        # infer the final label sequence using the Viterbi algorithm
        paths = []
        small = -1000.0
        start = np.asarray([[small] * self.num_tags + [0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            logits = np.concatenate([score, pad], axis=1)
            logits = np.concatenate([start, logits], axis=0)
            path, _ = viterbi_decode(logits, matrix)

            paths.append(path[1:])
        return paths

    def evaluate(self, sess, data_manager):
        results = []
        trans = self.trans.eval()
        for batch in data_manager.iter_batch():
            strings = batch[0]
            tags = batch[-1]
            lengths, scores = self.run_step(sess, False, batch)
            batch_paths = self.decode(scores, lengths, trans)
            for i in range(len(strings)):
                result = []
                string = strings[i][:lengths[i]]
                gold = [[int(x)] for x in tags[i][:lengths[i]]]
                pred = [[int(x)] for x in batch_paths[i][:lengths[i]]]
                for char, gold, pred in zip(string, gold, pred):
                    result.append([char, gold, pred])
                results.append(result)
        return results
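Finally, a hedged driver sketch for the Model class in Example #29; the config values and train_manager are hypothetical placeholders, but the keys match what __init__ reads:

# hypothetical configuration; paths and hyper-parameters are placeholders
config = {
    "lr": 0.001, "dropout": 0.5, "lstm_dim": 128, "layer_type": "stack",
    "attention": False, "num_attention_heads": 8, "size_per_head": 64,
    "vocab_file": "vocab.txt", "options_file": "options.json",
    "weight_file": "weights.hdf5", "token_embedding_file": "vocab_embedding.hdf5",
}

with tf.Session() as sess:
    model = Model(config)
    sess.run(tf.global_variables_initializer())
    # train_manager is assumed to yield (str_input, masks, tags) batches
    for batch in train_manager.iter_batch():
        step, batch_loss = model.run_step(sess, True, batch)
        if step % 100 == 0:
            print("step", step, "loss", batch_loss)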