Example 1
def char_index(p_sentences, h_sentences):
    """Convert premise/hypothesis sentences to padded character-index sequences."""
    word2idx, idx2word = load_char_vocab()

    p_list, h_list = [], []
    for p_sentence, h_sentence in zip(p_sentences, h_sentences):
        # Keep only non-empty tokens that exist in the vocabulary.
        p = [word2idx[word.lower()] for word in p_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        h = [word2idx[word.lower()] for word in h_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]

        p_list.append(p)
        h_list.append(h)

    # Pad/truncate every sequence to the model's fixed length.
    maxlen = ESIMConfig().maxlen
    p_list = pad_sequences(p_list, maxlen=maxlen)
    h_list = pad_sequences(h_list, maxlen=maxlen)

    return p_list, h_list
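The padding call at the end is what turns the ragged lists of character indices into a fixed-width matrix. Below is a minimal, self-contained sketch of that behaviour, assuming the Keras implementation of pad_sequences (the repository may ship its own helper with the same keywords):

# Assumes tf.keras's pad_sequences; by default it pads and truncates at the front.
from tensorflow.keras.preprocessing.sequence import pad_sequences

seqs = [[3, 7, 2], [5], [9, 1, 4, 8, 6]]
print(pad_sequences(seqs, maxlen=4))
# [[0 3 7 2]
#  [0 0 0 5]
#  [1 4 8 6]]   <- the longest sequence loses its first element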
Example 2
def word_index(p_sentences, h_sentences):
    """Convert premise/hypothesis sentences to padded word-index sequences."""
    word2idx, idx2word = load_word_vocab()

    p_list, h_list = [], []
    for p_sentence, h_sentence in zip(p_sentences, h_sentences):
        # Keep only non-empty tokens that exist in the vocabulary.
        p = [word2idx[word.lower()] for word in p_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        h = [word2idx[word.lower()] for word in h_sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]

        p_list.append(p)
        h_list.append(h)

    # Pad/truncate every sequence to the configured length.
    p_list = pad_sequences(p_list, maxlen=args.seq_length)
    h_list = pad_sequences(h_list, maxlen=args.seq_length)

    return p_list, h_list
Example 3
def char_index_single(sentences):
    """Convert a single batch of sentences to padded character-index sequences."""
    word2idx, idx2word = load_char_vocab()
    p_list = []
    for sentence in sentences:
        # Keep only non-empty tokens that exist in the vocabulary.
        p = [word2idx[word.lower()] for word in sentence
             if len(word.strip()) > 0 and word.lower() in word2idx]
        p_list.append(p)
    p_list = pad_sequences(p_list, maxlen=ESIMConfig().maxlen)
    return p_list
Example 4
def main(config, eval_folder):

    # load the vocab files

    text_words_vocab = load_vocab(config.text_words_path)
    text_chars_vocab = load_vocab(config.text_chars_path)
    inv_text_vocab = {v: k for k, v in text_words_vocab.items()}

    # get the processing function
    processing_word = get_processing_word(text_words_vocab,
                                          text_chars_vocab,
                                          lowercase=True,
                                          chars=True)

    #load features:
    word_features = get_trimmed_features(config.word_embeddings_trimmed_path)

    examples = read_examples(eval_folder, processing_word)

    # build WImpModel

    model = WImpModel(config, word_features, None, text_words_vocab["$UNK$"],
                      inv_text_vocab, None)
    model.build_graph()

    words, word_feats, speech_interval_feats = [], [], []
    for sent_key in examples.keys():
        words_, word_feats_, speech_feats_ = zip(*examples[sent_key])
        word_feats_ = list(zip(*word_feats_))

        word_feats.append(word_feats_)
        speech_interval_feats.append(speech_feats_)
        words.append(words_)

    speech_interval_feats_pad_, speech_lengths = pad_sequences(
        speech_interval_feats,
        pad_tok=[0] * config.speech_features_dim,
        nlevels=2)
    speech_feats = speech_interval_feats_pad_[
        :, :, :, config.speech_lexical_features_dim:]
    speech_lexical_feats = speech_interval_feats_pad_[
        :, :, 0, :config.speech_lexical_features_dim]

    feed, sequence_lengths = model.get_feed_dict(words=word_feats, dropout=1.0)
    feed[model.speech_features] = speech_feats
    feed[model.speech_lexical_features] = speech_lexical_feats
    feed[model.speech_lengths] = speech_lengths

    predictions = model.test(feed)

    print("\n")
    print("WORD IMPORTANCE PREDICTION OUTPUT")
    print("=================================")
    for sent_id in range(len(words)):
        scores = predictions[0][:sequence_lengths[sent_id]]
        tokens = words[sent_id]
        result = ["%s (%f)" % (w, s) for w, s in zip(tokens, scores)]
        print("--> " + " ".join(result) + "\n")
Example 5
    def get_feed_dict(self,
                      words,
                      speech=None,
                      labels=None,
                      lr=None,
                      dropout=None):
        """Build a TensorFlow feed dict from padded inputs; returns (feed, sequence_lengths)."""
        feed = {}
        if speech is not None:
            speech_interval_feats = get_features(speech,
                                                 self.config.speech_features)
            speech_interval_feats_pad_, speech_lengths = pad_sequences(
                speech_interval_feats,
                pad_tok=[0] * self.config.speech_features_dim,
                nlevels=2)
            speech_feats = speech_interval_feats_pad_[
                :, :, :, self.config.speech_lexical_features_dim:]
            speech_lexical_feats = speech_interval_feats_pad_[
                :, :, 0, :self.config.speech_lexical_features_dim]

            feed[self.speech_features] = speech_feats
            feed[self.speech_lexical_features] = speech_lexical_feats
            feed[self.speech_lengths] = speech_lengths

        char_ids, word_ids = list(zip(*words))
        word_ids, sequence_lengths = pad_sequences(word_ids, 0)
        char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2)

        feed[self.word_ids] = word_ids
        feed[self.char_ids] = char_ids
        feed[self.sequence_lengths] = sequence_lengths
        feed[self.word_lengths] = word_lengths

        if labels is not None:
            labels, _ = pad_sequences(labels, 0)
            feed[self.labels] = labels

        if lr is not None:
            feed[self.lr] = lr

        if dropout is not None:
            feed[self.dropout] = dropout

        return feed, sequence_lengths
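The nlevels=2 calls pad at two levels: every word to the longest word, then every sentence to the longest sentence. The pad_sequences used here is the project's own helper (it takes pad_tok and nlevels and returns the lengths, unlike the Keras function); the hypothetical pad_2d below sketches what such a two-level pad typically does:

def pad_2d(sequences, pad_tok=0):
    # Hypothetical illustration of a two-level (nlevels=2) pad; the repo's helper may differ.
    max_word_len = max(len(word) for sent in sequences for word in sent)
    max_sent_len = max(len(sent) for sent in sequences)
    padded, word_lengths = [], []
    for sent in sequences:
        rows = [word + [pad_tok] * (max_word_len - len(word)) for word in sent]
        lengths = [len(word) for word in sent]
        rows += [[pad_tok] * max_word_len] * (max_sent_len - len(sent))  # pad missing words
        lengths += [0] * (max_sent_len - len(sent))
        padded.append(rows)
        word_lengths.append(lengths)
    return padded, word_lengths

print(pad_2d([[[1, 2], [3]], [[4, 5, 6]]]))
# ([[[1, 2, 0], [3, 0, 0]], [[4, 5, 6], [0, 0, 0]]], [[2, 1], [3, 0]])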
Example 6
def seq_index(p_sentences):
    """Convert sentences to padded character-index sequences of length 15."""
    word2idx, idx2word = load_char_vocab()

    p_list = []
    for p_sentence in p_sentences:
        # Keep only non-empty tokens that exist in the vocabulary.
        p = [
            word2idx[word.lower()] for word in p_sentence
            if len(word.strip()) > 0 and word.lower() in word2idx
        ]

        p_list.append(p)

    p_list = pad_sequences(p_list, maxlen=15)

    return p_list
Example 7
def main(_):
    # 1. load data (X: list of int, y: int).
    #if os.path.exists(FLAGS.cache_path):  # if a cache file exists, load the vocabulary-indexed data
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1 == 1:  # always true; kept so the cached-data branch above can be re-enabled
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary()
        vocab_size = len(vocabulary_word2index)
        vocabulary_word2index_label, _ = create_voabulary_label()
        train, test, _ = load_data(vocabulary_word2index,
                                   vocabulary_word2index_label,
                                   data_type='train')
        trainX, trainY = train
        testX, testY = test
        print("testX.shape:", np.array(testX).shape)  # 2500个list.每个list代表一句话
        print("testY.shape:", np.array(testY).shape)  # 2500个label
        print("testX[0]:",
              testX[0])  # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        print("testX[1]:", testX[1])
        print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0

        # 2.Data preprocessing
        # Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len,
                               value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                              value=0.)  # padding to max length
        ###############################################################################################
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        ###############################################################################################
    print("testX[0]:", testX[0])
    print("testX[1]:", testX[1])
    #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
    # Converting labels to binary vectors
    print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0
    print("end padding & transform to one hot...")
    #2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                             FLAGS.is_training)
        # Initialize Saver
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, fast_text)

        curr_epoch = sess.run(fast_text.epoch_step)
        #3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch,
                           FLAGS.num_epochs):  #range(start,stop,step_size)
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [
                        fast_text.loss_val, fast_text.accuracy,
                        fast_text.train_op
                    ],
                    feed_dict={
                        fast_text.sentence: trainX[start:end],
                        fast_text.labels: trainY[start:end]
                    })
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY,
                                              batch_size)
                print(
                    "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                    % (epoch, eval_loss, eval_acc))

                #save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(
                    sess, save_path,
                    global_step=fast_text.epoch_step)  #fast_text.epoch_step

        # 5. finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY,
                                      batch_size)
        print("Test Loss:%.3f\tTest Accuracy:%.3f" % (test_loss, test_acc))
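The inner loop above walks through mini-batches with the zip(range, range) idiom. A plain-Python illustration of the (start, end) pairs it produces, and of the fact that a final partial batch is silently dropped:

number_of_training_data, batch_size = 10, 3
for start, end in zip(range(0, number_of_training_data, batch_size),
                      range(batch_size, number_of_training_data, batch_size)):
    print(start, end)
# 0 3
# 3 6
# 6 9   <- the last example (index 9) is never fed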
Example 8
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index,
                             vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for question_id, question_string_list in test:
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                           value=0.)  # padding to max length
    print("end padding...")

    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                             FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5. feed data, to get logits
        number_of_examples = len(testX2)
        print("number_of_examples:", number_of_examples)
        batch_size = 1
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a',
                                            'utf8')
        for start, end in zip(
                range(0, number_of_examples, batch_size),
                range(batch_size, number_of_examples + 1, batch_size)):
            # shape of logits: (1, 1999)
            logits = sess.run(fast_text.logits,
                              feed_dict={fast_text.sentence: testX2[start:end]})
            # 6. get label using logits
            predicted_labels = get_label_using_logits(
                logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to file system.
            write_question_id_with_labels(question_id_list[index],
                                          predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
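Unlike the training loop in Example 7, the second range here runs to number_of_examples + 1, so with batch_size = 1 the final example is included. A quick check of the pairs produced:

n, batch_size = 3, 1
print(list(zip(range(0, n, batch_size), range(batch_size, n + 1, batch_size))))
# [(0, 1), (1, 2), (2, 3)]  <- every example gets its own (start, end) window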