Example #1
def do_rnn(x,y):
    global max_document_length
    print "RNN"
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test=testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True,
              batch_size=10,run_id="webshell",n_epoch=5)

    y_predict_list=model.predict(testX)
    y_predict=[]
    for i in y_predict_list:
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    do_metrics(y_test, y_predict)
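The do_metrics helper called above is not included in this snippet. A minimal sketch of what it could look like, assuming scikit-learn is available and mirroring the metrics that Examples #2 and #8 print inline:

# Hypothetical sketch of the do_metrics helper referenced above (not from the
# original source); it only assumes scikit-learn.
from sklearn import metrics
from sklearn.metrics import classification_report

def do_metrics(y_test, y_predict):
    # Per-class precision/recall/F1 plus the confusion matrix and accuracy.
    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))
    print("accuracy: %.3f" % metrics.accuracy_score(y_test, y_predict))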
Example #2
def do_rnn(trainX, testX, trainY, testY):
    max_document_length=64
    y_test=testY
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=64)
    net = tflearn.lstm(net, 64, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0,tensorboard_dir="dga_log")
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10,run_id="dga",n_epoch=1)

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))
Example #3
def do_cnn(trainX, trainY, testX, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
    network = tflearn.embedding(network, input_dim=n_words+1, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch = 20, shuffle=True, validation_set=(testX, testY), show_metric=True, batch_size=32)
Example #4
def pad_sentences_qr(query, response, q_max_len, r_max_len, index):
	train_query = pad_sequences(query, maxlen=q_max_len, value=index)
	train_response = pad_sequences(response, maxlen=r_max_len, value=index)
	train_query = np.array(train_query)
	train_response = np.array(train_response)
	train_query_response = np.append(train_query, train_response, axis=1)
	return train_query, train_query_response, train_response, q_max_len, r_max_len, index+1
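A short usage sketch for pad_sentences_qr; the word-id lists and lengths below are made up for illustration, and numpy plus tflearn's pad_sequences are assumed to be imported as in the snippet above:

# Hypothetical usage of pad_sentences_qr with toy data.
query = [[3, 7, 12], [5, 9]]        # two tokenized queries as word ids (made-up)
response = [[4, 4, 8, 2], [6]]      # the matching tokenized responses (made-up)

q, qr, r, q_len, r_len, next_index = pad_sentences_qr(query, response,
                                                      q_max_len=5, r_max_len=6, index=0)
print(q.shape, r.shape, qr.shape)   # (2, 5) (2, 6) (2, 11): qr is the query and response side by side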
Example #5
def do_rnn(trainX, testX, trainY, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    print "GET n_words embedding %d" % n_words


    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, MAX_DOCUMENT_LENGTH])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
             batch_size=32,run_id="maidou")
Example #6
def pad_SentencesQR(query, response):
	q_max_len, r_max_len, query_word, response_word, index = fenci(query, response)
	print("query max length:{}, response max length:{}".format(q_max_len, r_max_len))
	train_query = pad_sequences(query_word, maxlen=q_max_len, value=index)
	train_response = pad_sequences(response_word, maxlen=r_max_len, value=index)
	# print train_query[0]
	# print train_response[0]
	train_query = np.array(train_query)
	train_response = np.array(train_response)
	train_query_response = np.append(train_query, train_response, axis=1)
	return train_query, train_query_response, train_response, q_max_len, r_max_len, index
Example #7
def do_cnn(x,y):
    global max_document_length
    print "CNN and tf"
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test=testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None,max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    model = tflearn.DNN(network, tensorboard_verbose=0)
    #if not os.path.exists(pkl_file):
        # Training
    model.fit(trainX, trainY,
                  n_epoch=5, shuffle=True, validation_set=0.1,
                  show_metric=True, batch_size=100,run_id="webshell")
    #    model.save(pkl_file)
    #else:
    #    model.load(pkl_file)

    y_predict_list=model.predict(testX)
    #y_predict = list(model.predict(testX,as_iterable=True))

    y_predict=[]
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)
    print('y_predict_list:')
    print(y_predict_list)
    print('y_predict:')
    print(y_predict)
    #print  y_test

    do_metrics(y_test, y_predict)
Example #8
def do_rnn(trainX, testX, trainY, testY):
    global max_sequences_len
    global max_sys_call
    # Data preprocessing
    # Sequence padding

    trainX = pad_sequences(trainX, maxlen=max_sequences_len, value=0.)
    testX = pad_sequences(testX, maxlen=max_sequences_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY_old=testY
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    print "GET max_sequences_len embedding %d" % max_sequences_len
    print "GET max_sys_call embedding %d" % max_sys_call

    net = tflearn.input_data([None, max_sequences_len])
    net = tflearn.embedding(net, input_dim=max_sys_call+1, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.3)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
             batch_size=32,run_id="maidou")

    y_predict_list = model.predict(testX)
    #print y_predict_list

    y_predict = []
    for i in y_predict_list:
        #print  i[0]
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    #y_predict=to_categorical(y_predict, nb_classes=2)

    print(classification_report(testY_old, y_predict))
    print(metrics.confusion_matrix(testY_old, y_predict))
Example #9
def bi_lstm(trainX, trainY,testX, testY):
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data(shape=[None, 200])
    net = tflearn.embedding(net, input_dim=20000, output_dim=128)
    net = tflearn.bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=64,run_id="rnn-bilstm")
Example #10
def lstm(trainX, trainY,testX, testY):
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=32,run_id="rnn-lstm")
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="transformer_classification")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("transformer_classification.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="transformer_classification")
    questionid_question_lists=load_final_test_data(FLAGS.predict_source_file)
    print("list of total questions:",len(questionid_question_lists))
    test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists)
    print("list of total questions2:",len(test))
    testX=[]
    question_id_list=[]
    for tuple in test:
        question_id,question_string_list=tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("list of total questions3:", len(testX2))
    print("end padding...")
    # 3.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model=Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                 vocab_size, FLAGS.embed_size,FLAGS.d_model,FLAGS.d_k,FLAGS.d_v,FLAGS.h,FLAGS.num_layer,FLAGS.is_training,l2_lambda=FLAGS.l2_lambda)
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data)
        index=0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)):
            logits=sess.run(model.logits,feed_dict={model.input_x:testX2[start:end],model.dropout_keep_prob:1}) #logits:[batch_size,self.num_classes]

            question_id_sublist=question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f)

            # 6. get label using logits
            #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label)
            #print(index," ;predicted_labels:",predicted_labels)
            # 7. write question id and labels to file system.
            #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f)
            index=index+1
        predict_target_file_f.close()
Example #12
def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):

    # IMDB Dataset loading
    train, test, _ = imdb.load_data(
        path=file_path,
        n_words=vocab_size,
        valid_portion=val_fraction,
        sort_by_len=False)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    train_dataset = DataSet(trainX, trainY)

    return train_dataset
Example #13
def do_rnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print "RNN and wordbag"

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10,run_id="review",n_epoch=5)
Example #14
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="dynamic_memory_network")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="dynamic_memory_network")
    questionid_question_lists=load_final_test_data(FLAGS.predict_source_file)
    test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists)
    testX=[]
    question_id_list=[]
    for tuple in test:
        question_id,question_string_list=tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                     FLAGS.story_length,vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training,num_pass=FLAGS.num_pass,
                                     use_gated_gru=FLAGS.use_gated_gru,decode_with_sequences=FLAGS.decode_with_sequences,multi_label_flag=FLAGS.multi_label_flag,l2_lambda=FLAGS.l2_lambda)
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint of EntityNet.")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data)
        index=0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)):
            logits=sess.run(model.logits,feed_dict={model.query:testX2[start:end],model.story: np.expand_dims(testX2[start:end],axis=1),
                                                    model.dropout_keep_prob:1.0}) #'shape of logits:', ( 1, 1999)
            # 6. get label using logits
            #predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label)
            # 7. write question id and labels to file system.
            #write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f)
            question_id_sublist=question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f)

            index=index+1
        predict_target_file_f.close()
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    print("vocab_size:",vocab_size)
    #iii=0
    #iii/0
    vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label()
    questionid_question_lists=load_final_test_data(FLAGS.predict_source_file) #TODO
    test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists) #TODO
    testX=[]
    question_id_list=[]
    for tuple in test:
        question_id,question_string_list=tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)

    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding...")

    # 3.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training)
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data)
        batch_size=1
        index=0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data+1, batch_size)):
            logits=sess.run(fast_text.logits,feed_dict={fast_text.sentence:testX2[start:end]}) #'shape of logits:', ( 1, 1999)
            # 6. get label using logits
            predicted_labels=get_label_using_logits(logits[0],vocabulary_index2word_label)
            # 7. write question id and labels to file system.
            write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f)
            index=index+1
        predict_target_file_f.close()
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="seq2seq_attention")  # simple='simple'
    vocab_size = len(vocabulary_word2index)
    print("seq2seq_attention.vocab_size:", vocab_size)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="seq2seq_attention",use_seq2seq=True)
    questionid_question_lists=load_final_test_data(FLAGS.predict_source_file)
    test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists)
    testX=[]
    question_id_list=[]
    for tuple in test:
        question_id,question_string_list=tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        model=seq2seq_attention_model(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                      vocab_size, FLAGS.embed_size,FLAGS.hidden_size, FLAGS.is_training,decoder_sent_length=FLAGS.decoder_sent_length,l2_lambda=FLAGS.l2_lambda)
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data)
        index=0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        decoder_input=np.reshape(np.array([vocabulary_word2index_label[_GO]]+[vocabulary_word2index_label[_PAD]]*(FLAGS.decoder_sent_length-1)),[-1,FLAGS.decoder_sent_length])
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)):
            predictions,logits=sess.run([model.predictions,model.logits],feed_dict={model.input_x:testX2[start:end],model.decoder_input:decoder_input,model.dropout_keep_prob:1}) #'shape of logits:', ( 1, 1999)
            # 6. get label using logits
            predicted_labels=get_label_using_logits(logits[0],predictions,vocabulary_index2word_label,vocabulary_word2index_label)
            # 7. write question id and labels to file system.
            write_question_id_with_labels(question_id_list[index],predicted_labels,predict_target_file_f)
            index=index+1
        predict_target_file_f.close()
Example #17
 def create_training_data(self):
     X = []
     y = []
     for k, v in self._sentences.items():
         for sentence in v:
             word_ids = np.zeros(self._max_document_size, np.int64)
             for idx, token in enumerate(sentence):
                 if idx >= self._max_document_size:
                     break
                 word_id = self._word_index.get(token)
                 if word_id is None:
                     word_ids[idx] = 0
                 else:
                     word_ids[idx] = word_id
             X.append(word_ids)
             labels = self._labels_to_nums(k)
             y.append(labels)
     X = pad_sequences(X, maxlen=self._max_document_size, value=0.)
     y = [np.array(label) for label in y]
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=self._test_split, random_state=42)
     return X_train, X_test, y_train, y_test
Example #18
def test():
    v2i, _ = build_vocab()
    _, i2l = build_label()
    origin_questions = ['今天 天气 不错', '介绍 贵金属 产品']
    questions = [q.split() for q in origin_questions]
    questions = [[v2i[vocab] for vocab in ques if vocab in v2i] for ques in questions]

    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(checkpoint_path + model_name)
        saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))

        model = tf.get_default_graph()
        x = model.get_tensor_by_name("x:0")
        predict = model.get_tensor_by_name("predictions:0")

        questions = pad_sequences(questions, maxlen=x.shape[1], value=0)
        feed_dict = {x: questions}

        p = sess.run([predict], feed_dict=feed_dict)
        p = p[0].tolist()
    for index in range(len(questions)):
        print(f'{origin_questions[index]} is_business: {i2l[p[index]]}')
Example #19
def predictThis(model, sentence):
    ignore_words = ['?']
    pattern_words = nltk.word_tokenize(sentence)
    # stem each word
    pattern_words = [
        stemmer.stem(word.lower()) for word in pattern_words
        if word not in ignore_words
    ]
    encoded_sentence = []
    for w in pattern_words:
        if w in words:
            #print(w, ' : ', words.index(w))
            encoded_sentence.append(words.index(w))

    #print(encoded_sentence)
    samples = []
    samples.append(encoded_sentence)

    samples = pad_sequences(samples, maxlen=netprops.x_width, value=0.)
    preds = model.predict(samples)
    index, value = max(enumerate(preds[0]), key=operator.itemgetter(1))
    print(sentence, ' : ', classes[index], ' : ', (value * 100), '%')
def get_data():
    black_x, cdn_x, white_x = get_local_data()
    black_y, cdn_y, white_y = [LABEL.black] * len(black_x), [
        LABEL.cdn
    ] * len(cdn_x), [LABEL.white] * len(white_x)

    X = black_x + cdn_x + white_x
    labels = black_y + cdn_y + white_y

    # Generate a dictionary of valid characters
    valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    print "max_features:", max_features
    maxlen = np.max([len(x) for x in X])
    print "max_len:", maxlen
    maxlen = min(maxlen, 256)

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = pad_sequences(X, maxlen=maxlen, value=0.)

    # Convert labels to 0-1
    Y = to_categorical(labels, nb_classes=3)

    volcab_file = "volcab.pkl"
    output = open(volcab_file, 'wb')
    # Pickle dictionary using protocol 0.
    data = {
        "valid_chars": valid_chars,
        "max_len": maxlen,
        "volcab_size": max_features
    }
    pickle.dump(data, output)
    output.close()

    return X, Y, maxlen, max_features
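A hedged sketch (not part of the original source) of how the values returned by get_data() could feed a small tflearn LSTM over the three labels, following the same layer pattern as the other examples; the layer sizes and epoch count are assumptions:

import tflearn

X, Y, maxlen, max_features = get_data()

# Character-level LSTM over the padded domain strings; 3 output classes (black / cdn / white).
net = tflearn.input_data([None, maxlen])
net = tflearn.embedding(net, input_dim=max_features, output_dim=64)
net = tflearn.lstm(net, 64, dropout=0.5)
net = tflearn.fully_connected(net, 3, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X, Y, validation_set=0.1, show_metric=True, batch_size=32, n_epoch=1)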
Example #21
def train(trainX, trainY, model_file):
    # Data preprocessing
    trainX = pad_sequences(trainX, maxlen=charvec_len, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)

    net = bi_LSTM()

    # Training
    '''
    tensorboard_verbose:
    0: Loss, Accuracy (Best Speed)
    1: Loss, Accuracy + Gradients
    2: Loss, Accuracy, Gradients, Weights
    3: Loss, Accuracy, Gradients, Weights, Activations, Sparsity (Best Visualization)
    '''
    model = tflearn.DNN(net,
                        clip_gradients=0.,
                        tensorboard_verbose=0,
                        checkpoint_path='./chkpoint/',
                        best_checkpoint_path='./best_chkpoint/',
                        best_val_accuracy=0.9)

    # show_metric: If True, accuracy will be calculated and displayed
    #              at every step. Might give slower training.
    model.fit(trainX,
              trainY,
              validation_set=0.1,
              show_metric=False,
              batch_size=128,
              n_epoch=1,
              run_id='bLSTM_i{}_{}k_d{}_o{}_d{}_adam_l{}_b{}'.format(
                  charvec_len, in_dim // 1000, int(drop1 * 10), nn_dim,
                  int(drop2 * 10),
                  str(lrate).split('.')[1], nn_dim))

    # Save model
    model.save(model_file)
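The bi_LSTM() builder used above is not shown in this example. A hypothetical sketch, modeled on the bidirectional network in Examples #9 and #29, assuming charvec_len, in_dim, nn_dim, drop1, drop2 and lrate are module-level settings defined elsewhere:

# Hypothetical sketch of the bi_LSTM() network builder (not the original definition).
import tflearn
from tflearn.layers.recurrent import BasicLSTMCell

def bi_LSTM():
    net = tflearn.input_data(shape=[None, charvec_len])
    net = tflearn.embedding(net, input_dim=in_dim, output_dim=nn_dim)
    net = tflearn.dropout(net, drop1)
    net = tflearn.bidirectional_rnn(net, BasicLSTMCell(nn_dim), BasicLSTMCell(nn_dim))
    net = tflearn.dropout(net, drop2)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=lrate,
                             loss='categorical_crossentropy')
    return net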
Example #22
def Example_P(article, entity, vocab, hps):

    # get ids of special tokens
    pad_id = vocab.word2id(PAD_TOKEN)
    """process the article"""
    # create vocab and word 2 id
    article_value = value2ids(article, vocab, hps.document_length)
    # word 2 id
    article_words = article2ids(article, vocab)
    # num sentence
    article_len = len(article)
    # word level padding
    article_words = pad_sequences(article_words,
                                  maxlen=hps.sequence_length,
                                  value=pad_id)
    # sentence level padding
    pad_article = np.expand_dims(np.zeros(hps.sequence_length, dtype=np.int32),
                                 axis=0)
    if article_words.shape[0] > hps.max_num_sequence:
        article_words = article_words[:hps.max_num_sequence]
    while article_words.shape[0] < hps.max_num_sequence:
        article_words = np.concatenate((article_words, pad_article))

    return article_value, article_words, article_len
Example #23
def train(trainX, trainY, model_file):
    print('# Data preprocessing')
    trainX = pad_sequences(trainX, maxlen=440, value=0.)
    trainY = to_categorical(trainY, nb_classes=2)

    print('build network')
    net = bi_LSTM()

    print('# Training')
    '''
    tensorboard_verbose:
    0: Loss, Accuracy (Best Speed)
    1: Loss, Accuracy + Gradients
    2: Loss, Accuracy, Gradients, Weights
    3: Loss, Accuracy, Gradients, Weights, Activations, Sparsity (Best Visualization)
    '''
    model = tflearn.DNN(net,
                        clip_gradients=0.,
                        tensorboard_verbose=0,
                        checkpoint_path='./chkpoint_mdm001/',
                        best_checkpoint_path='./best_chkpoint_mdm001/',
                        best_val_accuracy=0.9)
    print('tfl.DNN end.')

    model.fit(trainX,
              trainY,
              validation_set=0.1,
              show_metric=True,
              batch_size=128,
              n_epoch=4,
              run_id='bilstm_170519b')
    print('model.fit end.')

    # Save model
    model.save(model_file)
    print('model save end.')
Example #24
def load_data_multilabel(traning_data_path,
                         vocab_word2index,
                         vocab_label2index,
                         sentence_len,
                         training_portion=0.95):
    """
    convert data as indexes using word2index dicts.
    :param traning_data_path:
    :param vocab_word2index:
    :param vocab_label2index:
    :return:
    """
    file_object = codecs.open(traning_data_path, mode='r', encoding='utf-8')
    lines = file_object.readlines()
    random.shuffle(lines)
    label_size = len(vocab_label2index)
    X = []
    Y = []
    for i, line in enumerate(lines):
        raw_list = line.strip().split("__label__")
        input_list = raw_list[0].strip().split(" ")
        x = [vocab_word2index.get(x, UNK_ID) for x in input_list if x != '']
        label_list = raw_list[1:]
        label_list = [vocab_label2index[label] for label in label_list]
        y = transform_multilabel_as_multihot(label_list, label_size)
        X.append(x)
        Y.append(y)
    X = pad_sequences(X, maxlen=sentence_len,
                      value=0.)  # padding to max length
    number_examples = len(lines)
    training_number = int(training_portion * number_examples)
    train = (X[0:training_number], Y[0:training_number])
    valid_number = min(1000, number_examples - training_number)
    test = (X[training_number + 1:training_number + valid_number + 1],
            Y[training_number + 1:training_number + valid_number + 1])
    return train, test
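A hedged usage sketch for load_data_multilabel; the file path and vocabularies are hypothetical, and UNK_ID plus transform_multilabel_as_multihot are assumed to be defined alongside the original function:

# Hypothetical call with toy vocabularies; lines in train.txt would look like
# "hello world __label__greeting".
vocab_word2index = {"hello": 1, "world": 2}
vocab_label2index = {"greeting": 0, "other": 1}

train, test = load_data_multilabel("train.txt", vocab_word2index, vocab_label2index,
                                   sentence_len=10, training_portion=0.9)
trainX, trainY = train
print(len(trainX), len(trainY))  # number of training examples and multi-hot label rows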
def machine_learning(comments):
    # Split into the sample set (X) and the label set (y)
    X = comments["content"]
    y = comments["quality"]

    # Split into training and test sets
    random_state = 42
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state)

    # Convert the sample strings into numeric sequences: build the vocab and map X to X_word_ids
    vect = CountVectorizer(ngram_range=(1, 1), token_pattern=r'\b\w{1,}\b')
    vect.fit(X_train)
    vocab = vect.vocabulary_

    def convert_X_to_X_word_ids(X):
        return X.apply(lambda x: [
            vocab[w] for w in [w.lower().strip() for w in x.split()]
            if w in vocab
        ])

    X_train_word_ids = convert_X_to_X_word_ids(X_train)
    X_test_word_ids = convert_X_to_X_word_ids(X_test)

    # Sequence padding
    X_test_padded_seqs = pad_sequences(X_test_word_ids, maxlen=20, value=0)
    X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=20, value=0)

    # Process the label set
    unique_y_labels = list(y_train.value_counts().index)
    le = preprocessing.LabelEncoder()
    le.fit(unique_y_labels)

    y_train = to_categorical(y_train.map(lambda x: le.transform([x])[0]),
                             nb_classes=len(unique_y_labels))
    y_test = to_categorical(y_test.map(lambda x: le.transform([x])[0]),
                            nb_classes=len(unique_y_labels))

    # Build the network
    n_epoch = 10
    size_of_each_vector = X_train_padded_seqs.shape[1]
    vocab_size = len(vocab)
    no_of_unique_y_labels = len(unique_y_labels)

    net = tflearn.input_data([None, size_of_each_vector])
    net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.6)
    net = tflearn.fully_connected(net,
                                  no_of_unique_y_labels,
                                  activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=1e-4,
                             loss='categorical_crossentropy')

    # Train the network

    # Initialize
    model = tflearn.DNN(net,
                        tensorboard_verbose=0,
                        tensorboard_dir="./tflearn_data/tflearn_logs/")

    # Train
    model.fit(X_train_padded_seqs,
              y_train,
              validation_set=(X_test_padded_seqs, y_test),
              n_epoch=n_epoch,
              show_metric=True,
              batch_size=100)

    # Save
    time = datetime.now()
    time_str = str(time).replace(":", ".")
    os.makedirs(
        f"./tflearn_data/tflearn_models/{time_str}({n_epoch}, {random_state})")
    model.save(
        f"./tflearn_data/tflearn_models/{time_str}({n_epoch}, {random_state})/model"
    )
Example #26
def comment_predict(data):
    """
	根据已有模型对评论倾向进行预测
	:return:
	"""
    # 建立模型时用到的评论数据
    predict_data = pd.read_csv("/var/www/test/python/comments_tag.csv")

    def chinese_word_cut(text):
        """
		使用结巴分词对中文进行切分转化为独立的词语
		:param text: 完整的评论
		:return: 切分后的评论
		"""
        return " ".join(jieba.cut(text))

    # Segment the comments and store the result in a new column
    predict_data["cut_comment"] = predict_data.comment.apply(chinese_word_cut)

    # Select the comment text (X) and the labels (y)
    X = predict_data["cut_comment"]
    y = predict_data["evaluation"]

    # Split the data into a training set and a test set
    # The random seed must match the one used when the model was built
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)

    def get_custom_stopwords(stop_words_file):
        """
		Load the stop-word list.
		:param stop_words_file:
		:return: list of stop words
		"""
        with open(stop_words_file, encoding="utf-8") as f:
            stopwords = f.read()

        stopwords_list = stopwords.split("\n")
        custom_stopwords_list = [i for i in stopwords_list]
        return custom_stopwords_list

    # Load the stop-word list
    stop_words_file = "/var/www/test/python/哈工大停用词表.txt"
    stopwords = get_custom_stopwords(stop_words_file)

    # Compute the feature values
    vect = CountVectorizer(max_df=0.8,
                           min_df=3,
                           token_pattern=u'(?u)\\b\\w+\\b',
                           stop_words=frozenset(stopwords))
    vect.fit(X_train)
    vocab = vect.vocabulary_

    def convert_X_to_X_word_ids(X):
        """
		将评论(文字部分)转化为id集(数值序列)
		:param X:评论集合
		:return:数值序列
		"""
        return X.apply(lambda x: [
            vocab[w] for w in [w.lower().strip() for w in x.split()]
            if w in vocab
        ])

    # Pad every sequence to length 20 so the comment sequences share the same shape, filling short ones with 0
    X_train_word_ids = convert_X_to_X_word_ids(X_train)
    X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=20, value=0)

    # Process the label set
    unique_y_labels = list(y_train.value_counts().index)
    le = preprocessing.LabelEncoder()
    le.fit(unique_y_labels)

    # Build the network
    size_of_each_vector = X_train_padded_seqs.shape[1]
    vocab_size = len(vocab)
    no_of_unique_y_labels = len(unique_y_labels)

    net = tflearn.input_data([None, size_of_each_vector])
    net = tflearn.embedding(net, input_dim=vocab_size, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.6)
    net = tflearn.fully_connected(net,
                                  no_of_unique_y_labels,
                                  activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=1e-4,
                             loss='categorical_crossentropy')

    # Initialize
    model = tflearn.DNN(net, tensorboard_verbose=0)

    # Load the model
    model.load(
        "/var/www/test/python/2019-07-10 20.03.06.175272(1000, 42)/model")

    # ----------------------------------- Prediction -----------------------------------
    # Comment data to be predicted
    predict_data = data

    # Segment the comment data
    predict_data["cut_comment"] = predict_data.comment.apply(chinese_word_cut)

    # Set up the prediction set
    predict_X = predict_data["cut_comment"]

    # Convert to numeric sequences
    predict_X_word_ids = convert_X_to_X_word_ids(predict_X)
    predict_X_padded_seqs = pad_sequences(predict_X_word_ids,
                                          maxlen=20,
                                          value=0)

    # Run the prediction and get the results
    predict_Y = model.predict(predict_X_padded_seqs)

    # Output the results
    get_evaluation(predict_Y)
def main(_):
    #1.load data(X:list of lint,y:int).
    #if os.path.exists(FLAGS.cache_path):  # if it exists in the file system, load the stories (vocabulary-indexed)
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1==1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary()
        vocab_size = len(vocabulary_word2index)
        vocabulary_word2index_label,_ = create_voabulary_label()
        train, test, _ = load_data(vocabulary_word2index, vocabulary_word2index_label,data_type='train')
        trainX, trainY = train
        testX, testY = test
        print("testX.shape:", np.array(testX).shape)  # 2500个list.每个list代表一句话
        print("testY.shape:", np.array(testY).shape)  # 2500个label
        print("testX[0]:", testX[0])  # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        print("testX[1]:", testX[1]);
        print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0

        # 2.Data preprocessing
        # Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
        ###############################################################################################
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        ###############################################################################################
    print("testX[0]:", testX[0]) ;print("testX[1]:", testX[1]); #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
    # Converting labels to binary vectors
    print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0
    print("end padding & transform to one hot...")
    #2.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training)
        #Initialize Save
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding: #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, fast_text)

        curr_epoch=sess.run(fast_text.epoch_step)
        #3.feed data & training
        number_of_training_data=len(trainX)
        batch_size=FLAGS.batch_size
        for epoch in range(curr_epoch,FLAGS.num_epochs):#range(start,stop,step_size)
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)):
                if epoch==0 and counter==0:
                    print("trainX[start:end]:",trainX[start:end])
                    print("trainY[start:end]:",trainY[start:end])
                curr_loss,curr_acc,_=sess.run([fast_text.loss_val,fast_text.accuracy,fast_text.train_op],feed_dict={fast_text.sentence:trainX[start:end],fast_text.labels:trainY[start:end]})
                loss,acc,counter=loss+curr_loss,acc+curr_acc,counter+1
                if counter %500==0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" %(epoch,counter,loss/float(counter),acc/float(counter)))

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4.validation
            print(epoch,FLAGS.validate_every,(epoch % FLAGS.validate_every==0))
            if epoch % FLAGS.validate_every==0:
                eval_loss, eval_acc=do_eval(sess,fast_text,testX,testY,batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch,eval_loss,eval_acc))

                #save model to checkpoint
                save_path=FLAGS.ckpt_dir+"model.ckpt"
                saver.save(sess,save_path,global_step=fast_text.epoch_step) #fast_text.epoch_step

        # 5. Finally, evaluate on the test set and report the test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY, batch_size)
    pass
def test_pad():
    trainX='w18476 w4454 w1674 w6 w25 w474 w1333 w1467 w863 w6 w4430 w11 w813 w4463 w863 w6 w4430 w111'
    trainX=trainX.split(" ")
    trainX = pad_sequences([trainX], maxlen=100, value=0.)  # pad the single tokenized sentence
    print("trainX:",trainX)
Example #29
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = input_data(shape=[None, 200])
net = embedding(net, input_dim=20000, output_dim=128)
net = bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
net = dropout(net, 0.5)
net = fully_connected(net, 2, activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
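This snippet builds the bidirectional LSTM but stops before training. A hedged completion, mirroring the fit calls used in Example #9; the epoch count and run_id are assumptions:

# Not part of the original snippet: train and evaluate the model defined above.
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=64, n_epoch=2, run_id="imdb-bilstm")
print("test accuracy:", model.evaluate(testX, testY))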
Example #30
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        simple='simple',
        word2vec_model_path=FLAGS.word2vec_model_path,
        name_scope="cnn2")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        name_scope="cnn2")
    questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
    test = load_data_predict(vocabulary_word2index,
                             vocabulary_word2index_label,
                             questionid_question_lists)
    testX = []
    question_id_list = []
    for tuple in test:
        question_id, question_string_list = tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                           value=0.)  # padding to max length
    print("end padding...")
    # 3.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # 4.Instantiate Model
        textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes,
                          FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,
                          FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                          FLAGS.is_training)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print("Can't find the checkpoint.going to stop")
            return
        # 5.feed data, to get logits
        number_of_training_data = len(testX2)
        print("number_of_training_data:", number_of_training_data)
        index = 0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a',
                                            'utf8')
        for start, end in zip(
                range(0, number_of_training_data, FLAGS.batch_size),
                range(FLAGS.batch_size, number_of_training_data + 1,
                      FLAGS.batch_size)):
            logits = sess.run(textCNN.logits,
                              feed_dict={
                                  textCNN.input_x: testX2[start:end],
                                  textCNN.dropout_keep_prob: 1
                              })  #'shape of logits:', ( 1, 1999)
            # 6. get label using logits
            predicted_labels = get_label_using_logits(
                logits[0], vocabulary_index2word_label)
            # 7. write question id and labels to file system.
            write_question_id_with_labels(question_id_list[index],
                                          predicted_labels,
                                          predict_target_file_f)
            index = index + 1
        predict_target_file_f.close()
Example #31
def main(_):
    #os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if FLAGS.dataset == "bibsonomy-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_bib
        traning_data_path = FLAGS.training_data_path_bib
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.59

    elif FLAGS.dataset == "zhihu-sample":
        word2vec_model_path = FLAGS.word2vec_model_path_zhihu
        traning_data_path = FLAGS.training_data_path_zhihu
        FLAGS.sequence_length = 100
        FLAGS.ave_labels_per_doc = 2.45

    elif FLAGS.dataset == "citeulike-a-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cua
        traning_data_path = FLAGS.training_data_path_cua
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 11.6

    elif FLAGS.dataset == "citeulike-t-clean":
        word2vec_model_path = FLAGS.word2vec_model_path_cut
        traning_data_path = FLAGS.training_data_path_cut
        FLAGS.sequence_length = 300
        FLAGS.ave_labels_per_doc = 7.68

    # 1. create trainlist, validlist and testlist
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_model_path,
        name_scope=FLAGS.dataset + "-lda")  #simple='simple'
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        voabulary_label=traning_data_path, name_scope=FLAGS.dataset + "-lda")
    num_classes = len(vocabulary_word2index_label)
    print(vocabulary_index2word_label[0], vocabulary_index2word_label[1])

    vocab_size = len(vocabulary_word2index)
    print("vocab_size:", vocab_size)

    # choosing whether to use k-fold cross-validation or hold-out validation
    if FLAGS.kfold == -1:  # hold-out
        train, valid, test = load_data_multilabel_new(
            vocabulary_word2index,
            vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            valid_portion=FLAGS.valid_portion,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here train, test are tuples; turn train into trainlist.
        trainlist, validlist, testlist = list(), list(), list()
        trainlist.append(train)
        validlist.append(valid)
        testlist.append(test)
    else:  # k-fold
        trainlist, validlist, testlist = load_data_multilabel_new_k_fold(
            vocabulary_word2index,
            vocabulary_word2index_label,
            keep_label_percent=FLAGS.keep_label_percent,
            kfold=FLAGS.kfold,
            test_portion=FLAGS.test_portion,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=traning_data_path)
        # here trainlist, testlist are list of tuples.
    # get and pad testing data: there is only one testing data, but kfold training and validation data
    assert len(testlist) == 1
    testX, testY = testlist[0]
    testX = pad_sequences(testX, maxlen=FLAGS.sequence_length,
                          value=0.)  # padding to max length

    # 3. transform trainlist to the format. x_train, x_test: training and test feature matrices of size (n_samples, n_features)
    #print(len(trainlist))
    #trainX,trainY = trainlist[0]
    #trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
    #print(len(trainX))
    #print(len(trainX[0]))
    #print(trainX[0])
    #print(len(trainY))
    #print(len(trainY[0]))
    #print(trainY[0])
    #print(np.asarray(trainY).shape)

    num_runs = len(trainlist)
    #validation results variables
    valid_acc_th, valid_prec_th, valid_rec_th, valid_fmeasure_th, valid_hamming_loss_th = [
        0
    ] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [
        0
    ] * num_runs  # initialise the result lists
    final_valid_acc_th, final_valid_prec_th, final_valid_rec_th, final_valid_fmeasure_th, final_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_valid_acc_th, min_valid_prec_th, min_valid_rec_th, min_valid_fmeasure_th, min_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_valid_acc_th, max_valid_prec_th, max_valid_rec_th, max_valid_fmeasure_th, max_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_valid_acc_th, std_valid_prec_th, std_valid_rec_th, std_valid_fmeasure_th, std_valid_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    #testing results variables
    test_acc_th, test_prec_th, test_rec_th, test_fmeasure_th, test_hamming_loss_th = [
        0
    ] * num_runs, [0] * num_runs, [0] * num_runs, [0] * num_runs, [
        0
    ] * num_runs  # initialise the testing result lists
    final_test_acc_th, final_test_prec_th, final_test_rec_th, final_test_fmeasure_th, final_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    min_test_acc_th, min_test_prec_th, min_test_rec_th, min_test_fmeasure_th, min_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    max_test_acc_th, max_test_prec_th, max_test_rec_th, max_test_fmeasure_th, max_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    std_test_acc_th, std_test_prec_th, std_test_rec_th, std_test_fmeasure_th, std_test_hamming_loss_th = 0.0, 0.0, 0.0, 0.0, 0.0
    #output variables
    output_valid = ""
    output_test = ""
    output_csv_valid = "fold,hamming_loss,acc,prec,rec,f1"
    output_csv_test = "fold,hamming_loss,acc,prec,rec,f1"

    time_train = [0] * num_runs  # get time spent in training
    num_run = 0

    mallet_path = FLAGS.mallet_path
    num_topics = FLAGS.num_topics
    alpha = 50 / num_topics
    iterations = FLAGS.iterations
    k_num_doc = FLAGS.k_num_doc

    remove_pad_id = True
    remove_dot = True
    docs_test = generateLDAdocFromIndex(testX,
                                        vocabulary_index2word,
                                        remove_pad_id=remove_pad_id,
                                        remove_dot=remove_dot)

    for trainfold in trainlist:
        # get training and validation data
        trainX, trainY = trainfold
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        # generate training data for gensim MALLET wrapper for LDA
        docs = generateLDAdocFromIndex(trainX,
                                       vocabulary_index2word,
                                       remove_pad_id=remove_pad_id,
                                       remove_dot=remove_dot)
        #print(docs[10])
        id2word = corpora.Dictionary(docs)
        corpus = [id2word.doc2bow(text) for text in docs]
        #print(corpus[10])
        # generate validation data for gensim MALLET wrapper for LDA
        validX, validY = validlist[num_run]
        validX = pad_sequences(validX, maxlen=FLAGS.sequence_length, value=0.)
        docs_valid = generateLDAdocFromIndex(validX,
                                             vocabulary_index2word,
                                             remove_pad_id=remove_pad_id,
                                             remove_dot=remove_dot)
        corpus_valid = [id2word.doc2bow(text) for text in docs_valid]
        # generate testing data for gensim MALLET wrapper for LDA
        corpus_test = [id2word.doc2bow(text) for text in docs_test]

        # training
        start_time_train = time.time()
        print('start training fold', str(num_run))

        model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=corpus,
                                                 num_topics=num_topics,
                                                 alpha=alpha,
                                                 id2word=id2word,
                                                 iterations=iterations)
        pprint(model.show_topics(formatted=False))

        print('num_run', str(num_run), 'train done.')

        time_train[num_run] = time.time() - start_time_train
        print("--- training of fold %s took %s seconds ---" %
              (num_run, time_train[num_run]))

        # represent each document as a topic vector
        #mat_train = np.array(model[corpus]) # this will cause an Error with large num_topics, e.g. 1000 or higher.
        #Thus, we turn the MALLET LDA model to a native Gensim LDA model
        model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
        mat_train = np.array(
            model.get_document_topics(corpus, minimum_probability=0.0))
        #print(len(model[corpus[0]]))
        #print(len(model[corpus[1]]))
        #print(len(model[corpus[2]]))
        #print(mat_train.shape)
        mat_train = mat_train[:, :, 1]  # documents in the training set as a matrix of topic probabilities

        # evaluate on training data
        #if num_run == 0 and FLAGS.kfold != -1: # do this only for the first fold in k-fold cross-validation to save time
        #    acc, prec, rec, f_measure, hamming_loss = do_eval_lda(model, k_num_doc, mat_train, trainY, corpus, trainY, vocabulary_index2word_label, hamming_q=FLAGS.ave_labels_per_doc)
        #    print('training:', acc, prec, rec, f_measure, hamming_loss)

        # validation
        valid_acc_th[num_run], valid_prec_th[num_run], valid_rec_th[
            num_run], valid_fmeasure_th[num_run], valid_hamming_loss_th[
                num_run] = do_eval_lda(model,
                                       k_num_doc,
                                       mat_train,
                                       trainY,
                                       corpus_valid,
                                       validY,
                                       vocabulary_index2word_label,
                                       hamming_q=FLAGS.ave_labels_per_doc)
        print(
            "LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f"
            % (num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run],
               valid_prec_th[num_run], valid_rec_th[num_run],
               valid_fmeasure_th[num_run]))
        output_valid = output_valid + "\n" + "LDA==>Run %d Validation Accuracy: %.3f\tValidation Hamming Loss: %.3f\tValidation Precision: %.3f\tValidation Recall: %.3f\tValidation F-measure: %.3f" % (
            num_run, valid_acc_th[num_run], valid_hamming_loss_th[num_run],
            valid_prec_th[num_run], valid_rec_th[num_run], valid_fmeasure_th[
                num_run]) + "\n"  # also output the results of each run.
        output_csv_valid += "\n" + ",".join(
            str(v) for v in (num_run, valid_hamming_loss_th[num_run],
                             valid_acc_th[num_run], valid_prec_th[num_run],
                             valid_rec_th[num_run], valid_fmeasure_th[num_run]))

        start_time_test = time.time()
        # evaluate on testing data
        test_acc_th[num_run], test_prec_th[num_run], test_rec_th[
            num_run], test_fmeasure_th[num_run], test_hamming_loss_th[
                num_run] = do_eval_lda(model,
                                       k_num_doc,
                                       mat_train,
                                       trainY,
                                       corpus_test,
                                       testY,
                                       vocabulary_index2word_label,
                                       hamming_q=FLAGS.ave_labels_per_doc)
        print(
            "LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f"
            % (num_run, test_acc_th[num_run], test_hamming_loss_th[num_run],
               test_prec_th[num_run], test_rec_th[num_run],
               test_fmeasure_th[num_run]))
        output_test = output_test + "\n" + "LDA==>Run %d Test Accuracy: %.3f\tTest Hamming Loss: %.3f\tTest Precision: %.3f\tTest Recall: %.3f\tTest F-measure: %.3f" % (
            num_run, test_acc_th[num_run], test_hamming_loss_th[num_run],
            test_prec_th[num_run], test_rec_th[num_run], test_fmeasure_th[
                num_run]) + "\n"  # also output the results of each run.
        output_csv_test += "\n" + ",".join(
            str(v) for v in (num_run, test_hamming_loss_th[num_run],
                             test_acc_th[num_run], test_prec_th[num_run],
                             test_rec_th[num_run], test_fmeasure_th[num_run]))

        print("--- testing of fold %s took %s seconds ---" %
              (num_run, time.time() - start_time_test))

        prediction_str = ""
        # output final predictions for qualitative analysis
        if FLAGS.report_rand_pred:
            prediction_str = display_for_qualitative_evaluation(
                model,
                k_num_doc,
                mat_train,
                trainY,
                corpus_test,
                testX,
                testY,
                vocabulary_index2word,
                vocabulary_index2word_label,
                hamming_q=FLAGS.ave_labels_per_doc)
        # update the num_run
        num_run = num_run + 1

    print('\n--Final Results--\n')
    #print('C', FLAGS.C, 'gamma', FLAGS.gamma)

    # report min, max, std, average for the validation results
    min_valid_acc_th = min(valid_acc_th)
    min_valid_prec_th = min(valid_prec_th)
    min_valid_rec_th = min(valid_rec_th)
    min_valid_fmeasure_th = min(valid_fmeasure_th)
    min_valid_hamming_loss_th = min(valid_hamming_loss_th)

    max_valid_acc_th = max(valid_acc_th)
    max_valid_prec_th = max(valid_prec_th)
    max_valid_rec_th = max(valid_rec_th)
    max_valid_fmeasure_th = max(valid_fmeasure_th)
    max_valid_hamming_loss_th = max(valid_hamming_loss_th)

    if FLAGS.kfold != -1:
        std_valid_acc_th = statistics.stdev(valid_acc_th)  # to change
        std_valid_prec_th = statistics.stdev(valid_prec_th)
        std_valid_rec_th = statistics.stdev(valid_rec_th)
        std_valid_fmeasure_th = statistics.stdev(valid_fmeasure_th)
        std_valid_hamming_loss_th = statistics.stdev(valid_hamming_loss_th)

    final_valid_acc_th = sum(valid_acc_th) / num_runs
    final_valid_prec_th = sum(valid_prec_th) / num_runs
    final_valid_rec_th = sum(valid_rec_th) / num_runs
    final_valid_fmeasure_th = sum(valid_fmeasure_th) / num_runs
    final_valid_hamming_loss_th = sum(valid_hamming_loss_th) / num_runs

    print(
        "LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)"
        % (final_valid_acc_th, std_valid_acc_th, min_valid_acc_th,
           max_valid_acc_th, final_valid_hamming_loss_th,
           std_valid_hamming_loss_th, min_valid_hamming_loss_th,
           max_valid_hamming_loss_th, final_valid_prec_th, std_valid_prec_th,
           min_valid_prec_th, max_valid_prec_th, final_valid_rec_th,
           std_valid_rec_th, min_valid_rec_th, max_valid_rec_th,
           final_valid_fmeasure_th, std_valid_fmeasure_th,
           min_valid_fmeasure_th, max_valid_fmeasure_th))
    #output the result to a file
    output_valid = output_valid + "\n" + "LDA==>Final Validation results Validation Accuracy: %.3f ± %.3f (%.3f - %.3f)\tValidation Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tValidation Precision: %.3f ± %.3f (%.3f - %.3f)\tValidation Recall: %.3f ± %.3f (%.3f - %.3f)\tValidation F-measure: %.3f ± %.3f (%.3f - %.3f)" % (
        final_valid_acc_th, std_valid_acc_th, min_valid_acc_th,
        max_valid_acc_th, final_valid_hamming_loss_th,
        std_valid_hamming_loss_th, min_valid_hamming_loss_th,
        max_valid_hamming_loss_th, final_valid_prec_th, std_valid_prec_th,
        min_valid_prec_th, max_valid_prec_th, final_valid_rec_th,
        std_valid_rec_th, min_valid_rec_th, max_valid_rec_th,
        final_valid_fmeasure_th, std_valid_fmeasure_th, min_valid_fmeasure_th,
        max_valid_fmeasure_th) + "\n"
    output_csv_valid += "\naverage," + ",".join(
        "%s±%s" % (round(mean, 3), round(std, 3)) for mean, std in (
            (final_valid_hamming_loss_th, std_valid_hamming_loss_th),
            (final_valid_acc_th, std_valid_acc_th),
            (final_valid_prec_th, std_valid_prec_th),
            (final_valid_rec_th, std_valid_rec_th),
            (final_valid_fmeasure_th, std_valid_fmeasure_th)))

    # report min, max, std, average for the testing results
    min_test_acc_th = min(test_acc_th)
    min_test_prec_th = min(test_prec_th)
    min_test_rec_th = min(test_rec_th)
    min_test_fmeasure_th = min(test_fmeasure_th)
    min_test_hamming_loss_th = min(test_hamming_loss_th)

    max_test_acc_th = max(test_acc_th)
    max_test_prec_th = max(test_prec_th)
    max_test_rec_th = max(test_rec_th)
    max_test_fmeasure_th = max(test_fmeasure_th)
    max_test_hamming_loss_th = max(test_hamming_loss_th)

    if FLAGS.kfold != -1:
        std_test_acc_th = statistics.stdev(test_acc_th)  # to change
        std_test_prec_th = statistics.stdev(test_prec_th)
        std_test_rec_th = statistics.stdev(test_rec_th)
        std_test_fmeasure_th = statistics.stdev(test_fmeasure_th)
        std_test_hamming_loss_th = statistics.stdev(test_hamming_loss_th)

    final_test_acc_th = sum(test_acc_th) / num_runs
    final_test_prec_th = sum(test_prec_th) / num_runs
    final_test_rec_th = sum(test_rec_th) / num_runs
    final_test_fmeasure_th = sum(test_fmeasure_th) / num_runs
    final_test_hamming_loss_th = sum(test_hamming_loss_th) / num_runs

    print(
        "LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)"
        %
        (final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th,
         final_test_hamming_loss_th, std_test_hamming_loss_th,
         min_test_hamming_loss_th, max_test_hamming_loss_th,
         final_test_prec_th, std_test_prec_th, min_test_prec_th,
         max_test_prec_th, final_test_rec_th, std_test_rec_th, min_test_rec_th,
         max_test_rec_th, final_test_fmeasure_th, std_test_fmeasure_th,
         min_test_fmeasure_th, max_test_fmeasure_th))
    #output the result to a file
    output_test = output_test + "\n" + "LDA==>Final Test results Test Accuracy: %.3f ± %.3f (%.3f - %.3f)\tTest Hamming Loss: %.3f ± %.3f (%.3f - %.3f)\tTest Precision: %.3f ± %.3f (%.3f - %.3f)\tTest Recall: %.3f ± %.3f (%.3f - %.3f)\tTest F-measure: %.3f ± %.3f (%.3f - %.3f)" % (
        final_test_acc_th, std_test_acc_th, min_test_acc_th, max_test_acc_th,
        final_test_hamming_loss_th, std_test_hamming_loss_th,
        min_test_hamming_loss_th, max_test_hamming_loss_th, final_test_prec_th,
        std_test_prec_th, min_test_prec_th, max_test_prec_th,
        final_test_rec_th, std_test_rec_th, min_test_rec_th, max_test_rec_th,
        final_test_fmeasure_th, std_test_fmeasure_th, min_test_fmeasure_th,
        max_test_fmeasure_th) + "\n"
    output_csv_test += "\naverage," + ",".join(
        "%s±%s" % (round(mean, 3), round(std, 3)) for mean, std in (
            (final_test_hamming_loss_th, std_test_hamming_loss_th),
            (final_test_acc_th, std_test_acc_th),
            (final_test_prec_th, std_test_prec_th),
            (final_test_rec_th, std_test_rec_th),
            (final_test_fmeasure_th, std_test_fmeasure_th)))

    setting = "dataset:" + str(FLAGS.dataset) + "\nT: " + str(
        FLAGS.num_topics) + "\nk: " + str(FLAGS.k_num_doc) + ' \ni: ' + str(
            FLAGS.iterations)
    print("--- The whole program took %s seconds ---" %
          (time.time() - start_time))
    time_used = "--- The whole program took %s seconds ---" % (time.time() -
                                                               start_time)
    if FLAGS.kfold != -1:
        print("--- The average training took %s ± %s seconds ---" %
              (sum(time_train) / num_runs, statistics.stdev(time_train)))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (
            sum(time_train) / num_runs, statistics.stdev(time_train))
    else:
        print("--- The average training took %s ± %s seconds ---" %
              (sum(time_train) / num_runs, 0))
        average_time_train = "--- The average training took %s ± %s seconds ---" % (
            sum(time_train) / num_runs, 0)

    # output setting configuration, results, prediction and time used
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' +
        str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' +
        str(FLAGS.marking_id) + '.txt',
        setting + '\n' + output_valid + '\n' + output_test + '\n' +
        prediction_str + '\n' + time_used + '\n' + average_time_train)
    # output structured evaluation results
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' +
        str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' +
        str(FLAGS.marking_id) + ' valid.csv', output_csv_valid)
    output_to_file(
        'lda ' + str(FLAGS.dataset) + " T" + str(FLAGS.num_topics) + ' k' +
        str(FLAGS.k_num_doc) + ' i' + str(FLAGS.iterations) + ' gp_id' +
        str(FLAGS.marking_id) + ' test.csv', output_csv_test)
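# A minimal, self-contained sketch of the MALLET-to-native-Gensim conversion used
# above to obtain fixed-length topic vectors. It assumes an older gensim release
# that still ships gensim.models.wrappers, and that the MALLET path below is
# adjusted to a local install; the toy corpus is purely illustrative.
import numpy as np
import gensim
from gensim import corpora

demo_docs = [["topic", "model", "example"], ["another", "tiny", "document"]]
demo_id2word = corpora.Dictionary(demo_docs)
demo_corpus = [demo_id2word.doc2bow(d) for d in demo_docs]
demo_mallet_path = "/path/to/mallet/bin/mallet"  # assumption: point this at your MALLET binary

demo_mallet_lda = gensim.models.wrappers.LdaMallet(
    demo_mallet_path, corpus=demo_corpus, num_topics=2,
    id2word=demo_id2word, iterations=50)
# Convert to a native Gensim LDA model so that per-document topic distributions can
# be queried with minimum_probability=0.0, i.e. one probability per topic.
demo_lda = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(demo_mallet_lda)
demo_doc_topics = np.array(demo_lda.get_document_topics(demo_corpus, minimum_probability=0.0))
demo_topic_matrix = demo_doc_topics[:, :, 1]  # shape: (n_docs, num_topics)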
Ejemplo n.º 32
0
def load_pred_data(data_path,
                   vocab_word2index, vocab_char2index, vocab_pos2index, vocab_cap2index,
                   sentence_len, word_len,
                   flag_use_char, flag_use_pos, flag_use_cap):
    """
    Load prediction data and turn it into index sequences.
    :param data_path: path to a tab-separated file whose second column holds the text
    :param vocab_word2index: word -> index mapping
    :param vocab_char2index: character -> index mapping
    :param vocab_pos2index: POS tag -> index mapping
    :param vocab_cap2index: capitalization feature -> index mapping
    :param sentence_len: max length of the word sequence
    :param word_len: max length of the char sequence per word
    :param flag_use_char, flag_use_pos, flag_use_cap: whether to build the corresponding features
    :return: X, a dict of feature arrays, and the raw data lines
                - word_sequence: sentence_len
                - char_sequence: sentence_len * word_len
                - pos_sequence: sentence_len
                - cap_sequence: sentence_len
    """
    with codecs.open(data_path, mode='r', encoding='utf-8') as data_file:
        data_lines = data_file.readlines()
    # build data samples:
    Word_sequences = []
    Char_sequences = []
    Pos_sequences = []
    Cap_sequences = []
    for i, line in enumerate(data_lines):
        raw_list = line.strip().split("\t")
        input_list = raw_list[1].split(" ")
        # get word lists
        word_sequence = [vocab_word2index.get(x, UNK_ID) for x in input_list]
        Word_sequences.append(word_sequence)
        # get char lists
        if flag_use_char:
            char_sequence = [] # [sentence_len, word_len]
            for word in input_list:
                char_indexs = [vocab_char2index.get(char, UNK_ID) for char in word]
                char_sequence.append(char_indexs)
            if len(input_list) < sentence_len:
                char_sequence.extend( [[0]] * (sentence_len-len(input_list)))
            else:
                char_sequence = char_sequence[:sentence_len]
            char_sequence = pad_sequences(char_sequence, maxlen=word_len, value=0.)
            Char_sequences.append(char_sequence)
        if flag_use_pos:
            pos_sequence = nltk.pos_tag(input_list) # [sentence_len]
            word_seq, pos_seq = zip(*pos_sequence)
            pos_sequence = list(pos_seq)
            pos_sequence = [vocab_pos2index.get(pos, UNK_ID) for pos in pos_sequence]
            Pos_sequences.append(pos_sequence)
        if flag_use_cap:
            cap_sequence = [word_capitalize(word) for word in input_list]
            cap_sequence = [vocab_cap2index[cap] for cap in cap_sequence]
            Cap_sequences.append(cap_sequence)
    Word_sequences = pad_sequences(Word_sequences, maxlen=sentence_len, value=0.)
    if flag_use_pos:
        Pos_sequences = pad_sequences(Pos_sequences, maxlen=sentence_len, value=0.)
    if flag_use_cap:
        Cap_sequences = pad_sequences(Cap_sequences, maxlen=sentence_len, value=0.)
    X = {'word':np.array(Word_sequences), 'char':np.array(Char_sequences), 'pos':np.array(Pos_sequences), 'cap':np.array(Cap_sequences)}
    return X, data_lines
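# A minimal, self-contained sketch of the two-level (char-within-word) padding
# performed in load_pred_data above: the sentence axis is padded/truncated by hand,
# then every word is padded to word_len with tflearn's pad_sequences. The toy
# vocabulary and sizes below are illustrative only.
from tflearn.data_utils import pad_sequences

demo_sentence_len, demo_word_len = 5, 4
demo_words = ["Deep", "learning"]
demo_char2index = {c: i + 2 for i, c in enumerate("abcdefghijklmnopqrstuvwxyz")}  # 0 = PAD, 1 = UNK

demo_chars = [[demo_char2index.get(ch.lower(), 1) for ch in w] for w in demo_words]
demo_chars = (demo_chars + [[0]] * (demo_sentence_len - len(demo_chars)))[:demo_sentence_len]  # sentence axis
demo_chars = pad_sequences(demo_chars, maxlen=demo_word_len, value=0.)  # char axis
print(demo_chars.shape)  # (5, 4): sentence_len x word_len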
print("training data not exist==>load data, and dump it to file system")
vocabulary_word2index, vocabulary_index2word = create_voabulary()
vocab_size=len(vocabulary_word2index)
vocabulary_word2index_label = create_voabulary_label()
train, test, _ =load_data(vocabulary_word2index, vocabulary_word2index_label)
trainX, trainY = train
testX, testY = test
print("testX.shape:",np.array(testX).shape) #2500个list.每个list代表一句话
print("testY.shape:",np.array(testY).shape) #2500个label
print("testX[0]:",testX[0]) #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
print("testX[1]:",testX[1]);print("testY[0]:",testY[0]) #0 ;print("testY[1]:",testY[1]) #0

# 2.Data preprocessing
# Sequence padding
print("start padding & transform to one hot...")
trainX = pad_sequences(trainX, maxlen=100, value=0.) #padding to max length
testX = pad_sequences(testX, maxlen=100, value=0.)   #padding to max length
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=number_classes) #y as one hot
testY = to_categorical(testY, nb_classes=number_classes)   #y as one hot
print("end padding & transform to one hot...")
#--------------------------------------------------------------------------------------------------
# cache trainX,trainY,testX,testY for next time use.
#    with open(f_cache, 'w') as f:
#        pickle.dump((trainX,trainY,testX,testY,vocab_size),f)
#else:
#    print("training data exists in cache. going to use it.")

# 3.Building convolutional network
######################################MODEL:1.conv-2.conv-3.conv-4.max_pool-5.dropout-6.FC##############################################################################################
#(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData")
def main(_):
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary()
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label()
    train,test = load_data_with_multilabels(vocabulary_word2index, vocabulary_word2index_label,FLAGS.training_path) #[1,11,3,1998,1998]
    trainX, trainY= train #TODO trainY1999
    testX, testY = test #TODO testY1999
    print("testX.shape:", np.array(testX).shape);print("testY.shape:", np.array(testY).shape)  # 2500个label
    # 2.Data preprocessing
    # Sequence padding
    print("start padding & transform to one hot...")
    trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    testX = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
    print("end padding & transform to one hot...")

    #2.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        fast_text=fastText(FLAGS.label_size, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.num_sampled,FLAGS.sentence_len,vocab_size,FLAGS.embed_size,FLAGS.is_training)
        #Initialize Save
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding: #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, fast_text)

        curr_epoch=sess.run(fast_text.epoch_step)
        #3.feed data & training
        number_of_training_data=len(trainX)
        batch_size=FLAGS.batch_size
        for epoch in range(curr_epoch,FLAGS.num_epochs):#range(start,stop,step_size)
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)):
                if epoch==0 and counter==0:
                    print("trainX[start:end]:",trainX[start:end]) # 2-D array; each element has length 100.
                    print("trainY[start:end]:",trainY[start:end]) # a list of lists; each element may have 1 to 5 labels.
                    #print("trainY1999[start:end]:",trainY1999[start:end])
                curr_loss,_=sess.run([fast_text.loss_val,fast_text.train_op],feed_dict={fast_text.sentence:trainX[start:end],fast_text.labels:trainY[start:end],}) #fast_text.labels_l1999:trainY1999[start:end]
                loss,counter=loss+curr_loss,counter+1 #acc+curr_acc,
                if counter %500==0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f" %(epoch,counter,loss/float(counter))) #\tTrain Accuracy:%.3f--->,acc/float(counter)

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4.validation
            print("epoch:",epoch,"validate_every:",FLAGS.validate_every,"validate or not:",(epoch % FLAGS.validate_every==0))
            if epoch % FLAGS.validate_every==0:
                eval_loss,eval_accuracy=do_eval(sess,fast_text,testX,testY,batch_size,vocabulary_index2word_label) #testY1999,eval_acc
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch,eval_loss,eval_accuracy)) #,\tValidation Accuracy: %.3f--->eval_acc
                #save model to checkpoint
                save_path=FLAGS.ckpt_dir+"model.ckpt"
                saver.save(sess,save_path,global_step=epoch) #fast_text.epoch_step

        # 5. Finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY,batch_size,vocabulary_index2word_label) #testY1999
    pass
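# A minimal sketch of the zip(range(...), range(...)) batching idiom used in the
# training loop above, with toy data; note that, as in the loop above, a final
# partial batch is silently dropped.
demo_data = list(range(10))
demo_batch_size = 3
for demo_start, demo_end in zip(range(0, len(demo_data), demo_batch_size),
                                range(demo_batch_size, len(demo_data), demo_batch_size)):
    print(demo_start, demo_end, demo_data[demo_start:demo_end])
# 0 3 [0, 1, 2]
# 3 6 [3, 4, 5]
# 6 9 [6, 7, 8]   <- the leftover element 9 never forms a batch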
Ejemplo n.º 35
0
# -*- coding: utf-8 -*-
# Apply an LSTM to IMDB sentiment dataset classification task
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading (downloads automatically if not present)
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train  # a list of 22500 elements; each looks like [17,25,10,406,26,14,556,61,62,323,4], i.e. word positions in the vocabulary
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)  # zero-pad every element of the list to length 100
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)  # one-hot encode the labels as 2-dimensional vectors
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
          batch_size=32)  # batch size assumed here to complete the truncated call, as in the standard TFLearn IMDB example
vocabulary_word2index, vocabulary_index2word = create_voabulary(simple='simple',
                                                                word2vec_model_path=FLAGS.word2vec_model_path,
                                                                name_scope="cnn2")
vocab_size = len(vocabulary_word2index)
vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="cnn2")
questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label, questionid_question_lists)
testX = []
question_id_list = []
for tuple in test:
    question_id, question_string_list = tuple
    question_id_list.append(question_id)
    testX.append(question_string_list)
# 2.Data preprocessing: Sequence padding
print("start padding....")
testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length
print("end padding...")
# 3.create session.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
graph=tf.Graph().as_default()
global sess
global textCNN
with graph:
    sess=tf.Session(config=config)
# 4.Instantiate Model
    textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                  FLAGS.decay_steps, FLAGS.decay_rate,
                  FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
    saver = tf.train.Saver()
    if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
def main(_):
    # 1.load data with vocabulary of words and labels
    vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="dynamic_memory_network")
    vocab_size = len(vocabulary_word2index)
    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(name_scope="dynamic_memory_network")
    questionid_question_lists=load_final_test_data(FLAGS.predict_source_file)
    test= load_data_predict(vocabulary_word2index,vocabulary_word2index_label,questionid_question_lists)
    testX=[]
    question_id_list=[]
    for tuple in test:
        question_id,question_string_list=tuple
        question_id_list.append(question_id)
        testX.append(question_string_list)
    # 2.Data preprocessing: Sequence padding
    print("start padding....")
    testX2 = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
    testX2_cnn = pad_sequences(testX, maxlen=FLAGS.sentence_len, value=0.)  # padding to max length, for CNN
    print("end padding...")
   # 3.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    graph1 = tf.Graph().as_default()
    graph2 = tf.Graph().as_default()
    graph3 = tf.Graph().as_default()
    graph4 = tf.Graph().as_default()
    graph5 = tf.Graph().as_default()
    global sess_dmn
    global sess_entity
    global sess_cnn
    global sess_rcnn
    with graph1:#DynamicMemoryNetwork
        sess_dmn = tf.Session(config=config)
        model_dmn = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                     FLAGS.story_length,vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training,num_pass=FLAGS.num_pass,
                                     use_gated_gru=FLAGS.use_gated_gru,decode_with_sequences=FLAGS.decode_with_sequences,multi_label_flag=FLAGS.multi_label_flag,l2_lambda=FLAGS.l2_lambda)
        saver_dmn = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir_dmn + "checkpoint"):
            print("Restoring Variables from Checkpoint of DMN.")
            saver_dmn.restore(sess_dmn, tf.train.latest_checkpoint(FLAGS.ckpt_dir_dmn))
        else:
            print("Can't find the checkpoint.going to stop.DMN")
            return
    with graph2:#EntityNet
        sess_entity = tf.Session(config=config)
        model_entity = EntityNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                              FLAGS.story_length,vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training,
                              multi_label_flag=True, block_size=FLAGS.block_size,use_bi_lstm=FLAGS.use_bi_lstm)
        saver_entity = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir_entity + "checkpoint"):
            print("Restoring Variables from Checkpoint of EntityNet.")
            saver_entity.restore(sess_entity, tf.train.latest_checkpoint(FLAGS.ckpt_dir_entity))
        else:
            print("Can't find the checkpoint.going to stop.EntityNet.")
            return
    with graph3:#TextCNN
        sess_cnn=tf.Session(config=config)
        model_cnn = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size,
                          FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sentence_len, vocab_size, FLAGS.embed_size, FLAGS.is_training)
        saver_cnn = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir_cnn + "checkpoint"):
            print("Restoring Variables from Checkpoint.TextCNN.")
            saver_cnn.restore(sess_cnn, tf.train.latest_checkpoint(FLAGS.ckpt_dir_cnn))
        else:
            print("Can't find the checkpoint.going to stop.TextCNN.")
            return
    with graph5:  #TextCNN_256embedding
        sess_cnn_256_embedding = tf.Session(config=config)
        model_cnn_256_embedding = TextCNN(filter_sizes_256_embedding, FLAGS.num_filters_256_embedding, FLAGS.num_classes, FLAGS.learning_rate,
                                FLAGS.batch_size,FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len, vocab_size,
                                FLAGS.embed_size_256_embedding, FLAGS.is_training)
        saver_cnn_256_embedding = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir_cnn_256_embedding + "checkpoint"):
            print("Restoring Variables from Checkpoint.TextCNN_256_embedding")
            saver_cnn_256_embedding.restore(sess_cnn_256_embedding, tf.train.latest_checkpoint(FLAGS.ckpt_dir_cnn_256_embedding))
        else:
            print("Can't find the checkpoint.going to stop.TextCNN_256_embedding.")
            return
    #with graph4:#RCNN
    #    sess_rcnn=tf.Session(config=config)
    #    model_rcnn=TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sentence_len,
    #            vocab_size,FLAGS.embed_size,FLAGS.is_training,FLAGS.batch_size,multi_label_flag=FLAGS.multi_label_flag)
    #    saver_rcnn = tf.train.Saver()
    #    if os.path.exists(FLAGS.ckpt_dir_rcnn + "checkpoint"):
    #        print("Restoring Variables from Checkpoint.TextRCNN.")
    #        saver_rcnn.restore(sess_rcnn, tf.train.latest_checkpoint(FLAGS.ckpt_dir_rcnn))
    #    else:
    #        print("Can't find the checkpoint.going to stop.TextRCNN.")
    #        return

        # 5.feed data, to get logits
        number_of_training_data=len(testX2);print("number_of_training_data:",number_of_training_data)
        index=0
        predict_target_file_f = codecs.open(FLAGS.predict_target_file, 'a', 'utf8')
        global sess_dmn
        global sess_entity
        for start, end in zip(range(0, number_of_training_data, FLAGS.batch_size),range(FLAGS.batch_size, number_of_training_data+1, FLAGS.batch_size)):
            #1.DMN
            logits_dmn=sess_dmn.run(model_dmn.logits,feed_dict={model_dmn.query:testX2[start:end],model_dmn.story: np.expand_dims(testX2[start:end],axis=1),
                                                        model_dmn.dropout_keep_prob:1.0})
            #2.EntityNet
            logits_entity=sess_entity.run(model_entity.logits,feed_dict={model_entity.query:testX2[start:end],model_entity.story: np.expand_dims(testX2[start:end],axis=1),
                                                        model_entity.dropout_keep_prob:1.0})
            #3.CNN
            logits_cnn = sess_cnn.run(model_cnn.logits,feed_dict={model_cnn.input_x: testX2_cnn[start:end], model_cnn.dropout_keep_prob: 1})
            #4.RCNN
            #logits_rcnn = sess_rcnn.run(model_rcnn.logits, feed_dict={model_rcnn.input_x: testX2_cnn[start:end],model_rcnn.dropout_keep_prob: 1})  # 'shape of logits:', ( 1, 1999)
            #5.CNN_256_original_embedding
            logits_cnn_256_embedding =sess_cnn_256_embedding.run(model_cnn_256_embedding.logits,feed_dict={model_cnn_256_embedding.input_x: testX2_cnn[start:end],
                                                                 model_cnn_256_embedding.dropout_keep_prob: 1})
            # how to combine the logits: a weighted average
            logits=logits_cnn*0.3+logits_cnn_256_embedding*0.3+logits_entity*0.2+logits_dmn*0.2#+logits_rcnn*0.15
            question_id_sublist=question_id_list[start:end]
            get_label_using_logits_batch(question_id_sublist, logits, vocabulary_index2word_label, predict_target_file_f)
            index=index+1
        predict_target_file_f.close()
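# A minimal NumPy sketch of the weighted-logit ensembling done above
# (logits = 0.3*cnn + 0.3*cnn_256 + 0.2*entity + 0.2*dmn); the model names,
# weights and shapes below are illustrative only.
import numpy as np

demo_logits_a = np.random.rand(4, 10)   # e.g. CNN logits for a batch of 4 examples
demo_logits_b = np.random.rand(4, 10)   # e.g. DMN logits for the same batch
demo_weights = (0.6, 0.4)               # should sum to 1 for a convex combination
demo_ensemble = demo_weights[0] * demo_logits_a + demo_weights[1] * demo_logits_b
demo_top5 = np.argsort(-demo_ensemble, axis=1)[:, :5]  # 5 highest-scoring label ids per example
print(demo_top5)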
Ejemplo n.º 38
0
def main(_):
    #1.load data(X:list of lint,y:int).
    #if os.path.exists(FLAGS.cache_path):  # if it exists on the file system, load the cached (vocabulary-indexed) data
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary()
        vocab_size = len(vocabulary_word2index)
        vocabulary_word2index_label, _ = create_voabulary_label()
        train, test, _ = load_data(vocabulary_word2index,
                                   vocabulary_word2index_label,
                                   data_type='train')
        trainX, trainY = train
        testX, testY = test
        print("testX.shape:", np.array(testX).shape)  # 2500个list.每个list代表一句话
        print("testY.shape:", np.array(testY).shape)  # 2500个label
        print("testX[0]:",
              testX[0])  # [17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
        print("testX[1]:", testX[1])
        print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0

        # 2.Data preprocessing
        # Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len,
                               value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                              value=0.)  # padding to max length
        ###############################################################################################
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        ###############################################################################################
    print("testX[0]:", testX[0])
    print("testX[1]:", testX[1])
    #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
    # Converting labels to binary vectors
    print("testY[0]:", testY[0])  # 0 ;print("testY[1]:",testY[1]) #0
    print("end padding & transform to one hot...")
    #2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        fast_text = fastText(FLAGS.label_size, FLAGS.learning_rate,
                             FLAGS.batch_size, FLAGS.decay_steps,
                             FLAGS.decay_rate, FLAGS.num_sampled,
                             FLAGS.sentence_len, vocab_size, FLAGS.embed_size,
                             FLAGS.is_training)
        #Initialize Save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, fast_text)

        curr_epoch = sess.run(fast_text.epoch_step)
        #3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch,
                           FLAGS.num_epochs):  #range(start,stop,step_size)
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                    print("trainY[start:end]:", trainY[start:end])
                curr_loss, curr_acc, _ = sess.run(
                    [
                        fast_text.loss_val, fast_text.accuracy,
                        fast_text.train_op
                    ],
                    feed_dict={
                        fast_text.sentence: trainX[start:end],
                        fast_text.labels: trainY[start:end]
                    })
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1
                if counter % 500 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter)))

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, testX, testY,
                                              batch_size)
                print(
                    "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                    % (epoch, eval_loss, eval_acc))

                #save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(
                    sess, save_path,
                    global_step=fast_text.epoch_step)  #fast_text.epoch_step

        # 5. Finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, fast_text, testX, testY,
                                      batch_size)
    pass
min_frequency = 2 
vp = tflearn.data_utils.VocabularyProcessor(max_tweet_length, min_frequency=min_frequency)
vp = vp.fit(tweets)
val = len(vp.vocabulary_)
print(val)
tweets_parsed = vp.transform(tweets)
vp.save('my_dictionary')
print(vp)

trainX = tweets_parsed
trainY = tflearn.data_utils.to_categorical(content1, nb_classes=0)

filtered_gen = (item for item in trainX)
gen_to_list = list(filtered_gen)

trainX1 = pad_sequences(gen_to_list, maxlen=120, value=0.)
#print(trainX1)



# Network building
net = tflearn.input_data([None, 120])
net = tflearn.embedding(net, input_dim=val, output_dim=64)
net = tflearn.lstm(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam',loss='binary_crossentropy')


# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=0)
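# A minimal sketch of loading the vocabulary saved above with vp.save('my_dictionary')
# and reusing it at prediction time, assuming tflearn's VocabularyProcessor provides
# the usual save/restore pair; the example tweet is illustrative only.
from tflearn.data_utils import VocabularyProcessor

vp_restored = VocabularyProcessor.restore('my_dictionary')
demo_ids = list(vp_restored.transform(["an example tweet to index"]))
print(demo_ids[0][:10])  # first ten word ids, padded/truncated to max_tweet_length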
Ejemplo n.º 40
0
print(train_y.values)
train_y = np.reshape(train_y.values, (-1, 1))
test_y = np.reshape(test_y.values, (-1, 1))
#train_y = tf.constant(train_y.values)
print(test_y)
# Data preprocessing
embeddings, vocab = get_embeddings(efile_name)
print(embeddings.shape)
vocab_size, embeddings_dim = embeddings.shape
print(vocab_size, embeddings_dim)
train_x, test_x = vectorize(train_x, test_x, embeddings, vocab, 25,
                            unknown_token)

embeddings = embeddings.as_matrix()
train_x = pad_sequences(train_x, maxlen=25, value=0.)
test_x = pad_sequences(test_x, maxlen=25, value=0.)

batch_size = 8

# net building
net = tflearn.input_data([None, 25])
net = tflearn.embedding(net,
                        input_dim=len(vocab),
                        output_dim=400,
                        trainable=False,
                        name="EmbeddingLayer")
net = tflearn.layers.normalization.batch_normalization(net)
#print('shape: ', net.shape)
#net = tflearn.reshape(net, [None, 25, 400])
print('shape: ', net.shape)
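# A hedged, self-contained sketch of the usual tflearn pattern for loading a
# pre-trained matrix into a frozen embedding layer (the layer above is created with
# trainable=False and name="EmbeddingLayer" for exactly this purpose). The vocabulary
# size, dimensions and layer names below are illustrative, not the original module's.
import numpy as np
import tflearn

demo_vocab_size, demo_dim = 1000, 50
demo_embeddings = np.random.rand(demo_vocab_size, demo_dim).astype(np.float32)

demo_net = tflearn.input_data([None, 25])
demo_net = tflearn.embedding(demo_net, input_dim=demo_vocab_size, output_dim=demo_dim,
                             trainable=False, name="DemoEmbedding")
demo_net = tflearn.fully_connected(demo_net, 2, activation='softmax')
demo_net = tflearn.regression(demo_net, optimizer='adam', loss='categorical_crossentropy')

demo_model = tflearn.DNN(demo_net, tensorboard_verbose=0)
demo_emb_var = tflearn.get_layer_variables_by_name("DemoEmbedding")[0]
demo_model.set_weights(demo_emb_var, demo_embeddings)  # copy the pre-trained matrix into the frozen layer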
Ejemplo n.º 41
0
def main(_):
    #1.load data(X:list of lint,y:int).
    #if os.path.exists(FLAGS.cache_path):  # if it exists on the file system, load the cached (vocabulary-indexed) data
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(
            word2vec_model_path=FLAGS.word2vec_model_path,
            name_scope="cnn2")  #simple='simple'
        vocab_size = len(vocabulary_word2index)
        print("cnn_model.vocab_size:", vocab_size)
        vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
            name_scope="cnn2")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path = 'training-data/train-zhihu6-title-desc.txt'  #test-zhihu5-only-title-multilabel.txt
        train, test, _ = load_data_multilabel_new(
            vocabulary_word2index,
            vocabulary_word2index_label,
            multi_label_flag=FLAGS.multi_label_flag,
            traning_data_path=FLAGS.traning_data_path
        )  #,traning_data_path=FLAGS.traning_data_path
        trainX, trainY = train
        testX, testY = test
        # 2.Data preprocessing.Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len,
                               value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                              value=0.)  # padding to max length
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        print("trainX[0]:", trainX[0])  #;print("trainY[0]:", trainY[0])
        # Converting labels to binary vectors
        print("end padding & transform to one hot...")
    #2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        textCNN = TextCNN(filter_sizes,
                          FLAGS.num_filters,
                          FLAGS.num_classes,
                          FLAGS.learning_rate,
                          FLAGS.batch_size,
                          FLAGS.decay_steps,
                          FLAGS.decay_rate,
                          FLAGS.sentence_len,
                          vocab_size,
                          FLAGS.embed_size,
                          FLAGS.is_training,
                          multi_label_flag=FLAGS.multi_label_flag)
        #Initialize Save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  #load pre-trained word embedding
                assign_pretrained_word_embedding(
                    sess,
                    vocabulary_index2word,
                    vocab_size,
                    textCNN,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        #3.feed data & training
        number_of_training_data = len(trainX)
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end]
                          )  #;print("trainY[start:end]:",trainY[start:end])
                feed_dict = {
                    textCNN.input_x: trainX[start:end],
                    textCNN.dropout_keep_prob: 0.5
                }
                if not FLAGS.multi_label_flag:
                    feed_dict[textCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textCNN.input_y_multilabel] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [textCNN.loss_val, textCNN.accuracy, textCNN.train_op],
                    feed_dict)  #curr_acc--->TextCNN.accuracy
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter))
                    )  #tTrain Accuracy:%.3f---》acc/float(counter)

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, textCNN, testX, testY,
                                              batch_size,
                                              vocabulary_index2word_label)
                print(
                    "Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f"
                    % (epoch, eval_loss, eval_acc))
                #save model to checkpoint
                save_path = FLAGS.ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=epoch)

        # 5. Finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, textCNN, testX, testY, batch_size,
                                      vocabulary_index2word_label)
    pass
Ejemplo n.º 42
0
def test_pad():
    trainX = 'w18476 w4454 w1674 w6 w25 w474 w1333 w1467 w863 w6 w4430 w11 w813 w4463 w863 w6 w4430 w111'
    trainX = trainX.split(" ")
    trainX = pad_sequences([[trainX]], maxlen=100, value=0.)
    print("trainX:", trainX)
Ejemplo n.º 43
0
print("started...")
# 1.IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl',
                                n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test
print("testX.shape:", np.array(testX).shape)  #2500个list.每个list代表一句话
print("testY.shape:", np.array(testY).shape)  #2500个label
print("testX[0]:", testX[0])  #[17, 25, 10, 406, 26, 14, 56, 61, 62, 323, 4]
print("testY[0]:", testY[0])  #0

# 2.Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)  #padding to max length
testX = pad_sequences(testX, maxlen=100, value=0.)  #padding to max length
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)  #y as one hot
testY = to_categorical(testY, nb_classes=2)  #y as one hot

# 3.Building convolutional network
#(shape=None, placeholder=None, dtype=tf.float32,data_preprocessing=None, data_augmentation=None,name="InputData")
network = input_data(
    shape=[None, 100], name='input'
)  # [None, 100]. `input_data` is the data entry point (placeholder) of a network; it will be fed with data during training.
network = tflearn.embedding(
    network, input_dim=10000, output_dim=128
)  # [None, 100, 128]. Embedding layer for a sequence of ids. network: incoming 2-D tensor; input_dim: vocabulary size; output_dim: embedding size.
#conv_1d(incoming,nb_filter,filter_size)
branch1 = conv_1d(
Ejemplo n.º 44
0
n_datas = 10000

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl',
                                n_words=n_datas,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# NOTE: Padding is required for dimension consistency. This will pad sequences
# with 0 at the end, until it reaches the max sequence length. 0 is used as a
# masking value by dynamic RNNs in TFLearn; a sequence length will be
# retrieved by counting non zero elements in a sequence. Then dynamic RNN step
# computation is performed according to that length.
trainX = pad_sequences(trainX, maxlen=input_length_each_seq, value=0.)
testX = pad_sequences(testX, maxlen=input_length_each_seq, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, n_class)
testY = to_categorical(testY, n_class)
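# A minimal NumPy sketch of the masking convention described in the NOTE above:
# with value=0. padding, the effective length of each sequence is the number of
# non-zero ids, which is the length the dynamic RNN recovers internally.
import numpy as np

demo_padded = np.array([[17, 25, 10, 0, 0],
                        [406, 26, 14, 56, 61]])
print(np.count_nonzero(demo_padded, axis=1))  # [3 5]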

# Network building
net = tflearn.input_data([None, input_length_each_seq])
# Masking is not required for embedding, sequence length is computed prior to
# the embedding op and assigned as 'seq_length' attribute to the returned Tensor.
net = tflearn.embedding(net, input_dim=n_datas, output_dim=hiddle_layes)
net = tflearn.lstm(net, hiddle_layes_2, dropout=0.8, dynamic=True)
net = tflearn.fully_connected(net, n_class, activation='softmax')
net = tflearn.regression(net,
                         optimizer='adam',
                         learning_rate=0.001,
Ejemplo n.º 45
0
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    X, Y, test_size=test_ratio, random_state=2017)

Y_train = to_categorical(Y_train, nb_classes=len(qualities))
Y_test = to_categorical(Y_test, nb_classes=len(qualities))

### Process vocabulary

print('Process vocabulary')

vocab_processor = tflearn.data_utils.VocabularyProcessor(
    max_document_length=model_size, min_frequency=0)
X_train = np.array(list(vocab_processor.fit_transform(X_train)))
X_test = np.array(list(vocab_processor.transform(X_test)))  # only transform here: the vocabulary should be fit on the training split only

X_train = pad_sequences(X_train, maxlen=model_size, value=0.)
X_test = pad_sequences(X_test, maxlen=model_size, value=0.)

n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

# pickle.dump (X_train, open ("xtrain.p", b))
# pickle.dump (X_test, open ("xtest.p", b))

# X_train = pickle.load (open ("xtrain.p", rb))
# X_test = pickle.load (open ("xtest.p", rb))

### Models

print('Build model')
Ejemplo n.º 46
0
def main(_):
    #1.load data(X:list of lint,y:int).
    #if os.path.exists(FLAGS.cache_path):  # load training data from cache file.
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1==1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="dynamic_memory_network") #simple='simple'
        vocab_size = len(vocabulary_word2index)
        print("dynamic_memory_network.vocab_size:",vocab_size)
        vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(name_scope="dynamic_memory_network")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path='../training-data/train-zhihu6-title-desc.txt' #change this line if want to train in a small dataset. e.g. dataset from 'test-zhihu6-title-desc.txt'
        train,test,_=load_data_multilabel_new(vocabulary_word2index,vocabulary_word2index_label,multi_label_flag=FLAGS.multi_label_flag,
                                              traning_data_path=FLAGS.traning_data_path)
        trainX, trainY = train
        testX, testY = test

        print("trainY:",trainY[0:10])
        # 2.Data preprocessing.Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        print("trainX[0]:", trainX[0]) #;print("trainY[0]:", trainY[0])
        # Converting labels to binary vectors
        print("end padding & transform to one hot...")
    #2.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        model = DynamicMemoryNetwork(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                                     FLAGS.story_length,vocab_size, FLAGS.embed_size, FLAGS.hidden_size, FLAGS.is_training,num_pass=FLAGS.num_pass,
                                     use_gated_gru=FLAGS.use_gated_gru,decode_with_sequences=FLAGS.decode_with_sequences,multi_label_flag=FLAGS.multi_label_flag,l2_lambda=FLAGS.l2_lambda)
        #Initialize Save
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding: #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model,word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch=sess.run(model.epoch_step)
        #3.feed data & training
        number_of_training_data=len(trainX)
        print("number_of_training_data:",number_of_training_data)
        previous_eval_loss=10000
        best_eval_loss=10000
        batch_size=FLAGS.batch_size
        for epoch in range(curr_epoch,FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)):
                if epoch==0 and counter==0:
                    print("trainX[start:end]:",trainX[start:end])#;print("trainY[start:end]:",trainY[start:end])
                feed_dict = {model.query: trainX[start:end],model.story: np.expand_dims(trainX[start:end],axis=1),model.dropout_keep_prob: 1.0}
                if not FLAGS.multi_label_flag:
                    feed_dict[model.answer_single] = trainY[start:end]
                else:
                    feed_dict[model.answer_multilabel]=trainY[start:end]
                curr_loss,curr_acc,_=sess.run([model.loss_val,model.accuracy,model.train_op],feed_dict) #curr_acc--->TextCNN.accuracy
                loss,counter,acc=loss+curr_loss,counter+1,acc+curr_acc
                if counter %50==0:
                    print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                          %(epoch,counter,math.exp(loss/float(counter)) if (loss/float(counter))<20 else 10000.000,acc/float(counter))) #tTrain Accuracy:%.3f---》acc/float(counter)
                ##VALIDATION VALIDATION VALIDATION PART######################################################################################################
                if FLAGS.batch_size!=0 and (start%(FLAGS.validate_step*FLAGS.batch_size)==0): #(epoch % FLAGS.validate_every) or  if epoch % FLAGS.validate_every == 0:
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label)
                    print("dynamic_memory_network[use_gated_gru=False,num_pass=2].validation.part. previous_eval_loss:", math.exp(previous_eval_loss) if previous_eval_loss<20 else 10000.000,";current_eval_loss:", math.exp(eval_loss) if eval_loss<20 else 10000.000)
                    if eval_loss > previous_eval_loss: #if loss is not decreasing
                        # reduce the learning rate by a factor of 0.5
                        print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>validation.part.going to reduce the learning rate.")
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr=sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>validation.part.learning_rate1:", learning_rate1, " ;learning_rate2:",learning_rate2)
                    else:# loss is decreasing
                        if eval_loss<best_eval_loss:
                            print("dynamic_memory_network[use_gated_gru=False,num_pass=2]==>going to save the model.eval_loss:",math.exp(eval_loss) if eval_loss<20 else 10000.000,";best_eval_loss:",math.exp(best_eval_loss) if best_eval_loss<20 else 10000.000)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss=eval_loss
                    previous_eval_loss = eval_loss
                ##VALIDATION VALIDATION VALIDATION PART######################################################################################################

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)

        # 5.test on test set
        test_loss, test_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label)
    pass
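The feed_dict in the loop above reuses each padded question as its own one-sentence story via np.expand_dims. A minimal, standalone sketch of that shape transformation (illustrative only, assuming story_length=1):

import numpy as np

# [batch_size=2, sequence_length=4] -> [batch_size=2, story_length=1, sequence_length=4]
batch = np.array([[4, 8, 15, 0],
                  [16, 23, 42, 0]])
story = np.expand_dims(batch, axis=1)
print(batch.shape, story.shape)  # (2, 4) (2, 1, 4)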
def main(_):
    #1. load data (X: list of int, y: int).
    #if os.path.exists(FLAGS.cache_path):  # if the cache exists on the file system, load the (vocabulary-indexed) data
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1==1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="transformer_classification") #simple='simple'
        vocab_size = len(vocabulary_word2index)
        print("transformer.vocab_size:",vocab_size)
        vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(name_scope="transformer_classification")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path='training-data/test-zhihu6-title-desc.txt' #one record like this:'w35620 w1097 w111 c278 c150 c150 c285 c278 c43 __label__7756633728210171144 3195914392210930723'
        train,test,_=load_data_multilabel_new(vocabulary_word2index,vocabulary_word2index_label,multi_label_flag=FLAGS.multi_label_flag,
                                              traning_data_path=FLAGS.traning_data_path)
        trainX, trainY, = train
        testX, testY = test

        print("trainY:",trainY[0:10])
        # 2.Data preprocessing.Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        print("trainX[0]:", trainX[0]) #;print("trainY[0]:", trainY[0])
        # Converting labels to binary vectors
        print("end padding & transform to one hot...")
    #2.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        model=Transformer(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sequence_length,
                 vocab_size, FLAGS.embed_size,FLAGS.d_model,FLAGS.d_k,FLAGS.d_v,FLAGS.h,FLAGS.num_layer,FLAGS.is_training,l2_lambda=FLAGS.l2_lambda)
        #Initialize Save
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding: #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, model,word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch=sess.run(model.epoch_step)
        #3.feed data & training
        number_of_training_data=len(trainX)
        print("number_of_training_data:",number_of_training_data)
        previous_eval_loss=10000
        best_eval_loss=10000
        batch_size=FLAGS.batch_size
        for epoch in range(curr_epoch,FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)):
                if epoch==0 and counter==0:
                    print("trainX[start:end]:",trainX[start:end])
                feed_dict = {model.input_x: trainX[start:end],model.dropout_keep_prob: 0.5}
                feed_dict[model.input_y_label]=trainY[start:end]
                curr_loss,curr_acc,_=sess.run([model.loss_val,model.accuracy,model.train_op],feed_dict) #curr_acc--->TextCNN.accuracy
                loss,counter,acc=loss+curr_loss,counter+1,acc+curr_acc
                if counter %50==0:
                    print("transformer.classification==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" %(epoch,counter,loss/float(counter),acc/float(counter))) #tTrain Accuracy:%.3f---》acc/float(counter)
                ##VALIDATION VALIDATION VALIDATION PART######################################################################################################
                if FLAGS.batch_size!=0 and (start%(FLAGS.validate_step*FLAGS.batch_size)==0):
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label)
                    print("transformer.classification.validation.part. previous_eval_loss:", previous_eval_loss,";current_eval_loss:",eval_loss)
                    if eval_loss > previous_eval_loss: #if loss is not decreasing
                        # reduce the learning rate by a factor of 0.5
                        print("transformer.classification.==>validation.part.going to reduce the learning rate.")
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr=sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print("transformer.classification==>validation.part.learning_rate1:", learning_rate1, " ;learning_rate2:",learning_rate2)
                    #print("HierAtten==>Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                    else:# loss is decreasing
                        if eval_loss<best_eval_loss:
                            print("transformer.classification==>going to save the model.eval_loss:",eval_loss,";best_eval_loss:",best_eval_loss)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss=eval_loss
                    previous_eval_loss = eval_loss
                ##VALIDATION VALIDATION VALIDATION PART######################################################################################################

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)

        # 5. Finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, model, testX, testY, batch_size,vocabulary_index2word_label)
    pass
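Both training loops above apply the same validation policy: if the evaluation loss goes up, halve the learning rate; if it sets a new best, save a checkpoint. A framework-free sketch of that policy (hypothetical helper names, not from the original code):

def validation_step(eval_loss, state, halve_learning_rate, save_checkpoint):
    """state holds 'previous_eval_loss', 'best_eval_loss' and 'learning_rate'."""
    if eval_loss > state['previous_eval_loss']:     # loss is not decreasing
        state['learning_rate'] *= 0.5               # mirrors learning_rate_decay_half_op
        halve_learning_rate(state['learning_rate'])
    elif eval_loss < state['best_eval_loss']:       # new best: persist the model
        save_checkpoint()
        state['best_eval_loss'] = eval_loss
    state['previous_eval_loss'] = eval_loss

state = {'previous_eval_loss': 10000, 'best_eval_loss': 10000, 'learning_rate': 0.001}
validation_step(0.9, state, halve_learning_rate=lambda lr: None, save_checkpoint=lambda: print("saved"))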
Ejemplo n.º 48
0
from tflearn.layers.estimator import regression

#Get data
trainX, embeddings, trainY, maxLen, POS_labels = data.get_Data_Embeddings()
POS_vectors, _ = labelMatrix2OneHot(POS_labels)
del data

print("TrainX : ", len(trainX))
print("TrainY : ", len(trainY))
print("Embd : ", len(embeddings))
print("POS : ", len(POS_labels))
print("Max Len : ", maxLen)

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=maxLen, value=0)
# Pad the label sequences to the same length (this is sequence labeling; no one-hot conversion here)
trainY = pad_sequences(trainY, maxlen=maxLen, value=0)
embeddings = concat_2Dvectors(embeddings, Flatten_3Dto2D(POS_vectors))

# Network building
print("Beginning neural network")
net = input_data(shape=[None, maxLen])
net = embedding(net,
                input_dim=len(embeddings),
                output_dim=len(embeddings[0]),
                trainable=False,
                name="EmbeddingLayer")
print("After embeddings : ", net.get_shape().as_list())
net = bidirectional_rnn(net,
                        BasicLSTMCell(1024),
Ejemplo n.º 49
0
def main():
    train_x, train_y, val_x, val_y, test_x, test_y, vocab_size = load_data()

    label_size = 10
    learning_rate = 0.01
    batch_size = 128
    decay_steps = 20000
    decay_rate = 0.8
    ckpt_dir = "fast_text_checkpoint/"
    sentence_len = 200
    embed_size = 100
    is_training = True
    num_epochs = 15
    validate_every = 1


    print("start padding...")

    train_x = pad_sequences(train_x, maxlen=sentence_len, value = 0)
    val_x = pad_sequences(val_x, maxlen=sentence_len, value = 0)
    test_x = pad_sequences(test_x, maxlen=sentence_len, value=0)
    print("end padding...")

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config = config) as sess:

        fast_text = fastText(label_size = 10,
                             learning_rate = 0.01,
                             batch_size = 128,
                             decay_step = 20000,
                             decay_rate = 0.8,
                             sentence_len =  200,
                             vocab_size = vocab_size,
                             embed_size = 100,
                             is_training = True)

        saver = tf.train.Saver()
        if os.path.exists(ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())

        curr_epoch = sess.run(fast_text.epoch_step)

        number_of_training_data = len(train_x)
        batch_size = batch_size

        for epoch in range(curr_epoch, num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)):

                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:",train_x[start:end].shape)
                    print("trainY[start:end]:",train_y[start:end].shape)


                curr_loss, curr_acc, _ = sess.run([fast_text.loss_val, fast_text.accuracy, fast_text.train_op],
                                                  feed_dict= \
                                                      {   fast_text.sentence : train_x[start : end],
                                                          fast_text.labels : train_y[start : end]}
                                                  )
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1

                if counter % 500 == 0:
                    print(epoch)
                    print(counter)
                    print(loss)
                    print(acc)
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (epoch, counter, loss / float(counter), acc / float(counter)))

            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            print(epoch, validate_every, (epoch % validate_every == 0))

            if epoch % validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, val_x, val_y, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))

                # save model to checkpoint
                save_path = ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=fast_text.epoch_step)  # fast_text.epoch_step

        test_loss, test_acc = do_eval(sess, fast_text, test_x, test_y, batch_size)
        print("test Loss:%.3f\ttest Accuracy: %.3f" % (test_loss, test_acc))
    return
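The batching idiom used in this loop, zip(range(0, N, batch_size), range(batch_size, N, batch_size)), silently drops the final partial batch. A small standalone illustration (toy numbers):

number_of_training_data, batch_size = 10, 4
batches = list(zip(range(0, number_of_training_data, batch_size),
                   range(batch_size, number_of_training_data, batch_size)))
print(batches)  # [(0, 4), (4, 8)] -- the last two examples (indices 8 and 9) are never trained on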
Ejemplo n.º 50
0
def load_data(traning_data_path,
              vocab_word2index,
              vocab_label2index,
              sentence_len,
              name_scope,
              training_portion=0.95,
              tokenize_style='char'):
    """
    convert data as indexes using word2index dicts.
    :param traning_data_path:
    :param vocab_word2index:
    :param vocab_label2index:
    :return:
    """
    cache_data_dir = 'cache' + "_" + name_scope  # path to save cache
    cache_file = cache_data_dir + "/" + 'train_valid_test.pik'
    print("cache_path:", cache_file, "train_valid_test_file_exists:",
          os.path.exists(cache_file))
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as data_f:
            print("going to load cache file from file system and return")
            return pickle.load(data_f)

    csvfile = open(traning_data_path, 'r')
    spamreader = csv.reader(csvfile, delimiter='\t', quotechar='|')
    label_size = len(vocab_label2index)
    X1_ = []
    X2_ = []
    Y_ = []

    tfidf_source_file = './data/atec_nl_sim_train.txt'
    tfidf_target_file = './data/atec_nl_sim_tfidf.txt'
    if not os.path.exists(tfidf_target_file):
        get_tfidf_score_and_save(tfidf_source_file, tfidf_target_file)

    BLUE_SCORES_ = []
    word_vec_fasttext_dict = load_word_vec(
        'data/fasttext_fin_model_50.vec')  #word embedding from fasttxt
    word_vec_word2vec_dict = load_word_vec(
        'data/word2vec.txt')  #word embedding from word2vec
    #word2vec.word2vec('/Users/test/PycharmProjects/question_answering_similarity/data/atec_additional_cropus.txt',
    #                  '/Users/test/PycharmProjects/question_answering_similarity/data/word2vec_fin.bin', size=50, verbose=True,kind='txt')
    #print("word_vec_word2vec_dict:",word_vec_word2vec_dict)
    tfidf_dict = load_tfidf_dict('data/atec_nl_sim_tfidf.txt')

    for i, row in enumerate(
            spamreader
    ):  ##row: ['\ufeff1', '\ufeff怎么更改花呗手机号码', '我的花呗是以前的手机号码,怎么更改成现在的支付宝的号码手机号', '1'], i.e. (id, question 1, question 2, label): "How do I change the phone number on my Huabei account?" vs. "My Huabei uses my old phone number; how do I change it to my current Alipay number?"
        x1_list = token_string_as_list(row[1], tokenize_style=tokenize_style)
        x1 = [vocab_word2index.get(x, UNK_ID) for x in x1_list]
        x2_list = token_string_as_list(row[2], tokenize_style=tokenize_style)
        x2 = [vocab_word2index.get(x, UNK_ID) for x in x2_list]
        #add blue score features 2018-05-06
        features_vector = data_mining_features(i,
                                               row[1],
                                               row[2],
                                               vocab_word2index,
                                               word_vec_fasttext_dict,
                                               word_vec_word2vec_dict,
                                               tfidf_dict,
                                               n_gram=8)
        features_vector = [float(x) for x in features_vector]
        BLUE_SCORES_.append(features_vector)
        y_ = row[3]
        y = vocab_label2index[y_]
        X1_.append(x1)
        X2_.append(x2)
        Y_.append(y)

        if i == 0 or i == 1 or i == 2:
            print(i, "row[1]:", row[1], ";x1:")
            print(row[1].decode("utf-8"))
            print(i, "row[2]:", row[2], ";x2:")
            print(row[2].decode("utf-8"))
            print(i, "row[3]:", row[3], ";y:", str(y))
            print(i, "row[4].feature vectors:", features_vector)

    number_examples = len(Y_)

    #shuffle
    X1 = []
    X2 = []
    Y = []
    BLUE_SCORES = []
    permutation = np.random.permutation(number_examples)
    for index in permutation:
        X1.append(X1_[index])
        X2.append(X2_[index])
        Y.append(Y_[index])
        BLUE_SCORES.append(BLUE_SCORES_[index])

    X1 = pad_sequences(X1, maxlen=sentence_len,
                       value=0.)  # padding to max length
    X2 = pad_sequences(X2, maxlen=sentence_len,
                       value=0.)  # padding to max length
    valid_number = min(1600, int((1 - training_portion) * number_examples))
    test_number = 800
    training_number = number_examples - valid_number - test_number
    valid_end = training_number + valid_number
    print(";training_number:", training_number, "valid_number:", valid_number,
          ";test_number:", test_number)
    #generate more training data, while still keep data distribution for valid and test.
    X1_final, X2_final, BLUE_SCORE_final, Y_final, training_number_big = get_training_data(
        X1[0:training_number], X2[0:training_number],
        BLUE_SCORES[0:training_number], Y[0:training_number], training_number)
    train = (X1_final, X2_final, BLUE_SCORE_final, Y_final)
    valid = (X1[training_number + 1:valid_end],
             X2[training_number + 1:valid_end],
             BLUE_SCORES[training_number + 1:valid_end],
             Y[training_number + 1:valid_end])
    test = (X1[valid_end:], X2[valid_end:], BLUE_SCORES[valid_end:],
            Y[valid_end:])

    true_label_numbers = len([y for y in Y if y == 1])
    true_label_pert = float(true_label_numbers) / float(number_examples)

    #save train/valid/test/true_label_pert to file system as cache
    #only write the cache file if it does not exist yet (pickle).
    if not os.path.exists(cache_file):
        with open(cache_file, 'ab') as data_f:
            print("going to dump train/valid/test data to file system.")
            pickle.dump((train, valid, test, true_label_pert), data_f)
    return train, valid, test, true_label_pert
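The loader above wraps its work in a simple pickle cache: return early if the cache file exists, otherwise compute, dump and return. A generic version of that pattern (hypothetical helper, sketch only; note the original writes with mode 'ab', while 'wb' is the more usual choice for a fresh cache):

import os
import pickle

def cached(cache_file, compute):
    """Return the pickled result from cache_file if present; otherwise compute, cache and return it."""
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            return pickle.load(f)
    result = compute()
    cache_dir = os.path.dirname(cache_file)
    if cache_dir and not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    with open(cache_file, 'wb') as f:
        pickle.dump(result, f)
    return result

# usage: train, valid, test, true_label_pert = cached('cache_scope/train_valid_test.pik', build_splits)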
Ejemplo n.º 51
0
def main(_):
    # load data
    trainX, trainY, testX, testY = None, None, None, None
    vocabulary_word2index, vocabulary_index2word = create_voabulary(
        word2vec_vocabulary_path='../../utils/dump/vocabulary',
        name_scope="TextCNN")
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:", vocab_size)

    vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
        regression_flag=FLAGS.regression_flag,
        vocabulary_label='../../input/tourist.zh.train.txt',
        name_scope="TextCNN")
    # train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label, using_kfold=True,
    #                                           training_data_path='../../input/tourist.zh.train.txt',
    #                                           multi_label_flag=FLAGS.multi_label_flag)
    kf_id = -1
    for train_X, train_y, valid_X, valid_y, ID in load_data_multilabel_new(
            vocabulary_word2index,
            vocabulary_word2index_label,
            using_kfold=True,
            training_data_path='../../input/tourist.zh.train.txt',
            multi_label_flag=FLAGS.multi_label_flag):
        kf_id += 1
        if kf_id != 4: continue
        trainX, trainY = train_X, train_y
        testX, testY = valid_X, valid_y
        print('hello', ID)

        # 2. Data preprocessing.Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sentence_len,
                               value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                              value=0.)  # padding to max length
        print("trainX[0]:", trainX[0])

        # Converting labels to binary vectors
        print("end padding & transform to one hot...")

        #2.create session.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            #Instantiate Model
            textCNN = TextCNN(filter_sizes,
                              FLAGS.num_filters,
                              FLAGS.num_classes,
                              FLAGS.learning_rate,
                              FLAGS.batch_size,
                              FLAGS.decay_steps,
                              FLAGS.decay_rate,
                              FLAGS.sentence_len,
                              vocab_size,
                              FLAGS.embed_size,
                              FLAGS.is_training,
                              regression_flag=FLAGS.regression_flag,
                              multi_label_flag=FLAGS.multi_label_flag)

            train_write = tf.summary.FileWriter('log/train_{}'.format(kf_id),
                                                sess.graph)
            test_write = tf.summary.FileWriter('log/test_{}'.format(kf_id))
            merged = tf.summary.merge_all()

            #Initialize Save
            saver = tf.train.Saver()
            if os.path.exists(FLAGS.ckpt_dir + str(kf_id) + "/" +
                              "checkpoint_{}".format(kf_id)):
                print("Restoring Variables from Checkpoint")
                saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            else:
                print('Initializing Variables')
                sess.run(tf.global_variables_initializer())
                if FLAGS.use_embedding:  #load pre-trained word embedding
                    assign_pretrained_word_embedding(
                        sess,
                        vocabulary_index2word,
                        vocab_size,
                        textCNN,
                        word2vec_model_path=FLAGS.word2vec_model_path)

            curr_epoch = sess.run(textCNN.epoch_step)
            # 3.feed data & training
            number_of_training_data = len(trainX)
            batch_size = FLAGS.batch_size
            index = 0
            for epoch in range(curr_epoch, FLAGS.num_epochs):
                loss, acc, counter = 0.0, 0.0, 0
                # iterate over the training data in batches
                for start, end in zip(
                        range(0, number_of_training_data, batch_size),
                        range(batch_size, number_of_training_data,
                              batch_size)):
                    if epoch == 0 and counter == 0:
                        print(
                            "trainX[start:end]:", trainX[start:end]
                        )  #;print("trainY[start:end]:",trainY[start:end])
                    feed_dict = {
                        textCNN.input_x: trainX[start:end],
                        textCNN.dropout_keep_prob: 0.5
                    }
                    if not FLAGS.multi_label_flag:
                        feed_dict[textCNN.input_y] = trainY[start:end]
                    else:
                        feed_dict[
                            textCNN.input_y_multilabel] = trainY[start:end]
                    summary, curr_loss, curr_acc, _ = sess.run([
                        merged, textCNN.loss_val, textCNN.accuracy,
                        textCNN.train_op
                    ], feed_dict)  #curr_acc--->TextCNN.accuracy
                    loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                    if counter % 50 == 0:
                        print(
                            "Epoch %d\tBatch %d\tTrain Loss:%.5f\tTrain Accuracy:%.5f"
                            % (epoch, counter, loss / float(counter),
                               acc / float(counter))
                        )  # Train Accuracy -> acc/float(counter)
                    train_write.add_summary(summary, index)
                    index += 1
                #epoch increment
                print("going to increment epoch counter....")
                sess.run(textCNN.epoch_increment)

                # 4.validation
                print(epoch, FLAGS.validate_every,
                      (epoch % FLAGS.validate_every == 0))
                if epoch % FLAGS.validate_every == 0:
                    eval_loss, eval_acc = do_eval(sess, merged, test_write,
                                                  index, textCNN, testX, testY,
                                                  batch_size,
                                                  vocabulary_index2word_label)
                    print(
                        "Epoch %d Validation Loss:%.5f\tValidation Accuracy: %.5f"
                        % (epoch, eval_loss, eval_acc))
                    #save model to checkpoint
                    save_path = FLAGS.ckpt_dir + str(
                        kf_id) + "/" + "model.ckpt"
                    saver.save(sess, save_path, global_step=epoch)

            # 5. Finally, evaluate on the test set and report test accuracy
            # test_loss, test_acc = do_eval(sess, merged,test_write,epoch, textCNN, testX, testY, batch_size, vocabulary_index2word_label)
            # print("Validation Loss:%.5f\tValidation Accuracy: %.5f" % (test_loss, test_acc))

            # 6. Custom evaluation metric
            self_acc = _eval(sess,
                             textCNN,
                             testX,
                             testY,
                             vocabulary_index2word_label,
                             ID=ID,
                             kf_id=kf_id,
                             regression_flag=FLAGS.regression_flag,
                             namse_scope='demon')

        train_write.close()
    pass
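Fold selection aside (the loop above only trains the fold with kf_id == 4), the distinctive part of this variant is per-fold TensorBoard logging through tf.summary. A condensed TF1-style sketch of that pattern (illustrative only, toy loss values):

import tensorflow as tf  # TF1.x API, as in the example above

loss_ph = tf.placeholder(tf.float32, name='loss')
tf.summary.scalar('train_loss', loss_ph)
merged = tf.summary.merge_all()

with tf.Session() as sess:
    writer = tf.summary.FileWriter('log/train_0', sess.graph)
    for step, loss_value in enumerate([1.2, 0.9, 0.7]):
        summary = sess.run(merged, feed_dict={loss_ph: loss_value})
        writer.add_summary(summary, step)  # one point per batch, like the incrementing index above
    writer.close()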
def main(_):
    X_train, X_val, y_train, y_val, n_classes = train_test_loader()
    with open('data/vocab.dic', 'rb') as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab) + 1
    print('size of vocabulary: {}'.format(vocab_size))

    # padding sentences
    X_train = pad_sequences(X_train, maxlen=FLAGS.sentence_len, value=float(vocab_size - 1))
    X_val = pad_sequences(X_val, maxlen=FLAGS.sentence_len, value=float(vocab_size - 1))
    # convert label to one-hot encode
    # to_categorical(y_train, n_classes)
    # to_categorical(y_val, n_classes)

    # create session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        textrcnn = TextRCNN(FLAGS.num_classes,
                FLAGS.learning_rate, FLAGS.batch_size,
                FLAGS.decay_steps, FLAGS.decay_rate, FLAGS.sentence_len,
                vocab_size, FLAGS.embed_size, FLAGS.is_training)
        # Initialize save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + 'checkpoint'):
            print('restoring variables from checkpoint')
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:
                assign_pretrained_word_embedding(sess, vocab, vocab_size, textrcnn)
        curr_epoch = sess.run(textrcnn.epoch_step)

        # feed data and training
        number_of_training_data = len(X_train)
        batch_size = FLAGS.batch_size
        best_val_acc = 0.0
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = .0, .0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 or counter == 0:
                    pass
                    # print('X_train[start:end]: {}'.format(X_train[start:end]))
                feed_dict = {
                        textrcnn.input_x: X_train[start:end],
                        textrcnn.dropout_keep_prob: 0.5}
                if not FLAGS.multi_label_flag:
                    feed_dict[textrcnn.input_y] = y_train[start:end]
                else:
                    feed_dict[textrcnn.input_y_multilabel] = y_train[start:end]
                curr_loss, curr_acc, _ = sess.run(
                        [textrcnn.loss_val, textrcnn.accuracy, textrcnn.train_op],
                        feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc

                if counter % 50 == 0:
                    print('Epoch {}\tBatch {}\tTrain Loss {}\tTrain Accuracy {}'\
                            .format(
                                epoch, counter,
                                loss / float(counter),
                                acc / float(counter)))
            print('going to increment epoch counter ...')
            sess.run(textrcnn.epoch_increment)

            # validation
            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_acc = do_eval(
                        sess, textrcnn, X_val, y_val, batch_size)
                print("Epoch {} Validation Loss: {}\tValidation Accuracy: {}"\
                        .format(epoch, eval_loss, eval_acc))
                if eval_acc > best_val_acc:
                    best_val_acc = eval_acc
                    # save model to checkpoint
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    saver.save(sess, save_path, global_step=epoch)
                else:
                    saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
                    break

        # report result
        test_loss, test_acc = do_eval(sess, textrcnn, X_val, y_val, batch_size)
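This variant uses a simpler stopping rule than the loss-plateau policy above: checkpoint whenever validation accuracy improves, otherwise restore the best checkpoint and stop. A framework-free sketch (hypothetical callbacks, toy accuracies):

def early_stopping_loop(epoch_val_accuracies, save, restore_best):
    """Stop the first time validation accuracy fails to improve; return the best accuracy."""
    best_val_acc = 0.0
    for epoch, val_acc in enumerate(epoch_val_accuracies):
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            save(epoch)       # keep the improved model
        else:
            restore_best()    # roll back to the last saved checkpoint
            break
    return best_val_acc

print(early_stopping_loop([0.71, 0.75, 0.74, 0.80],
                          save=lambda epoch: None, restore_best=lambda: None))  # 0.75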
Ejemplo n.º 53
0
    name_scope="cnn2")
vocab_size = len(vocabulary_word2index)
vocabulary_word2index_label, vocabulary_index2word_label = create_voabulary_label(
    name_scope="cnn2")
questionid_question_lists = load_final_test_data(FLAGS.predict_source_file)
test = load_data_predict(vocabulary_word2index, vocabulary_word2index_label,
                         questionid_question_lists)
testX = []
question_id_list = []
for tuple in test:
    question_id, question_string_list = tuple
    question_id_list.append(question_id)
    testX.append(question_string_list)
# 2.Data preprocessing: Sequence padding
print("start padding....")
testX2 = pad_sequences(testX, maxlen=FLAGS.sentence_len,
                       value=0.)  # padding to max length
print("end padding...")
# 3.create session.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
graph = tf.Graph().as_default()
global sess
global textCNN
with graph:
    sess = tf.Session(config=config)
    # 4.Instantiate Model
    textCNN = TextCNN(filter_sizes, FLAGS.num_filters, FLAGS.num_classes,
                      FLAGS.learning_rate, FLAGS.batch_size, FLAGS.decay_steps,
                      FLAGS.decay_rate, FLAGS.sentence_len, vocab_size,
                      FLAGS.embed_size, FLAGS.is_training)
    saver = tf.train.Saver()
Ejemplo n.º 54
0
"""
from __future__ import division, print_function, absolute_import

import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool, max_pool_1d
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from data.data_glass import *
trainX, trainY, testX, testY = getGlassData()

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=10, value=0.)
testX = pad_sequences(testX, maxlen=10, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, 6)
testY = to_categorical(testY, 6)

network = input_data(shape=[None, 10], name='input')
network = tflearn.embedding(network, input_dim=1000, output_dim=128)
branch1 = conv_1d(network,
                  128,
                  3,
                  padding='valid',
                  activation='relu',
                  regularizer="L2")
branch1 = max_pool_1d(branch1, 2)
branch2 = conv_1d(network,
Ejemplo n.º 55
0
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from tflearn.data_utils import to_categorical, pad_sequences

data = keras.datasets.imdb


# IMDB load dataset
train, test = data.load_data(num_words=10000) # keep only the 10,000 most frequent words; restricting the vocabulary helps prevent overfitting
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100) # pad_sequences converts each review into a fixed-length row of a matrix, padding with 0s; a uniform length keeps the input dimensionality consistent.
testX = pad_sequences(testX, maxlen=100)   # note: pad testX, not trainX
# maxlen=100 pads/truncates each review to 100 words; raising it to 200 or 256 may improve accuracy.

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2) # nb_classes=2 whether it is positive or negative
testY = to_categorical(testY, nb_classes=2)

# create neural network
model = Sequential()
model.add(Embedding(10000, 16)) # maps each word id (0..9999) to a dense 16-dimensional vector,
                                # e.g. id 7 becomes something like [0.2, -0.3, ..., 0.4]
model.add(GlobalAveragePooling1D()) # averages the embedding vectors over the sequence, collapsing it to a single 16-d vector
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='sigmoid'))
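As the comments above describe, pad_sequences turns variable-length reviews into a fixed-width matrix by padding with zeros. A quick standalone illustration (the padding side is passed explicitly, since defaults differ between libraries):

from tflearn.data_utils import pad_sequences

reviews = [[11, 7, 35], [2, 4]]  # two tokenized reviews of different lengths
padded = pad_sequences(reviews, maxlen=5, value=0, padding='post')
print(padded)
# [[11  7 35  0  0]
#  [ 2  4  0  0  0]]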
Ejemplo n.º 56
0
def main(_):
    #1. load data (X: list of int, y: int).
    #if os.path.exists(FLAGS.cache_path):  # if the cache exists on the file system, load the (vocabulary-indexed) data
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1 == 1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_vocabulary(
            word2vec_model_path=FLAGS.word2vec_model_path,
            name_scope="transformer_classification")
        vocab_size = len(vocabulary_word2index)
        print("transformer.vocab_size:", vocab_size)
        train, test, _ = load_data_multilabel_new(
            vocabulary_word2index, training_data_path=FLAGS.training_data_path)

        compare_train_data = WikiQA(word2vec=Word2Vec(),
                                    max_len=FLAGS.max_len_compare)
        compare_train_data.open_file(mode="train")
        compare_test_data = WikiQA(word2vec=Word2Vec(),
                                   max_len=FLAGS.max_len_compare)
        compare_test_data.open_file(mode="valid")

        trainX, trainY, = train
        testX, testY = test

        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = Transformer(FLAGS.num_classes,
                            FLAGS.learning_rate,
                            FLAGS.batch_size,
                            FLAGS.decay_steps,
                            FLAGS.decay_rate,
                            FLAGS.sequence_length,
                            vocab_size,
                            FLAGS.embed_size,
                            FLAGS.d_model,
                            FLAGS.d_k,
                            FLAGS.d_v,
                            FLAGS.h,
                            FLAGS.num_layer,
                            FLAGS.is_training,
                            compare_train_data.num_features,
                            di=50,
                            s=compare_train_data.max_len,
                            w=4,
                            l2_reg=0.0004,
                            l2_lambda=FLAGS.l2_lambda)
        print("=" * 50)
        print("List of Variables:")
        for v in tf.trainable_variables():
            print(v.name)
        print("=" * 50)
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  #load pre-trained word embedding
                assign_pretrained_word_embedding(
                    sess,
                    vocabulary_index2word,
                    vocab_size,
                    model,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(model.epoch_step)
        number_of_training_data = len(trainX)
        print("number_of_training_data:", number_of_training_data)

        previous_eval_loss = 10000
        best_eval_loss = 10000
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            compare_train_data.reset_index()
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:", trainX[start:end])
                batch_x1, batch_x2, _, batch_features = compare_train_data.next_batch(
                    batch_size=end - start)
                feed_dict = {
                    model.input_x: trainX[start:end],
                    model.dropout_keep_prob: 0.9,
                    model.x1: batch_x1,
                    model.x2: batch_x2,
                    model.features: batch_features
                }
                feed_dict[model.input_y_label] = trainY[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [model.loss_val, model.accuracy, model.train_op],
                    feed_dict)  #curr_acc--->TextCNN.accuracy
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print(
                        "transformer.classification==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, loss / float(counter),
                           acc / float(counter))
                    )  # Train Accuracy -> acc/float(counter)
                ##VALIDATION VALIDATION VALIDATION PART######################################################################################################
                if FLAGS.batch_size != 0 and (
                        start % (FLAGS.validate_step * FLAGS.batch_size) == 0):
                    eval_loss, eval_acc = do_eval(sess, model, testX, testY,
                                                  compare_test_data,
                                                  batch_size)
                    print(
                        "transformer.classification.validation.part. previous_eval_loss:",
                        previous_eval_loss, ";current_eval_loss:", eval_loss)
                    if eval_loss > previous_eval_loss:  #if loss is not decreasing
                        # reduce the learning rate by a factor of 0.5
                        print(
                            "transformer.classification.==>validation.part.going to reduce the learning rate."
                        )
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr = sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print(
                            "transformer.classification==>validation.part.learning_rate1:",
                            learning_rate1, " ;learning_rate2:",
                            learning_rate2)
                    #print("HierAtten==>Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))
                    else:  # loss is decreasing
                        if eval_loss < best_eval_loss:
                            print(
                                "transformer.classification==>going to save the model.eval_loss:",
                                eval_loss, ";best_eval_loss:", best_eval_loss)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss = eval_loss
                    previous_eval_loss = eval_loss
                    compare_test_data.reset_index()
                ##VALIDATION VALIDATION VALIDATION PART######################################################################################################

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)
Ejemplo n.º 57
0
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y,
    test_size=0.2, random_state=2017)

Y_train = to_categorical (Y_train, nb_classes = len (qualities))
Y_test = to_categorical (Y_test, nb_classes = len (qualities))

### Process vocabulary

print('Process vocabulary')

vocab_processor = tflearn.data_utils.VocabularyProcessor(max_document_length = model_size, min_frequency = 0)
X_train = np.array(list(vocab_processor.fit_transform(X_train)))
X_test = np.array(list(vocab_processor.transform(X_test)))  # transform only: reuse the vocabulary fitted on the training split

X_train = pad_sequences(X_train, maxlen=model_size, value=0.)
X_test = pad_sequences(X_test, maxlen=model_size, value=0.)

n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

# pickle.dump (X_train, open ("xtrain.p", b))
# pickle.dump (X_test, open ("xtest.p", b))

# X_train = pickle.load (open ("xtrain.p", rb))
# X_test = pickle.load (open ("xtest.p", rb))

### Models

print('Build model')
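Note that the VocabularyProcessor should be fitted on the training split only, with the test split merely transformed, so both splits share one word-to-id mapping. A toy illustration of the difference (made-up sentences):

import numpy as np
from tflearn.data_utils import VocabularyProcessor

vocab_processor = VocabularyProcessor(max_document_length=4)
X_train = np.array(list(vocab_processor.fit_transform(['good movie', 'bad plot'])))  # learns the vocabulary
X_test = np.array(list(vocab_processor.transform(['good plot'])))                    # reuses the learned ids
print(X_train)
print(X_test)  # 'good' and 'plot' keep the ids assigned during fitting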
Ejemplo n.º 58
0
def load_data(data_path,vocab_role_1st_label2index,
              vocab_role_2nd_label2index,vocab_func_label2index,
              vocab_word2index,vocab_char2index,vocab_pos2index,
              vocab_cap2index,sentence_len,word_len,flag_use_char,
              flag_use_pos,flag_use_cap):
    """
    :param data_path:
    :param vocab_role_1st_label2index:
    :param vocab_role_2nd_label2index:
    :param vocab_func_label2index:
    :param vocab_word2index:
    :param vocab_char2index:
    :param vocab_pos2index:
    :param vocab_cap2index:
    :param sentence_len:
    :param word_len:
    :param flag_use_char:
    :param flag_use_pos:
    :param flag_use_cap:
    :return:X: [ word_sequence, char_sequence, pos_sequence, cap_sequence]
                - word_sequence: sentence_len
                - char_sequence: sentence_len * word_len
                - pos_sequence: sentence_len
                - cap_sequence: sentence_len
    """
    data_file = codecs.open(data_path, mode='r', encoding='utf-8')
    data_lines = data_file.readlines()
    random.shuffle(data_lines)
    # build data samples:
    link_Y = []
    Word_sequences = []
    Char_sequences = []
    Pos_sequences = []
    Cap_sequences = []
    role_1st_labels = []
    role_2nd_labels = []
    func_labels = []
    for i, line in enumerate(data_lines):
        #raw_list = line.strip().split("\t")
        #print("====="*15)
        #print(raw_list)
        #print(raw_list[0])
        #print(raw_list[1])
        link_index, raw_list = int(i), line
        raw_list = raw_list.strip().split("__label__")
        input_list = raw_list[0].strip().split(" ")
        label_list = raw_list[1].split('|')
        # get labels
        link_Y.append(link_index)
        role_1st_label = vocab_role_1st_label2index[label_list[0]]
        # print("====="*15)
        # print(label_list[0])
        # print(role_1st_label)
        # exit()
        role_2nd_label = vocab_role_2nd_label2index[label_list[1]]
        func_label = vocab_func_label2index[label_list[2]]
        role_1st_labels.append(role_1st_label)
        role_2nd_labels.append(role_2nd_label)
        func_labels.append(func_label)
        # get word lists
        word_sequence = [vocab_word2index.get(x, UNK_ID) for x in input_list]
        #print(word_sequence)
        #exit()
        Word_sequences.append(word_sequence)
        # get char lists
        if flag_use_char:
            char_sequence = [] # [sentence_len, word_len]
            for word in input_list:
                char_indexs = [vocab_char2index.get(char, UNK_ID) for char in word]
                char_sequence.append(char_indexs)
            if len(input_list) < sentence_len:
                char_sequence.extend( [[0]] * (sentence_len-len(input_list)))
            else:
                char_sequence = char_sequence[:sentence_len]
            #print(input_list)
            #print(char_sequence)
            char_sequence = pad_sequences(char_sequence, maxlen=word_len, value=0.)
            #print(char_sequence[0])
            #print(char_sequence)
            #exit()
            Char_sequences.append(char_sequence)
        if flag_use_pos:
            pos_sequence = nltk.pos_tag(input_list) # [sentence_len]
            word_seq, pos_seq = zip(*pos_sequence)
            pos_sequence = list(pos_seq)
            pos_sequence = [vocab_pos2index.get(pos, UNK_ID) for pos in pos_sequence]
            Pos_sequences.append(pos_sequence)
        if flag_use_cap:
            cap_sequence = [word_capitalize(word) for word in input_list]
            cap_sequence = [vocab_cap2index[cap] for cap in cap_sequence]
            Cap_sequences.append(cap_sequence)

    Word_sequences = pad_sequences(Word_sequences, maxlen=sentence_len, value=0.)
    #print(Word_sequences)
    #exit()
    if flag_use_pos:
        Pos_sequences = pad_sequences(Pos_sequences, maxlen=sentence_len, value=0.)
    if flag_use_cap:
        Cap_sequences = pad_sequences(Cap_sequences, maxlen=sentence_len, value=0.)
    X = {'word':np.array(Word_sequences), 'char':np.array(Char_sequences), 'pos':np.array(Pos_sequences), 'cap':np.array(Cap_sequences),
         'role_1st':role_1st_labels, 'role_2nd':role_2nd_labels, 'func':func_labels}
    return (X, np.array(link_Y))
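The character branch above pads on two levels: the list of words is first padded/truncated to sentence_len (using [0] placeholders), then every word's character ids are padded to word_len, giving one [sentence_len, word_len] matrix per sentence. A small standalone sketch of the resulting shape (toy ids):

from tflearn.data_utils import pad_sequences

sentence_len, word_len = 4, 3
char_sequence = [[5, 9], [7, 2, 8, 1]]                              # char ids for a 2-word sentence
char_sequence.extend([[0]] * (sentence_len - len(char_sequence)))   # pad the sentence dimension
char_matrix = pad_sequences(char_sequence, maxlen=word_len, value=0.)
print(char_matrix.shape)  # (4, 3) -> [sentence_len, word_len]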
def main(_):
    #1. load data (X: list of int, y: int).
    #if os.path.exists(FLAGS.cache_path):  # if the cache exists on the file system, load the (vocabulary-indexed) data
    #    with open(FLAGS.cache_path, 'r') as data_f:
    #        trainX, trainY, testX, testY, vocabulary_index2word=pickle.load(data_f)
    #        vocab_size=len(vocabulary_index2word)
    #else:
    if 1==1:
        trainX, trainY, testX, testY = None, None, None, None
        vocabulary_word2index, vocabulary_index2word = create_voabulary(word2vec_model_path=FLAGS.word2vec_model_path,name_scope="rcnn") #simple='simple'
        vocab_size = len(vocabulary_word2index)
        print("cnn_model.vocab_size:",vocab_size)
        vocabulary_word2index_label,vocabulary_index2word_label = create_voabulary_label(name_scope="rcnn")
        if FLAGS.multi_label_flag:
            FLAGS.traning_data_path='training-data/train-zhihu6-title-desc.txt' #test-zhihu5-only-title-multilabel.txt
        train, test, _ = load_data_multilabel_new(vocabulary_word2index, vocabulary_word2index_label,multi_label_flag=FLAGS.multi_label_flag,traning_data_path=FLAGS.traning_data_path) #,traning_data_path=FLAGS.traning_data_path
        trainX, trainY = train
        testX, testY = test
        # 2.Data preprocessing.Sequence padding
        print("start padding & transform to one hot...")
        trainX = pad_sequences(trainX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        testX = pad_sequences(testX, maxlen=FLAGS.sequence_length, value=0.)  # padding to max length
        #with open(FLAGS.cache_path, 'w') as data_f: #save data to cache file, so we can use it next time quickly.
        #    pickle.dump((trainX,trainY,testX,testY,vocabulary_index2word),data_f)
        print("trainX[0]:", trainX[0]) #;print("trainY[0]:", trainY[0])
        # Converting labels to binary vectors
        print("end padding & transform to one hot...")
    #2.create session.
    config=tf.ConfigProto()
    config.gpu_options.allow_growth=True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        textRCNN=TextRCNN(FLAGS.num_classes, FLAGS.learning_rate, FLAGS.decay_steps, FLAGS.decay_rate,FLAGS.sequence_length,
                 vocab_size,FLAGS.embed_size,FLAGS.is_training,FLAGS.batch_size,multi_label_flag=FLAGS.multi_label_flag)
        #Initialize Save
        saver=tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir+"checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess,tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding: #load pre-trained word embedding
                assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size, textRCNN,word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch=sess.run(textRCNN.epoch_step)
        #3.feed data & training
        number_of_training_data=len(trainX)
        batch_size=FLAGS.batch_size
        for epoch in range(curr_epoch,FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size),range(batch_size, number_of_training_data, batch_size)):
                if epoch==0 and counter==0:
                    print("trainX[start:end]:",trainX[start:end])#;print("trainY[start:end]:",trainY[start:end])
                feed_dict = {textRCNN.input_x: trainX[start:end],textRCNN.dropout_keep_prob: 0.5}
                if not FLAGS.multi_label_flag:
                    feed_dict[textRCNN.input_y] = trainY[start:end]
                else:
                    feed_dict[textRCNN.input_y_multilabel]=trainY[start:end]
                curr_loss,curr_acc,_=sess.run([textRCNN.loss_val,textRCNN.accuracy,textRCNN.train_op],feed_dict) #curr_acc--->TextCNN.accuracy
                loss,counter,acc=loss+curr_loss,counter+1,acc+curr_acc
                if counter %50==0:
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" %(epoch,counter,loss/float(counter),acc/float(counter))) #tTrain Accuracy:%.3f---》acc/float(counter)

            #epoch increment
            print("going to increment epoch counter....")
            sess.run(textRCNN.epoch_increment)

            # 4.validation
            print(epoch,FLAGS.validate_every,(epoch % FLAGS.validate_every==0))
            if epoch % FLAGS.validate_every==0:
                eval_loss, eval_acc=do_eval(sess,textRCNN,testX,testY,batch_size,vocabulary_index2word_label)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch,eval_loss,eval_acc))
                #save model to checkpoint
                save_path=FLAGS.ckpt_dir+"model.ckpt"
                saver.save(sess,save_path,global_step=epoch)

        # 5. Finally, evaluate on the test set and report test accuracy
        test_loss, test_acc = do_eval(sess, textRCNN, testX, testY, batch_size,vocabulary_index2word_label)
    pass
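do_eval is called by every training loop in these examples but its body is not shown in any of the excerpts. Below is a plausible minimal sketch consistent with how it is called and with what it returns (average loss and accuracy over the evaluation set); the placeholder names follow the TextCNN/TextRCNN variants above, and this is an assumption, not the original implementation:

def do_eval(sess, model, evalX, evalY, batch_size, vocabulary_index2word_label=None):
    """Average loss and accuracy over the evaluation set, one mini-batch at a time (sketch only)."""
    number_examples = len(evalX)
    eval_loss, eval_acc, eval_counter = 0.0, 0.0, 0
    for start, end in zip(range(0, number_examples, batch_size),
                          range(batch_size, number_examples, batch_size)):
        feed_dict = {model.input_x: evalX[start:end],
                     model.input_y: evalY[start:end],
                     model.dropout_keep_prob: 1.0}  # no dropout at evaluation time
        curr_loss, curr_acc = sess.run([model.loss_val, model.accuracy], feed_dict)
        eval_loss, eval_acc, eval_counter = eval_loss + curr_loss, eval_acc + curr_acc, eval_counter + 1
    return eval_loss / float(max(eval_counter, 1)), eval_acc / float(max(eval_counter, 1))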