Example #1
    def __init__(self):
        # prepare_data preprocesses the corpus under "data" and returns paths
        # to the tokenized train/dev sentence and slot files plus the two
        # vocabulary files.
        (self.sentence_train, self.slot_train, self.sentence_dev,
         self.slot_dev, self.vocab_sentence,
         self.vocab_slot) = data_helper.prepare_data(
            "data",
            sentence_training_file,
            slot_training_file,
            sentence_developing_file,
            slot_developing_file,
            from_vocabulary_size=2000,
            to_vocabulary_size=2000,
            tokenizer=None)
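The six returned values appear to be file paths rather than in-memory data (Example #3 below re-reads them with read_data). A minimal follow-up, assuming data_helper also exposes read_data and initialize_vocabulary as in Example #3, might look like:

        # Read the id-encoded sentences/slots and build the word <-> id maps.
        sentence_training, slot_training = data_helper.read_data(
            self.sentence_train, self.slot_train, max_size=None)
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            self.vocab_sentence)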
Example #2
import os

import torch
import torch.optim as optim


def main():
    train_data = [data1, data2, data3, data4]
    test_data = [data5]
    lang = Lang()
    arg = ARG()
    lang.insert_data(train_data)
    # Encode characters to ids using the vocabulary built above.
    train_data_all = prepare_data(train_data, lang.char2idx, arg)
    test_data_all = prepare_data(test_data, lang.char2idx, arg)
    # print(lang.char2idx)

    # prepare for training
    model = RNNReader(arg, len(lang.char2idx))
    optimizer = optim.SGD(model.parameters(), lr=1e-4)
    # train the model, or reload a previously trained one
    model_file = 'model_1.pkl'
    if os.path.exists(model_file):
        model = torch.load(model_file)
    else:
        train(model, optimizer, train_data_all, test_data_all, test_data)
        torch.save(model, model_file)
    predict(model, test_data_all, test_data)
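Serializing the whole model object with torch.save(model, ...), as above, ties the checkpoint to the exact RNNReader class definition. A sketch of the more portable state-dict pattern (same hypothetical names) would be:

    # Save only the parameter tensors, not the pickled class instance.
    torch.save(model.state_dict(), 'model_1_state.pkl')

    # To reload: rebuild the model, then restore its parameters.
    model = RNNReader(arg, len(lang.char2idx))
    model.load_state_dict(torch.load('model_1_state.pkl'))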
Example #3
    def train(self):
        # Prepare data
        (sentence_train, slot_train, sentence_dev, slot_dev,
         vocab_sentence, vocab_slot) = data_helper.prepare_data(
            "data",
            sentence_training_file,
            slot_training_file,
            sentence_developing_file,
            slot_developing_file,
            from_vocabulary_size=2000,
            to_vocabulary_size=2000,
            tokenizer=None)
        sentence_developing, slot_developing = data_helper.read_data(
            sentence_dev, slot_dev, max_size=None)
        sentence_training, slot_training = data_helper.read_data(
            sentence_train, slot_train, max_size=None)

        # TODO: optionally truncate the training set for quick debugging:
        # sentence_training, slot_training = (sentence_training[:1000],
        #                                     slot_training[:1000])

        # Dictionaries: initialize_vocabulary returns word -> id and
        # id -> word mappings; the id -> word entries are bytes and are
        # decoded back to UTF-8 strings below.
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

        # For conlleval script
        words_train = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_training
        ]
        labels_train = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_training
        ]
        words_val = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_developing
        ]
        labels_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_developing
        ]

        # Define model
        n_vocab = len(w2id_sentence)
        n_classes = len(w2id_slot)

        #model = Sequential()
        #model.add(Embedding(n_vocab,100))
        #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
        #model.add(Dropout(0.25))
        #model.add(GRU(100,return_sequences=True))
        #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        #model.compile('rmsprop', 'categorical_crossentropy')

        ## Training
        ##n_epochs = 30
        #n_epochs = 1

        train_f_scores = []
        val_f_scores = []
        best_val_f1 = 0

        #print("Training =>")
        #train_pred_label = []
        #avgLoss = 0

        #for i in range(n_epochs):
        #    print("Training epoch {}".format(i))

        #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
        #    for n_batch, sent in bar(enumerate(sentence_training)):
        #        label = slot_training[n_batch]
        #        # Make labels one hot
        #        label = np.eye(n_classes)[label][np.newaxis, :]
        #        # View each sentence as a batch
        #        sent = sent[np.newaxis, :]

        #        if sent.shape[1] > 1: #ignore 1 word sentences
        #            loss = model.train_on_batch(sent, label)
        #            avgLoss += loss

        #        pred = model.predict_on_batch(sent)
        #        pred = np.argmax(pred, -1)[0]
        #        train_pred_label.append(pred)

        #    avgLoss = avgLoss/n_batch

        #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        #                      for y in train_pred_label]
        #    con_dict = conlleval(predword_train, labels_train,
        #                         words_train, 'measure.txt')
        #    train_f_scores.append(con_dict['f1'])
        #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        #        avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
        #    # Save model
        #    model.save(filepath_model)
        #    gc.collect()

        print("Validating =>")
        from keras.models import load_model
        model = load_model(filepath_model)

        labels_pred_val = []
        avgLoss = 0

        bar = progressbar.ProgressBar(max_value=len(sentence_developing))
        for n_batch, sent in bar(enumerate(sentence_developing)):
            label = slot_developing[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]

            if sent.shape[1] > 1:  #some bug in keras
                loss = model.test_on_batch(sent, label)
                avgLoss += loss

            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)

        avgLoss = avgLoss / (n_batch + 1)  # n_batch is a zero-based index
        gc.collect()

        predword_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in labels_pred_val
        ]
        con_dict = conlleval(predword_val, labels_val, words_val,
                             'measure.txt')
        val_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['p'], con_dict['r'], con_dict['f1']))

        if con_dict['f1'] > best_val_f1:
            best_val_f1 = con_dict['f1']
            # Persist the best model: architecture as JSON, weights as HDF5.
            with open('model_architecture.json', 'w') as outf:
                outf.write(model.to_json())
            model.save_weights('best_model_weights.h5', overwrite=True)
            print("Best validation F1 score = {}".format(best_val_f1))
        print()
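Restoring the best checkpoint saved above is the mirror image of that block; a minimal sketch using the same two files (Keras's model_from_json plus load_weights) would be:

        from keras.models import model_from_json

        # Rebuild the architecture from JSON, then restore the best weights.
        with open('model_architecture.json') as f:
            model = model_from_json(f.read())
        model.load_weights('best_model_weights.h5')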
Example #4
    saver = tf.train.Saver(variables_to_restore)
    # List the variables the saver will restore.
    for name in variables_to_restore:
        print(name)

    @log_time_delta
    def predict(model, sess, batch, test):
        scores = []
        for data in batch:
            score = model.predict(sess, data)
            scores.extend(score)
        return np.array(scores[:len(test)])

    text = "怎么 提取 公积金 ?"  # "How do I withdraw my housing provident fund?"

    split_text = data_helper.encode_to_split(text, alphabet)

    # Use the same encoded text for both the question and the answer slots.
    mb_q, mb_q_mask = data_helper.prepare_data([split_text])
    mb_a, mb_a_mask = data_helper.prepare_data([split_text])

    data = (mb_q, mb_a, mb_q_mask, mb_a_mask)
    score = model.predict(sess, data)
    print(score)
    feed_dict = {
        model.question: data[0],
        model.answer: data[1],
        model.q_mask: data[2],
        model.a_mask: data[3],
        model.dropout_keep_prob_holder: 1.0
    }
    # Fetch the position embeddings for the first (only) example in the batch.
    position_embedding = sess.run(model.position_embedding, feed_dict=feed_dict)[0]
    print(position_embedding)
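The tf.train.Saver built at the top of this example is only constructed, never used, in the excerpt. A minimal sketch of actually restoring those variables (the checkpoint directory is hypothetical) would be:

    # Hypothetical checkpoint directory; restores variables_to_restore into sess.
    checkpoint = tf.train.latest_checkpoint('checkpoints/')
    saver.restore(sess, checkpoint)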