Code example #1
def get_data(train_size, random_seed=100):
    # Randomly shuffle the data and split it into train/test sets.

    ip_text = read_data('/tmp/data/text.txt')
    op_text = read_data('/tmp/data/summary.txt')
    logger.info('Length of text: {}'.format(len(ip_text)))

    # Wrap each target summary with start/end-of-sequence tokens ("sos ... eos .").
    op_text = [
        'sos ' + sent[:-1] + ' eos .' if sent.endswith('.')
        else 'sos ' + sent + ' eos .'
        for sent in op_text
    ]

    np.random.seed(random_seed)
    inds = np.arange(len(ip_text))
    np.random.shuffle(inds)

    train_inds = inds[:train_size]
    test_inds = inds[train_size:]
    tr_ip_text = [ip_text[ti] for ti in train_inds]
    tr_op_text = [op_text[ti] for ti in train_inds]

    ts_ip_text = [ip_text[ti] for ti in test_inds]
    ts_op_text = [op_text[ti] for ti in test_inds]

    return tr_ip_text, tr_op_text, ts_ip_text, ts_op_text
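
The read_data helper used above is not part of this snippet. A minimal sketch of what it is assumed to do here, namely return one sentence per line from a UTF-8 text file (the implementation in the original project may differ):

def read_data(path):
    # Hypothetical stand-in: one stripped, non-empty line per sentence.
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]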
Code example #2
File: main.py  Project: lia-git/MedicalClass
 def deal_with_data(self):
     '''
     Preprocess the data; this method can be omitted if there is nothing to do.
     :return:
     '''
     # Load the data
     self.data = pd.read_csv(
         os.path.join(DATA_PATH, 'MedicalClass/train.csv'))
     # Split into training and validation sets
     self.train_data, self.valid_data = train_test_split(self.data,
                                                         test_size=0.01,
                                                         random_state=6,
                                                         shuffle=True)
     self.text2id, _ = load_dict(
         os.path.join(DATA_PATH, 'MedicalClass/words_fr.dict'))
     # self.text2id, _ = load_dict(self.train_data)
     self.label2id, _ = load_labeldict(
         os.path.join(DATA_PATH, 'MedicalClass/label.dict'))
     self.train_text, self.train_label = read_data(self.train_data,
                                                   self.text2id,
                                                   self.label2id)
     self.val_text, self.val_label = read_data(self.valid_data,
                                               self.text2id, self.label2id)
     print('=*= Data preprocessing finished =*=')
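
load_dict and load_labeldict are project-specific helpers that are not shown here. A minimal sketch of the kind of mapping load_dict is assumed to return, namely a token-to-id dictionary plus its inverse built from a hypothetical one-token-per-line vocabulary file (the actual file format in the project may differ):

def load_dict(dict_path):
    # Hypothetical stand-in: one token per line, id = line number.
    with open(dict_path, encoding='utf-8') as f:
        tokens = [line.strip() for line in f if line.strip()]
    token2id = {tok: i for i, tok in enumerate(tokens)}
    id2token = {i: tok for tok, i in token2id.items()}
    return token2id, id2token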
Code example #3
def not_first_predict():
    ############# reading data  #################################################
    logger.info("Starting to read testing samples...")
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(
        args.test_data_file)
    test_orig = pd.DataFrame({
        "question1": test_texts_1,
        "question2": test_texts_2
    })
    logger.info("Finish reading testing samples !")

    ###################### load a CSV file into DMatrix ######################
    # x_test = pd.read_csv(args.x_test_file, header=None, encoding="utf-8", sep="\t")
    # y_test = pd.read_csv(args.y_test_file, header=None, encoding="utf-8", sep="\t")
    # d_test = xgb.DMatrix(x_test, y_test)

    ###################### load a XGBoost binary file into DMatrix ######################

    d_test = xgb.DMatrix(args.dtest_file)

    ###################### do predict ###########################
    bst = xgb.Booster()  # init model
    bst.load_model(args.model_path)  # load model

    p_test = bst.predict(d_test, ntree_limit=args.ntree_limit)

    df_sub = pd.DataFrame({
        'user_query': test_texts_1,
        'candidate_query': test_texts_2,
        'label': test_labels,
        'score': p_test.ravel()
    })
    df_sub.to_csv(args.pred_data_file + str(args.ntree_limit),
                  header=False,
                  index=False,
                  encoding='utf-8',
                  sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])
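
The XGBoost binary behind args.dtest_file must have been produced by an earlier run; this predict-only path simply reloads it. A minimal sketch of that caching step, mirroring the save_binary call in code example #9 (the file names here are placeholders):

import pandas as pd
import xgboost as xgb

x_test = pd.read_csv('x_test.tsv', header=None, encoding='utf-8', sep='\t')  # placeholder feature file
xgb.DMatrix(x_test).save_binary('dtest.buffer')  # later runs: d_test = xgb.DMatrix('dtest.buffer')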
Code example #4
File: train_model.py  Project: plmsmile/NLP-Demos
import torch.optim as optim

import data_helper as dh
from data_helper import get_variable
from model import *
from masked_cross_entropy import *
import show as sh
import train_helper as th

if __name__ == '__main__':
    data_dir = './data'
    en_file = "{}/{}".format(data_dir, "seg_en")
    zh_file = "{}/{}".format(data_dir, "seg_zh")
    TARGET_MAX_LEN = 25
    USE_CUDA = False
    pairs, input_lang, target_lang = dh.read_data(en_file, zh_file, 20000)

    # Model configuration
    encoder_bidir = False
    score_method = 'general'
    hidden_size = 500
    n_layers = 2
    dropout_p = 0.1
    batch_size = 50

    # Training and optimization settings
    clip = 50.0
    teacher_forcing_ratio = 0.5
    learning_rate = 0.0001
    decoder_learning_ratio = 5.0
    n_epochs = 20000
Code example #5
    # NOTE: random.random() < 1 is always True, so teacher forcing is effectively always on;
    # comparing against teacher_forcing_ratio is presumably what was intended.
    use_teacher_forcing = random.random() < 1
    for t in range(max_target_len):
        # output: (batch_size, output_size)
        output, decoder_hidden, attn_weights = decoder(decoder_input,
                                                       decoder_hidden,
                                                       encoder_outputs)
        all_decoder_outputs[t] = output
        # With teacher forcing, feed the ground-truth label; otherwise feed the model's own output.
        if use_teacher_forcing:
            decoder_input = target_batches[t]
        else:
            # Pick the most likely word from the output for each batch element
            words = []
            for b in range(batch_size):
                topv, topi = output[b].data.topk(1)
                words.append(topi)
            decoder_input = get_variable(torch.LongTensor(words))

    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(),
        target_batches.transpose(0, 1).contiguous(), target_lengths)
    print(loss)


if __name__ == '__main__':
    data_dir = './data'
    en_file = "{}/{}".format(data_dir, "seg_en_30000.txt")
    zh_file = "{}/{}".format(data_dir, "seg_zh_30000.txt")
    pairs, input_lang, target_lang = helper.read_data(en_file, zh_file, 100)
    test_model(pairs, input_lang, target_lang)
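
masked_cross_entropy is imported from a separate module that is not included in this excerpt. A minimal sketch of the usual idea, assuming the arguments are (batch, max_len, vocab) logits, (batch, max_len) integer targets, and a (batch,) tensor of true sequence lengths (the project's actual implementation may differ):

import torch
import torch.nn.functional as F

def masked_cross_entropy_sketch(logits, targets, lengths):
    # Token-level cross-entropy averaged over the non-padded positions only.
    batch, max_len, vocab = logits.size()
    log_probs = F.log_softmax(logits.reshape(-1, vocab), dim=-1)
    losses = F.nll_loss(log_probs, targets.reshape(-1), reduction='none').reshape(batch, max_len)
    mask = (torch.arange(max_len, device=lengths.device).unsqueeze(0) < lengths.unsqueeze(1)).float()
    return (losses * mask).sum() / mask.sum()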
Code example #6
    def train(self):
        # Prepare data
        sentence_train, slot_train, sentence_dev, slot_dev, vocab_sentence,\
            vocab_slot = data_helper.prepare_data(
                                    "data",
                                    sentence_training_file,
                                    slot_training_file,
                                    sentence_developing_file,
                                    slot_developing_file,
                                    from_vocabulary_size=2000,
                                    to_vocabulary_size=2000,
                                    tokenizer=None)
        sentence_developing, slot_devloping = data_helper.read_data(
            sentence_dev, slot_dev, max_size=None)
        sentence_training, slot_training = data_helper.read_data(
            sentence_train, slot_train, max_size=None)

        ## TODO:
        #sentence_training, slot_training = sentence_training[:1000],\
        #    slot_training[:1000]

        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(vocab_slot)

        # For conlleval script
        words_train = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_training
        ]
        labels_train = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_training
        ]
        words_val = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_developing
        ]
        labels_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_devloping
        ]

        # Define model
        n_vocab = len(w2id_sentence)
        n_classes = len(w2id_slot)

        #model = Sequential()
        #model.add(Embedding(n_vocab,100))
        #model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
        #model.add(Dropout(0.25))
        #model.add(GRU(100,return_sequences=True))
        #model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        #model.compile('rmsprop', 'categorical_crossentropy')

        ## Training
        ##n_epochs = 30
        #n_epochs = 1

        train_f_scores = []
        val_f_scores = []
        best_val_f1 = 0

        #print("Training =>")
        #train_pred_label = []
        #avgLoss = 0

        #for i in range(n_epochs):
        #    print("Training epoch {}".format(i))

        #    bar = progressbar.ProgressBar(max_value=len(sentence_training))
        #    for n_batch, sent in bar(enumerate(sentence_training)):
        #        label = slot_training[n_batch]
        #        # Make labels one hot
        #        label = np.eye(n_classes)[label][np.newaxis, :]
        #        # View each sentence as a batch
        #        sent = sent[np.newaxis, :]

        #        if sent.shape[1] > 1: #ignore 1 word sentences
        #            loss = model.train_on_batch(sent, label)
        #            avgLoss += loss

        #        pred = model.predict_on_batch(sent)
        #        pred = np.argmax(pred, -1)[0]
        #        train_pred_label.append(pred)

        #    avgLoss = avgLoss/n_batch

        #    predword_train = [list(map(lambda x: id2w_slot[x].decode('utf8'), y))
        #                      for y in train_pred_label]
        #    con_dict = conlleval(predword_train, labels_train,
        #                         words_train, 'measure.txt')
        #    train_f_scores.append(con_dict['f1'])
        #    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
        #        avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
        #    # Save model
        #    model.save(filepath_model)
        #    gc.collect()

        print("Validating =>")
        from keras.models import load_model
        model = load_model(filepath_model)

        labels_pred_val = []
        avgLoss = 0

        bar = progressbar.ProgressBar(max_value=len(sentence_developing))
        for n_batch, sent in bar(enumerate(sentence_developing)):
            label = slot_devloping[n_batch]
            label = np.eye(n_classes)[label][np.newaxis, :]
            sent = sent[np.newaxis, :]

            if sent.shape[1] > 1:  #some bug in keras
                loss = model.test_on_batch(sent, label)
                avgLoss += loss

            pred = model.predict_on_batch(sent)
            pred = np.argmax(pred, -1)[0]
            labels_pred_val.append(pred)

        avgLoss = avgLoss / n_batch
        gc.collect()

        predword_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in labels_pred_val
        ]
        con_dict = conlleval(predword_val, labels_val, words_val,
                             'measure.txt')
        val_f_scores.append(con_dict['f1'])
        print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
            avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

        if con_dict['f1'] > best_val_f1:
            best_val_f1 = con_dict['f1']
            print('New best validation F1; saving model architecture and weights.')
            with open('model_architecture.json', 'w') as outf:
                outf.write(model.to_json())
            model.save_weights('best_model_weights.h5', overwrite=True)
            print("Best validation F1 score = {}".format(best_val_f1))
        print()
Code example #7
def train():
    # TODO: Saving DMatrix into a XGBoost binary file will make loading faster
    ################################################################
    ######################### reading data  ########################
    logger.info("Starting to read training samples...")
    train_texts_1, train_texts_2, labels = data_helper.read_data(args.train_data_file)
    val_texts_1, val_texts_2, val_labels = data_helper.read_data(args.valid_data_file)
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(args.test_data_file)
    logger.info("Finish reading training samples !")

    ######################### load csv data ########################
    # # TODO: load a CSV file into DMatrix
    logger.info("Start loading csv.")
    x_train = pd.read_csv(args.x_train_file, header=None, encoding="utf-8", sep="\t")
    x_valid = pd.read_csv(args.x_valid_file, header=None, encoding="utf-8", sep="\t")
    x_test = pd.read_csv(args.x_test_file, header=None, encoding="utf-8", sep="\t")

    y_train = pd.read_csv(args.y_train_file, header=None, encoding="utf-8", sep="\t")
    y_valid = pd.read_csv(args.y_valid_file, header=None, encoding="utf-8", sep="\t")
    # y_test = pd.read_csv(args.y_test_file, header=None, encoding="utf-8", sep="\t")

    d_train = xgb.DMatrix(x_train, y_train)
    d_valid = xgb.DMatrix(x_valid, y_valid)
    logger.info("Done loading csv.")

    ########################## load DMatrix #########################
    # # TODO: load a XGBoost binary file into DMatrix
    # d_train = xgb.DMatrix(args.dtrain_file)
    # d_valid = xgb.DMatrix(args.dvalid_file)
    # d_test = xgb.DMatrix(args.dtest_file)

    ########################### train models ########################
    params = {
        "booster": args.booster,
        "eta": args.eta,
        "gamma": args.gamma,
        "max_depth": args.max_depth,
        "min_child_weight": args.min_child_weight,
        "max_delta_step": args.max_delta_step,
        "subsample": args.subsample,
        "colsample_bytree": args.colsample_bytree,
        "colsample_bylevel": args.colsample_bylevel,
        "lambda": args.lamda,
        "alpha": args.alpha,
        "scale_pos_weight": args.scale_pos_weight,
        "objective": args.objective,
        "eval_metric": list(args.eval_metric.split(","))
    }

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, args.num_boost_round, watchlist, early_stopping_rounds=args.early_stopping_rounds)
    bst.save_model(args.model_path)
    bst.dump_model(args.model_path + '.dump')

    ## make the submission
    p_test = bst.predict(xgb.DMatrix(x_test))
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1, 'candidate_query': test_texts_2, 'label': test_labels, 'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file,
                  header=False, index=False, encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    ## make the submission for best
    p_test = bst.predict(xgb.DMatrix(x_test), ntree_limit=bst.best_ntree_limit)
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1, 'candidate_query': test_texts_2, 'label': test_labels, 'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file + "_best",
                  header=False, index=False, encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    logger.info("best_iteration: {}".format(bst.best_iteration))
    logger.info("ntree_limit=bst.best_ntree_limit: {}".format(bst.best_ntree_limit))
    logger.info("best_score: {}".format(bst.best_score))
Code example #8
def first_train():
    # TODO: Saving DMatrix into a XGBoost binary file will make loading faster
    #######################################################################

    logger.info("Starting to read Embedding file...")
    word2vec = common_function.load_word2vec(args.embedding_file, filter_num=args.embedding_dim)
    logger.info("Finish reading Embedding file !")
    logger.info('Found %d word vectors of word2vec' % len(word2vec))

    stop_words = common_function.load_file_2_dict(args.stopword_file, colum=1)
    logger.info("Finish reading stopword file !")
    logger.info('Stopword is : ' + "|".join(list(stop_words.keys())))

    ############################## reading data  #############################
    logger.info("Starting to read training samples...")
    train_texts_1, train_texts_2, labels = data_helper.read_data(args.train_data_file)
    val_texts_1, val_texts_2, val_labels = data_helper.read_data(args.valid_data_file)
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(args.test_data_file)
    logger.info("Finish reading training samples !")

    train_orig = pd.DataFrame({"question1": train_texts_1, "question2": train_texts_2})
    val_orig = pd.DataFrame({"question1": val_texts_1, "question2": val_texts_2})
    test_orig = pd.DataFrame({"question1": test_texts_1, "question2": test_texts_2})

    ############################ save words counts ############################
    total_words = []
    ques = pd.concat([train_orig, val_orig], axis=0).reset_index(drop=True)
    for i in range(ques.shape[0]):
        total_words += ques.question1[i].split('|')
        total_words += ques.question2[i].split('|')

    ## save word freq to total_counts
    counts = Counter(total_words)
    with open(args.word_counts_file, 'w', encoding="utf-8") as r:
        for _word, _count in counts.items():
            r.write("%s\t%d\n" % (_word, _count))

    ############################## basic features  #############################
    train_cp = train_orig.copy()
    val_cp = val_orig.copy()
    test_cp = test_orig.copy()
    x_train_basic = data_helper.get_basic_feat(train_cp, args.embedding_dim, stop_words, word2vec)
    x_valid_basic = data_helper.get_basic_feat(val_cp, args.embedding_dim, stop_words, word2vec)
    x_test_basic = data_helper.get_basic_feat(test_cp, args.embedding_dim, stop_words, word2vec)

    ####################### sentence word char features #########################
    weights = {word: data_helper.get_weight(count) for word, count in counts.items()}
    x_train_more = data_helper.build_features(train_orig, stop_words, weights)
    x_valid_more = data_helper.build_features(val_orig, stop_words, weights)
    x_test_more = data_helper.build_features(test_orig, stop_words, weights)

    ######################## SCWLSTM model simscore #############################
    if args.use_scwlstm:
        x_train_sim = data_helper.model_simscore(args.train_scwlstm_pred_file, train_cp)
        x_valid_sim = data_helper.model_simscore(args.valid_scwlstm_pred_file, val_cp)
        x_test_sim = data_helper.model_simscore(args.test_scwlstm_pred_file, test_cp)

        ################### combine all features ##############################
        x_train = pd.concat((x_train_basic, x_train_more, x_train_sim), axis=1)
        x_valid = pd.concat((x_valid_basic, x_valid_more, x_valid_sim), axis=1)
        x_test = pd.concat((x_test_basic, x_test_more, x_test_sim), axis=1)
    else:
        x_train = pd.concat((x_train_basic, x_train_more), axis=1)
        x_valid = pd.concat((x_valid_basic, x_valid_more), axis=1)
        x_test = pd.concat((x_test_basic, x_test_more), axis=1)

    x_train.drop(['question1', 'question2'], axis=1, inplace=True)
    x_valid.drop(['question1', 'question2'], axis=1, inplace=True)
    x_test.drop(['question1', 'question2'], axis=1, inplace=True)

    # print(x_train.columns)

    features = [x for x in x_train.columns]
    data_helper.ceate_feature_map(args.feature_map_file, features)

    x_train.columns = [str(i) for i in range(x_train.shape[1])]
    x_valid.columns = [str(i) for i in range(x_valid.shape[1])]
    x_test.columns = [str(i) for i in range(x_test.shape[1])]

    ################################ save csv ###############################
    logger.info("Start saving csv.")
    x_train.to_csv(args.x_train_file, header=False, index=False, encoding="utf-8", sep="\t")
    x_valid.to_csv(args.x_valid_file, header=False, index=False, encoding="utf-8", sep="\t")
    x_test.to_csv(args.x_test_file, header=False, index=False, encoding="utf-8", sep="\t")

    y_train = pd.DataFrame(labels)
    y_valid = pd.DataFrame(val_labels)
    y_test = pd.DataFrame(test_labels)
    y_train.to_csv(args.y_train_file, header=False, index=False, encoding="utf-8", sep="\t")
    y_valid.to_csv(args.y_valid_file, header=False, index=False, encoding="utf-8", sep="\t")
    y_test.to_csv(args.y_test_file, header=False, index=False, encoding="utf-8", sep="\t")
    logger.info("Done saving csv.")

    ############################# save DMatrix ################################
    logger.info("Start saving DMatrix.")
    y_train = labels
    y_valid = val_labels
    y_test = test_labels
    d_train = xgb.DMatrix(x_train, label=labels)
    d_valid = xgb.DMatrix(x_valid, label=val_labels)
    d_test = xgb.DMatrix(x_test, label=test_labels)
    d_train.save_binary(args.dtrain_file, silent=False)
    d_valid.save_binary(args.dvalid_file, silent=False)
    d_test.save_binary(args.dtest_file, silent=False)
    # logger.info(d_train.feature_names)
    logger.info("Done saving DMatrix.")

    ############################# train models #################################
    params = {
        "booster": args.booster,
        "eta": args.eta,
        "gamma": args.gamma,
        "max_depth": args.max_depth,
        "min_child_weight": args.min_child_weight,
        "max_delta_step": args.max_delta_step,
        "subsample": args.subsample,
        "colsample_bytree": args.colsample_bytree,
        "colsample_bylevel": args.colsample_bylevel,
        "lambda": args.lamda,
        "alpha": args.alpha,
        "scale_pos_weight": args.scale_pos_weight,
        "objective": args.objective,
        "eval_metric": list(args.eval_metric.split(","))
    }

    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    bst = xgb.train(params, d_train, args.num_boost_round, watchlist, early_stopping_rounds=args.early_stopping_rounds)
    bst.save_model(args.model_path)
    bst.dump_model(args.model_path + '.dump')

    ## make the submission
    p_test = bst.predict(xgb.DMatrix(x_test))
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1, 'candidate_query': test_texts_2, 'label': test_labels, 'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file,
                  header=False, index=False, encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    ## make the submission for best
    p_test = bst.predict(xgb.DMatrix(x_test), ntree_limit=bst.best_ntree_limit)
    df_sub = pd.DataFrame(
        {'user_query': test_texts_1, 'candidate_query': test_texts_2, 'label': test_labels, 'score': p_test.ravel()})
    df_sub.to_csv(args.pred_data_file + "_best",
                  header=False, index=False, encoding='utf-8', sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])

    logger.info("best_iteration: {}".format(bst.best_iteration))
    logger.info("ntree_limit=bst.best_ntree_limit: {}".format(bst.best_ntree_limit))
    logger.info("best_score: {}".format(bst.best_score))
Code example #9
def first_predict():
    # TODO: build features the first time to predict
    #################################################################
    logger.info("Starting to read Embedding file...")
    word2vec = common_function.load_word2vec(args.embedding_file,
                                             filter_num=args.embedding_dim)
    logger.info("Finish reading Embedding file !")
    logger.info('Found %d word vectors of word2vec' % len(word2vec))

    stop_words = common_function.load_file_2_dict(args.stopword_file, colum=1)
    logger.info("Finish reading stopword file !")
    logger.info('Stopword is : ' + "|".join(list(stop_words.keys())))

    ############# reading data  #################################################
    logger.info("Starting to read testing samples...")
    test_texts_1, test_texts_2, test_labels = data_helper.read_data(
        args.test_data_file)
    test_orig = pd.DataFrame({
        "question1": test_texts_1,
        "question2": test_texts_2
    })
    logger.info("Finish reading testing samples !")

    ############### read words counts #########################################
    counts = common_function.load_file_2_dict(args.word_counts_file)
    weights = {
        word: data_helper.get_weight(int(count))
        for word, count in counts.items()
    }

    ################ make features  ########################################
    test_cp = test_orig.copy()
    x_test_basic = data_helper.get_basic_feat(test_cp, args.embedding_dim,
                                              stop_words, word2vec)
    x_test_more = data_helper.build_features(test_orig, stop_words, weights)
    if args.use_scwlstm:
        x_test_sim = data_helper.model_simscore(args.test_scwlstm_pred_file,
                                                test_cp)

        ############## combine all features ########################################
        x_test = pd.concat([x_test_basic, x_test_more, x_test_sim], axis=1)
    else:
        x_test = pd.concat((x_test_basic, x_test_more), axis=1)

    x_test.drop(['question1', 'question2'], axis=1, inplace=True)

    x_test.columns = [str(i) for i in range(x_test.shape[1])]

    ################ save DMatrix binary data to make loading faster #########
    xgb.DMatrix(x_test).save_binary('test.buffer')

    ############## predict models ################################################
    bst = xgb.Booster()  # init model
    bst.load_model(args.model_path)  # load model

    p_test = bst.predict(xgb.DMatrix(x_test), ntree_limit=args.ntree_limit)

    df_sub = pd.DataFrame({
        'user_query': test_texts_1,
        'candidate_query': test_texts_2,
        'label': test_labels,
        'score': p_test.ravel()
    })
    df_sub.to_csv(args.pred_data_file + str(args.ntree_limit),
                  header=False,
                  index=False,
                  encoding='utf-8',
                  sep="\t",
                  columns=['user_query', 'candidate_query', 'label', 'score'])
Code example #10
tf.flags.DEFINE_integer("decay_steps", 100, "decay steps")
tf.flags.DEFINE_integer("decay_steps1", 100, "decay steps")
tf.flags.DEFINE_float("decay_rate", 0.1, "decay rate")
tf.flags.DEFINE_float("lr", 0.001, "learning rate1")
tf.flags.DEFINE_string("cells_sizes", '80,64',
                       "numbers of cells of each layer")
tf.flags.DEFINE_integer("display_step", 50, "display_step")
tf.flags.DEFINE_integer("save_step", 100, "save_step")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
for attr, value in FLAGS.__flags.items():
    print("attr:%s\tvalue:%s" % (attr, str(value)))
# load data
print("Loading data")
starttime = datetime.datetime.now()
sequences = data_helper.read_data(
    os.path.abspath(os.path.join(os.path.curdir, "data")), FLAGS.n_inputs)
endtime = datetime.datetime.now()
print(str((endtime - starttime).seconds), "seconds")
validation_sequence = sequences.validation.sequence.reshape(
    (-1, FLAGS.n_steps, FLAGS.n_inputs))
test_sequence = sequences.test.sequence.reshape(
    (-1, FLAGS.n_steps, FLAGS.n_inputs))
validation_iter = math.ceil(validation_sequence.shape[0] / FLAGS.batch_size)
print("validation_iter:", validation_iter)
test_iter = math.ceil(test_sequence.shape[0] / FLAGS.batch_size)

boundaries = [50, 100, 150, 200, 300, 400, 500]
learning_rates = [0.001, 0.0001, 0.00001, 0.000001, 0.0000001]
#learning_rates2=[0.0001,0.00008,0.00001,0.000008,0.000001]

# train
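
The boundaries and learning_rates lists above suggest a piecewise-constant learning-rate schedule, but the loop that would use them is not included in this excerpt. Note that tf.train.piecewise_constant expects len(values) == len(boundaries) + 1, which the lists above do not satisfy as written. A minimal self-contained sketch with consistent lengths, assuming the same TF 1.x API as the rest of the snippet (the optimizer choice is hypothetical):

import tensorflow as tf  # TF 1.x

global_step = tf.Variable(0, trainable=False)
boundaries = [50, 100, 150, 200]                                # 4 boundaries -> 5 rates
learning_rates = [0.001, 0.0001, 0.00001, 0.000001, 0.0000001]
lr = tf.train.piecewise_constant(global_step, boundaries, learning_rates)
optimizer = tf.train.AdamOptimizer(lr)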
Code example #11
    def train(self):
        sentence_developing, slot_devloping = data_helper.read_data(
            self.sentence_dev, self.slot_dev, max_size=None)
        sentence_training, slot_training = data_helper.read_data(
            self.sentence_train, self.slot_train, max_size=None)

        # Make toy data; comment this block to train on the full dataset
        #n_toy = 1000
        #sentence_training, slot_training = sentence_training[:n_toy],\
        #    slot_training[:n_toy]
        #sentence_developing, slot_devloping = sentence_developing[:round(n_toy/2)],\
        #    slot_devloping[:round(n_toy/2)]

        # Dictionaries
        w2id_sentence, id2w_sentence = data_helper.initialize_vocabulary(
            self.vocab_sentence)
        w2id_slot, id2w_slot = data_helper.initialize_vocabulary(
            self.vocab_slot)

        # For conlleval script
        words_train = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_training
        ]
        labels_train = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_training
        ]
        words_val = [
            list(map(lambda x: id2w_sentence[x].decode('utf8'), w))
            for w in sentence_developing
        ]
        labels_val = [
            list(map(lambda x: id2w_slot[x].decode('utf8'), y))
            for y in slot_devloping
        ]

        # Define model
        n_vocab = len(w2id_sentence)
        n_classes = len(w2id_slot)

        model = Sequential()
        model.add(Embedding(n_vocab, 100))
        model.add(Convolution1D(128, 5, border_mode='same', activation='relu'))
        model.add(Dropout(0.25))
        model.add(GRU(100, return_sequences=True))
        model.add(TimeDistributed(Dense(n_classes, activation='softmax')))
        model.compile('rmsprop', 'categorical_crossentropy')

        # Training
        #n_epochs = 30
        n_epochs = 1

        train_f_scores = []
        val_f_scores = []
        best_val_f1 = 0

        print("Training =>")
        train_pred_label = []
        avgLoss = 0

        for i in range(n_epochs):
            print("Training epoch {}".format(i))

            bar = progressbar.ProgressBar(max_value=len(sentence_training))
            for n_batch, sent in bar(enumerate(sentence_training)):
                label = slot_training[n_batch]
                # Make labels one hot
                label = np.eye(n_classes)[label][np.newaxis, :]
                # View each sentence as a batch
                sent = sent[np.newaxis, :]

                if sent.shape[1] > 1:  #ignore 1 word sentences
                    loss = model.train_on_batch(sent, label)
                    avgLoss += loss

                pred = model.predict_on_batch(sent)
                pred = np.argmax(pred, -1)[0]
                train_pred_label.append(pred)

            avgLoss = avgLoss / n_batch

            predword_train = [
                list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                for y in train_pred_label
            ]
            con_dict = conlleval(predword_train, labels_train, words_train,
                                 'measure.txt')
            train_f_scores.append(con_dict['f1'])
            print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
                avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))
            # Save model
            model.save(model_file)

            print("Validating =>")

            labels_pred_val = []
            avgLoss = 0

            bar = progressbar.ProgressBar(max_value=len(sentence_developing))
            for n_batch, sent in bar(enumerate(sentence_developing)):
                label = slot_devloping[n_batch]
                label = np.eye(n_classes)[label][np.newaxis, :]
                sent = sent[np.newaxis, :]

                if sent.shape[1] > 1:  #some bug in keras
                    loss = model.test_on_batch(sent, label)
                    avgLoss += loss

                pred = model.predict_on_batch(sent)
                pred = np.argmax(pred, -1)[0]
                labels_pred_val.append(pred)

            avgLoss = avgLoss / n_batch

            predword_val = [
                list(map(lambda x: id2w_slot[x].decode('utf8'), y))
                for y in labels_pred_val
            ]
            con_dict = conlleval(predword_val, labels_val, words_val,
                                 'measure.txt')
            val_f_scores.append(con_dict['f1'])
            print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(
                avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

            if con_dict['f1'] > best_val_f1:
                best_val_f1 = con_dict['f1']
                with open('model_architecture.json', 'w') as outf:
                    outf.write(model.to_json())
                model.save_weights('best_model_weights.h5', overwrite=True)
                print("Best validation F1 score = {}".format(best_val_f1))
            print()

            # Prevent from tensorflow bugs: BaseSession.__del__
            gc.collect()
Code example #12
    
    # NOTE: random.random() < 1 is always True, so teacher forcing is effectively always on;
    # comparing against a teacher_forcing_ratio is presumably what was intended.
    use_teacher_forcing = random.random() < 1
    for t in range(max_target_len):
        # output: (batch_size, output_size)
        output, decoder_hidden, attn_weights = decoder(decoder_input, decoder_hidden, encoder_outputs)
        all_decoder_outputs[t] = output
        # With teacher forcing, feed the ground-truth label; otherwise feed the model's own output.
        if use_teacher_forcing:
            decoder_input = target_batches[t]
        else:
            # Pick the most likely word from the output for each batch element
            words = []
            for b in range(batch_size):
                topv, topi = output[b].data.topk(1)
                words.append(topi)
            decoder_input = get_variable(torch.LongTensor(words))
    
    loss = masked_cross_entropy(
        all_decoder_outputs.transpose(0, 1).contiguous(),
        target_batches.transpose(0, 1).contiguous(),
        target_lengths
    )
    print(loss)
    

if __name__ == '__main__':
    data_dir = './data'
    en_file = "{}/{}".format(data_dir, "seg_en")
    zh_file = "{}/{}".format(data_dir, "seg_zh")
    input_lang, target_lang, pairs = helper.read_data(en_file, zh_file, 20000)
    test_model(pairs, input_lang, target_lang)
Code example #13
File: train.py  Project: siyuanzhao/b_kaggle
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ===================================

print('loading data...')

data_file = 'train_pivot.csv'

product_l, data = data_helper.read_data(data_file)

data_size = data.shape[0]
product_num = len(product_l)

epoch_steps = data_size / FLAGS.batch_size
# turn to pandas dataframe
product_l = pd.DataFrame(product_l)
product_l.columns = ['Producto_ID']
product_l['index1'] = product_l.index

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    sess = tf.Session(config=session_conf)
    rnn = BimboRNN(FLAGS.batch_size, FLAGS.embedding_size, product_num, FLAGS.hidden_size, 6)
    with sess.as_default():