Code Example #1
def load_data_set(mode, feature_mode, seed=3):
    # Sample to build the training and test sets
    # mode 0 - nqso comes from the S82 standard-star sample
    # mode 1 - nqso is i-band filtered, with the DR7 quasar catalog appended
    qso_file = './train/QSO_sample_data' + str(mode)
    nqso_file = './train/nQSO_sample_data' + str(mode)
    QSO_data, QSO_label, _ = load_data(qso_file, feature_mode)
    print("total QSO: ", len(QSO_label))
    nQSO_data, nQSO_label, _ = load_data(nqso_file, feature_mode)
    print("total nQSO: ", len(nQSO_label))

    min_size = min(len(QSO_label), len(nQSO_label))
    train_size = int(min_size * 5 / 6 - 1)
    test_size = int(min_size / 6 - 1)
    print("Training Set Size: ", train_size)
    print("Testing Set Size: ", test_size)
    print()

    # Build the randomly sampled data sets
    # training set : test set = 5 : 1
    rnd_train_QSO_data, rnd_train_QSO_label, rnd_test_QSO_data, rnd_test_QSO_label \
        = rnd_sampling(QSO_data, QSO_label, train_size, test_size, seed)
    rnd_train_nQSO_data, rnd_train_nQSO_label, rnd_test_nQSO_data, rnd_test_nQSO_label \
        = rnd_sampling(nQSO_data, nQSO_label, train_size, test_size, seed)

    train_data = rnd_train_QSO_data + rnd_train_nQSO_data
    train_label = rnd_train_QSO_label + rnd_train_nQSO_label
    test_data = rnd_test_QSO_data + rnd_test_nQSO_data
    test_label = rnd_test_QSO_label + rnd_test_nQSO_label

    return train_data, train_label, test_data, test_label
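Note: examples #1 and #2 rely on a rnd_sampling helper that is not included in this listing. A minimal sketch of what it might look like, assuming it shuffles the (data, label) pairs deterministically with the given seed and slices off train_size and test_size items (hypothetical implementation; only the call signature is taken from the snippets):

import random

def rnd_sampling(data, label, train_size, test_size, seed):
    # deterministic shuffle of the sample indices (assumption)
    rng = random.Random(seed)
    indices = list(range(len(label)))
    rng.shuffle(indices)
    train_idx = indices[:train_size]
    test_idx = indices[train_size:train_size + test_size]
    # return lists so the call sites can concatenate them with "+"
    return ([data[i] for i in train_idx], [label[i] for i in train_idx],
            [data[i] for i in test_idx], [label[i] for i in test_idx])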
Code Example #2
def imbalance_data_set(mode, train_ratio, test_ratio, seed):
    # Return imbalanced training and test data sets
    QSO_data, QSO_label, _ = load_data(filename='./train/QSO_sample_data3',
                                       mode='all')
    nQSO_data, nQSO_label, _ = load_data(filename='./train/nQSO_sample_data3',
                                         mode='all')
    rnd_train_QSO_data, rnd_train_QSO_label, rnd_test_QSO_data, rnd_test_QSO_label \
            = rnd_sampling(QSO_data, QSO_label, 1600, 400, seed)
    rnd_train_nQSO_data, rnd_train_nQSO_label, rnd_test_nQSO_data, rnd_test_nQSO_label \
            = rnd_sampling(nQSO_data, nQSO_label, int(1600*train_ratio), int(400*test_ratio), seed)
    train_data = rnd_train_QSO_data + rnd_train_nQSO_data
    train_label = rnd_train_QSO_label + rnd_train_nQSO_label
    test_data = rnd_test_QSO_data + rnd_test_nQSO_data
    test_label = rnd_test_QSO_label + rnd_test_nQSO_label
    return train_data, train_label, test_data, test_label
Code Example #3
def do_predict(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    # Initialize model
    model = NerBiLstmModel(helper, config, embeddings)
    model.to(config.device)

    # Load data
    helper, data = load_data(args, helper)
    examples = data['examples']

    # Preprocess data
    data_preprocessor = DataPreprocessor(model, config, helper)
    examples = data_preprocessor.preprocess_sequence_data(examples)

    with torch.no_grad():
        model.load_state_dict(torch.load(config.model_output))
        model.eval()

        predictor = Predictor(model, config)
        output = predictor.predict(examples, use_str_labels=True)
        sentences, labels, predictions = zip(*output)
        predictions = [[LBLS[l] for l in preds] for preds in predictions]
        output = list(zip(sentences, labels, predictions))

        for sentence, labels, predictions in output:
            print_sentence(args.output, sentence, labels, predictions)
Code Example #4
def main(_):
    console.start('mlp task')

    # Configurations
    th = NlsHub(as_global=True)
    th.memory_depth = 6
    th.num_blocks = 2
    th.multiplier = 2
    th.hidden_dim = th.memory_depth * th.multiplier
    # th.actype1 = 'lrelu'   # Default: relu

    th.epoch = 10
    th.batch_size = 32
    th.learning_rate = 1e-4
    th.validation_per_round = 5
    th.print_cycle = 100

    th.train = True
    # th.smart_train = True
    # th.max_bad_apples = 4
    # th.lr_decay = 0.6

    th.early_stop = True
    th.idle_tol = 20
    th.save_mode = SaveMode.NAIVE
    # th.warm_up_thres = 1
    # th.at_most_save_once_per_round = True

    th.overwrite = True
    th.export_note = True
    th.summary = False
    # th.monitor = True
    th.save_model = True

    th.allow_growth = False
    th.gpu_memory_fraction = 0.40

    description = '0'
    th.mark = 'mlp-{}x({}x{})-{}'.format(th.num_blocks, th.memory_depth,
                                         th.multiplier, description)
    # Get model
    model = nlsf_model_lib.mlp_00(th)
    # Load data
    train_set, val_set, test_set = load_data(th.data_dir,
                                             depth=th.memory_depth)
    assert isinstance(train_set, DataSet)
    assert isinstance(val_set, DataSet)
    assert isinstance(test_set, DataSet)

    # Train or evaluate
    if th.train:
        model.nn.train(train_set, validation_set=val_set, trainer_hub=th)
    else:
        console.show_status('Evaluating ...')
        model.evaluate(train_set, start_at=th.memory_depth)
        model.evaluate(val_set, start_at=th.memory_depth)
        model.evaluate(test_set, start_at=th.memory_depth, plot=True)

    # End
    console.end()
Code Example #5
def train():
    train_dataset, val_dataset = load_data()

    model = SlotRNN(train_dataset.vocab_size, embedding_size, train_dataset.n_classes)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    print (model)

    for epoch in range(n_epochs):
        # get batch data
        print_loss = 0
        train_pred_label = []
        for data_x, data_y in train_dataset:
            # zero_grad
            optimizer.zero_grad()
            #forward
            pred = model(data_x)
            train_pred_label.append(var2np(pred))
            # compute loss
            loss = criterion(pred, data_y)
            print_loss += loss.item()  # .item() reads the scalar loss (loss.data[0] was the pre-0.4 PyTorch idiom)
            # backward
            loss.backward()
            optimizer.step()

        # print ('epoch: (%d / %d) loss: %.4f' % (epoch+1, n_epochs, print_loss/len(train_dataset)))
        train_pred = [list(map(lambda x: train_dataset.idx2labels[x], y)) for y in train_pred_label]
        eval(model, train_dataset, train_pred)
        eval(model, val_dataset)
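Note: example #5 collects predictions through a var2np helper that is not shown. A minimal sketch, assuming it converts the model's per-token log-probabilities into a numpy array of predicted label indices (hypothetical implementation):

def var2np(pred):
    # argmax over the class dimension, detached from the graph and moved to CPU
    return pred.detach().cpu().max(dim=-1)[1].numpy()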
Code Example #6
    def test(self, name):
        '''
        Returns the top-1 accuracy on an unseen test dataset.
        '''

        loader, iteration = data_util.load_data(partition='test')

        data_iter = data_util.inf_generator(loader)
        results = []
        for i in range(iteration):
            X, y = data_iter.__next__()

            X = [x.numpy()[0] for x in X]

            predX = Variable(torch.FloatTensor([X]),
                             requires_grad=True).to(device)
            y = Variable(torch.LongTensor([y]), requires_grad=False).to(device)

            y_pred = self.model(predX)

            results.append(
                [y.cpu().numpy()[0],
                 y_pred.max(-1)[1].cpu().numpy()[0]])
        pd.DataFrame(results,
                     columns=['y_true', 'y_pred'
                              ]).to_csv(f'../model_train_results/{name}.csv',
                                        index=False)
Code Example #7
def play(partial_full_data, train_weights=True, output_file=None):

    init_weights = current_weights()

    if train_weights:
        train_dev, _ = data_util.load_data()

        X = train_dev.positive - train_dev.negative
        clf = linear.train_model(X)

        # columns where all X[i] are zero
        unused_features = np.nonzero(np.sum(np.abs(X), axis=0) == 0)[0]
        # a feature that never appears in the data gets no learnt weight (marked None below)
        learnt_weights = [
            int(x * 1000) if (i not in unused_features) else None
            for i, x in enumerate(clf.coef_[0])
        ]

        weights = {}
        for i, k in enumerate(init_weights):
            if learnt_weights[i] is not None:
                weights[k] = learnt_weights[i]
            else:
                weights[k] = 10000 + init_weights[k]
    else:
        weights = init_weights

    pairs = generate_visual_pairs(partial_full_data, weights)

    if output_file is not None:
        with open(output_file, "w+") as f:
            print(f"Writing pairs to {output_file}")
            json.dump(pairs, f)
    else:
        print(json.dumps(pairs))
Code Example #8
    def test(self, name):
        '''
        Takes a supposedly unseen dataset and finds the top-1 predicted labels.
        Stores those values in a csv file called name.csv, where name is the value
        set for the name parameter for this function.
        '''
        loader, iteration = data_util.load_data(partition='test')
        #iteration = 1
        data_iter = data_util.inf_generator(loader)
        results = []
        for i in range(iteration):
            X, y = data_iter.__next__()

            X = [x.numpy()[0] for x in X]

            predX = Variable(torch.FloatTensor([X]),
                             requires_grad=True).to(device)
            y = Variable(torch.LongTensor([y]), requires_grad=False).to(device)

            y_pred = self.model(predX)

            results.append(
                [y.cpu().numpy()[0],
                 y_pred.max(-1)[1].cpu().numpy()[0]])
        pd.DataFrame(results,
                     columns=['y_true', 'y_pred'
                              ]).to_csv(f'../model_train_results/{name}.csv',
                                        index=False)
Code Example #9
def model():
    all_data = load_data()
    helper = all_data[0]  # helper[0] is tok2id (word -> id dict); helper[1] is the max sentence length in the training set
    train_raw, dev_raw, test_raw = all_data[1]  # raw token form: [([word1, word2, ...], [label1, label2, ...]), ...]
    train_vec, dev_vec, test_vec = all_data[2]  # tokens mapped to ids: [([[id1], [id2], ...], [0, 1, 0, 2, 3, ...]), ...]
    train_set, dev_set, test_set = all_data[3]  # each word expanded to its window of words, each sentence padded to a fixed length

    with tf.Graph().as_default():
        logger.info("建立模型...")

        # create placeholders
        input_placeholder, labels_placeholder, mask_placeholder = add_placeholders()

        # inference: get the prediction op
        pred = inference_op(helper, input_placeholder)

        # saver
        saver = tf.train.Saver()  # handles saving/restoring the model weights

        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            sess.run(init)  # initialize variables

            saver.restore(sess, model_output)  # load the saved model
            print("Model loaded...")

            start_time = time.time()

            # predict on the test set
            result = evaluate(sess, test_set, test_raw, test_vec, pred,
                              input_placeholder, mask_placeholder, labels_placeholder)

            print("预测耗时:{}".format(time.time() - start_time))
            with open("result/test_pred.json", "w+") as f:
                write_txt(f, result)
Code Example #10
def do_evaluate(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    # Initialize model
    model = NerBiLstmModel(helper, config, embeddings)
    model.to(config.device)

    # Load data
    helper, data = load_data(args, helper)
    examples = data['examples']

    # Preprocess data
    data_preprocessor = DataPreprocessor(model, config, helper)
    examples = data_preprocessor.preprocess_sequence_data(examples)

    with torch.no_grad():
        model.load_state_dict(torch.load(config.model_output))
        model.eval()

        evaluator = Evaluator(Predictor(model, config))
        token_cm, entity_scores = evaluator.evaluate(examples)
        print("Token-level confusion matrix:\n" + token_cm.as_table())
        print("Token-level scores:\n" + token_cm.summary())
        print("Entity level P/R/F1: {:.2f}/{:.2f}/{:.2f}".format(*entity_scores))
Code Example #11
def do_training_test(args):
    logger.info("Testing implementation of NerBiLstmModel")
    torch.manual_seed(133)
    # Set up configuration and output
    config = Config(args)
    config.n_epochs = 1
    config.model_output = None

    # Load data
    helper, data = load_data(args)
    train_examples = data['train_examples']
    dev_examples = data['dev_examples']

    # Load embeddings
    embeddings = load_embeddings(args, helper, config.device)

    # Initialize model
    model = NerBiLstmModel(helper, config, embeddings)
    model.to(config.device)

    # Preprocess data
    data_preprocessor = DataPreprocessor(model, config, helper)
    train_examples = data_preprocessor.preprocess_sequence_data(train_examples)
    dev_examples = data_preprocessor.preprocess_sequence_data(dev_examples)

    # Start training
    trainer = Trainer(model, config, helper, logger)
    logger.info("Starting training...",)
    trainer.train(train_examples, dev_examples)

    logger.info("Model did not crash!")
    logger.info("Passed!")
Code Example #12
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='lstm')
    parser.add_argument('--max_len', type=int, default=32)
    parser.add_argument('--data_name', type=str, default='nsmc')
    args = parser.parse_args()

    MODEL_NAME = args.model_name
    MAX_LEN = args.max_len
    DATA_NAME = args.data_name

    # logging
    log = config.logger
    folder_path = config.folder_path
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    fileHandler = logging.FileHandler(
        os.path.join(
            folder_path, config.current_time + '-' + MODEL_NAME + '-' +
            str(MAX_LEN) + '.txt'))
    fileHandler.setFormatter(config.formatter)
    config.logger.addHandler(fileHandler)

    print("kcc research")
    print("")
    log.info("kcc research")
    log.info("")

    log.info("model_name : " + str(MODEL_NAME))
    log.info("max_len : " + str(MAX_LEN))
    log.info("data_name : " + str(DATA_NAME))
    print("model_name : " + str(MODEL_NAME))
    print("max_len : " + str(MAX_LEN))
    print("data_name : " + str(DATA_NAME))
    config.data_name = DATA_NAME

    # load data
    print("loading data ...")
    log.info("loading data ...")
    train_data, test_data = data_util.load_data(DATA_NAME)
    print("")
    log.info("")

    if (MODEL_NAME == 'lstm' or MODEL_NAME == 'summalstm'):  # lstm is baseline

        lstm(MODEL_NAME, train_data, test_data, MAX_LEN)

    elif (MODEL_NAME == 'gru' or MODEL_NAME == 'summagru'):  # gru is baseline

        gru(MODEL_NAME, train_data, test_data, MAX_LEN)

    elif (MODEL_NAME == 'bert' or MODEL_NAME == 'summabert'):  # bert

        bert(MODEL_NAME, train_data, test_data, MAX_LEN)

    elif (MODEL_NAME == 'kobert' or MODEL_NAME == 'summakobert'):  # kobert

        kobert(MODEL_NAME, train_data, test_data, MAX_LEN)
Code Example #13
File: ner_bilstm_model.py  Project: baraklevyy/NLP
def do_training(args):
    torch.manual_seed(133)
    # Set up configuration and output
    config = Config(args)
    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)

    # Set up logging
    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(
        logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    # Load data
    helper, data = load_data(args)
    train_examples = data['train_examples']
    dev_examples = data['dev_examples']
    helper.save(config.output_path)

    # Load embeddings
    embeddings = load_embeddings(args, helper, config.device)

    # Initialize model
    logger.info("Initializing model...", )
    model = NerBiLstmModel(helper, config, embeddings)
    model.to(config.device)

    # Preprocess data
    data_preprocessor = DataPreprocessor(model, config, helper)
    train_examples = data_preprocessor.preprocess_sequence_data(train_examples)
    dev_examples = data_preprocessor.preprocess_sequence_data(dev_examples)

    # Start training
    trainer = Trainer(model, config, helper, logger)
    logger.info("Starting training...", )
    trainer.train(train_examples, dev_examples)

    # Save predictions of the best model
    logger.info(
        "Training completed, saving predictions of the best model...", )
    with torch.no_grad():
        model.load_state_dict(torch.load(config.model_output))
        model.eval()
        predictor = Predictor(model, config)
        output = predictor.predict(dev_examples, use_str_labels=True)
        sentences, labels, predictions = zip(*output)
        predictions = [[LBLS[l] for l in preds] for preds in predictions]
        output = list(zip(sentences, labels, predictions))

        with open(model.config.conll_output, 'w') as f:
            write_conll(f, output)
        with open(model.config.eval_output, 'w') as f:
            for sentence, labels, predictions in output:
                print_sentence(f, sentence, labels, predictions)
Code Example #14
def main():
    """
    main function
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, default='./nela-17/whole')

    args = parser.parse_args()

    data_path = args.data_path
    print("data_path : {}".format(data_path))

    dataset = load_data(data_path)

    bert(dataset)
Code Example #15
File: 12.py  Project: Elinor78/face-hw2
def _cnn():

    train_data, train_target, test_data = load_data()  # load data from utility
    train_data, validation_data, train_target, validation_target = train_test_split(
        train_data, train_target, test_size=0.2, random_state=42
    )  # randomly split data into training and validation sets

    test_data, input_shape = _reshape(test_data)  # see docstring
    train_data, input_shape = _reshape(train_data)  # see docstring
    validation_data, input_shape = _reshape(validation_data)  # see docstring

    model = Sequential()  # sequential model

    model.add(
        convolutional.Conv2D(  # first convolutional layer
            filters=32,
            kernel_size=(3, 3),
            activation='relu',
            input_shape=input_shape))

    model.add(convolutional.Conv2D(
        64, (3, 3),
        activation='relu'))  # 2nd convo layer, using relu for activation
    model.add(pooling.MaxPooling2D(pool_size=(2, 2)))  #1st pooling

    model.add(Dropout(0.25))  # prevent overfit w/dropout 1
    model.add(Flatten())  # flatten for dnn
    model.add(Dense(128, activation='relu'))  # 1st dnn layer

    model.add(Dropout(0.5))  # prevent overfit w/dropout 2
    model.add(Dense(3, activation='softmax'))  # using softmax for activation

    model.compile(  # compile using Adadelta for optimizer
        loss=keras.losses.categorical_crossentropy,
        optimizer=keras.optimizers.Adadelta(),
        metrics=['accuracy'])

    model.fit(train_data, train_target, batch_size=128, epochs=12,
              verbose=1)  # fit using training data
    loss, accuracy = model.evaluate(
        validation_data, validation_target,
        verbose=0)  # evaluate using validation data
    print "accuracy: {}".format(accuracy)

    class_output = model.predict_classes(test_data)  # predict on test_data
    return class_output
Code Example #16
def main():
    test_size = 0.3
    train_dev, _ = data_util.load_data(test_size=test_size)

    clf = train_and_plot(train_dev, test_size=test_size)
    features = train_dev.negative.columns

    path = os.path.abspath(
        os.path.join(os.path.dirname(__file__),
                     "../../asp/weights_learned.lp"))

    with open(path, "w") as f:
        f.write("% Generated with `python draco/learn/linear.py`.\n\n")

        for feature, weight in zip(features, clf.coef_[0]):
            f.write(f"#const {feature}_weight = {int(weight * 1000)}.\n")

    logger.info(f"Wrote model to {path}")
Code Example #17
    def score(self):
        loader, iteration = data_util.load_data(partition='test')

        data_iter = data_util.inf_generator(loader)
        correct = 0
        for i in range(iteration):
            X, y = data_iter.__next__()

            X = [x.numpy()[0] for x in X]

            predX = Variable(torch.FloatTensor([X]),
                             requires_grad=True).to(device)
            y = Variable(torch.LongTensor([y]), requires_grad=False).to(device)

            y_pred = self.model(predX)

            if y_pred.max(-1)[1] == y:
                correct += 1
        return correct / iteration
Code Example #18
def main():
    # Step 1: load data
    X, Y, groups = load_data(track_path=TRACK_PATH,
                             bed_path=BED_PATH,
                             group_path=GROUP_PATH)
    x_train, y_train, x_test, y_test = split_train_test(X, Y, groups, N_SPLITS)

    # Step 2: run models
    models = [
        CNN_Builder.build_Sequential(model_config)
        for model_config in model_configs
    ]
    for model in models:
        compile_model(model, metric_names=("auprc", "auroc"), optimizer="adam")
    histories = [
        fit_model(model,
                  x_train,
                  y_train,
                  x_test,
                  y_test,
                  batch_size=BATCH_SIZE,
                  epochs=EPOCHS,
                  use_class_weight=USE_CLASS_WEIGHT,
                  use_sample_weight=USE_SAMPLE_WEIGHT,
                  use_reduce_rl=USE_REDUCE_RL,
                  use_early_stopping=USE_EARLY_STOPPING,
                  verbose=VERBOSE) for model in models
    ]

    # Step 3: save artifacts
    script_fn = os.path.basename(__file__)
    folder = script_fn.split(".py")[0]
    model_names = [
        get_model_name(model_configs[i], i + 1)
        for i in range(len(model_configs))
    ]

    save_single_metric_of_multi_models(folder, model_names, histories,
                                       "val_auprc")
    save_all_metrics_of_multi_models(folder, model_names, histories)
    save_history_copies(folder, model_names, histories)

    print("{} finished!".format(script_fn))
Code Example #19
    def __init__(self):
        # logging
        self.logger = logging.getLogger("NER_model")

        # hyperparameters
        self.n_word_features = 1  # number of features per word
        self.window_size = 2
        self.n_features = (2 * self.window_size +
                           1) * self.n_word_features  # number of features for a word within its window
        self.max_length = 200  # maximum sentence length
        self.embed_size = 50  # word embedding dimension
        self.hidden_size = 300  # hidden layer size
        self.batch_size = 32  # batch size (256 is too large)
        self.n_epochs = 1  # number of epochs
        self.dropout = 0.5
        self.learning_rate = 0.001

        # values that may need to change when switching corpora
        self.n_classes = 5  # number of label classes
        self.LBLS = ["PER", "ORG", "LOC", "MISC", "O"]  # class labels

        self.vocab_file = "data/vocab.txt"
        self.word2vec_file = "data/word2vec.txt"

        self.output_path = "results"  # path for saving results
        self.model_output = "model/model.weights"  # saved model weights (written during training)
        self.log_dir = 'logs'

        all_data = load_data()
        self.helper = all_data[0]  # helper[0] is tok2id (word -> id dict); helper[1] is the max sentence length in the training set
        # train_raw, dev_raw, test_raw = all_data[1]  # raw token form: [([word1, word2, ...], [label1, label2, ...]), ...]
        self.train_vec, self.dev_vec, self.test_vec = all_data[2]  # tokens mapped to ids: [([[id1], [id2], ...], [0, 1, 0, 2, 3, ...]), ...]
        self.train_set, self.dev_set, self.test_set = all_data[3]  # each word expanded to its window of words, each sentence padded to a fixed length
        self.helper.save(self.output_path)  # stored as features.pkl (the vocabulary)

        self.model()
Code Example #20
def main(argv):
    args = parser.parse_args(argv[1:])

    (train_x, train_y), (test_x, test_y) = data_util.load_data()
    feat_columns = []
    for key in train_x.keys():
        feat_columns.append(tf.feature_column.numeric_column(key=key))

    classifier = tf.estimator.DNNClassifier(
        feature_columns=feat_columns,
        # Two hidden layers of 10 nodes each.
        hidden_units=[10, 10],
        # The model must choose between 6 classes.
        n_classes=6)

    classifier.train(input_fn=lambda: data_util.train_input_fn(
        train_x, train_y, args.batch_size),
                     steps=args.train_steps)

    eval_result = classifier.evaluate(input_fn=lambda: data_util.eval_input_fn(
        test_x, test_y, args.batch_size))

    print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
Code Example #21
    def score(self):
        '''
        Returns the top-1 accuracy on an unseen test dataset.
        '''
        loader, iteration = data_util.load_data(partition='test')
        #iteration = 1
        data_iter = data_util.inf_generator(loader)
        correct = 0
        for i in range(iteration):
            X, y = data_iter.__next__()

            X = [x.numpy()[0] for x in X]

            predX = Variable(torch.FloatTensor([X]),
                             requires_grad=True).to(device)
            y = Variable(torch.LongTensor([y]), requires_grad=False).to(device)

            y_pred = self.model(predX)

            if y_pred.max(
                    -1
            )[1] == y:  # if class corresponding to max log softmax is the ground truth class
                correct += 1
        return correct / iteration
Code Example #22
tf.flags.DEFINE_string('model_path', './toy_runs/model.ckpt',
                       'path for saving trained parameter')
tf.flags.DEFINE_string('source_file', '../data/letters_source.txt',
                       'path for source file')
tf.flags.DEFINE_integer('print_every', 20,
                        'steps for displaying status of training')

# checkpoint dir check
if not os.path.exists(FLAGS.model_dir):
    os.makedirs(FLAGS.model_dir)

# config loading
config = config.Config()

# data loading
train_source, train_target, valid_source, valid_target = data_util.load_data(
    FLAGS.source_file)
batch_generator = data_util.get_batches(train_source, train_target,
                                        config.num_epochs, config.batch_size)
val_generator = data_util.get_batches(valid_source, valid_target, 500,
                                      config.batch_size)

# model loading
model = Seq2Seq(config, mode='train')
model.build()
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# training
for batch in batch_generator:
    train_loss, step, summary = model.train(sess, *batch)
    if step % FLAGS.print_every == 0:
        # display training status (the body is cut off in the original snippet; this line is an assumption)
        print('step: {}, train loss: {:.4f}'.format(step, train_loss))
Code Example #23
    def fit(self, name, save_weights=False):
        '''
        Trains model using predefined number of epochs, learning rate and number of neurons in
        each hidden layer. Saves epoch results to a file name.csv, where name is replaced with the
        value put in for the name parameter.
        
        name: (str) The name of the file to save the results of this run
        save_weights: (bool) Saves the weights of the best iteration based on validation accuracy
        '''
        print(name)
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        loss = nn.CrossEntropyLoss()  # cross-entropy loss
        lossVal = []  # to create dataframe for logging
        bestValAcc = 0  # to check for saving weights
        for i in range(self.epochs):
            start = time.time()
            loader, iteration = data_util.load_data()
            data_iter = data_util.inf_generator(
                loader
            )  # inf generator taken from rtiqchen implementation of NODE
            train_size = int(iteration *
                             0.8)  # takes 80% of the data for training and 20% for validation
            val_size = int(iteration * 0.2)
            epoch_train_loss = [
            ]  # collect values to update log for post-processing
            epoch_val_loss = []
            train_correct = 0
            val_correct = 0

            for j in range(
                    train_size
            ):  #calculated train size to do train dev split will calculate mean loss at end
                X, y = data_iter.__next__()

                X = [x.numpy()[0] for x in X]

                X = Variable(torch.FloatTensor([X]), requires_grad=True).to(
                    device)  # have to convert to tensor
                #print(X.shape)

                y = Variable(torch.LongTensor([y]),
                             requires_grad=False).to(device)

                optimizer.zero_grad()
                y_pred = self.model(X)
                output = loss(y_pred, y)
                epoch_train_loss.append(output.cpu().detach().numpy())
                if y_pred.max(
                        -1
                )[1] == y:  # if max log softmax corresponds to correct class add 1
                    train_correct += 1

                output.backward()
                optimizer.step()

                #print(list(self.model.parameters()))

            for k in range(val_size):
                X, y = data_iter.__next__()

                X = [x.numpy()[0] for x in X]

                X = Variable(torch.FloatTensor([X]), requires_grad=True).to(
                    device)  # have to convert to tensor

                y = Variable(torch.LongTensor([y]),
                             requires_grad=False).to(device)
                optimizer.zero_grad()
                y_pred = self.model(X)
                output = loss(y_pred, y)
                epoch_val_loss.append(output.cpu().detach().numpy())
                if y_pred.max(
                        -1
                )[1] == y:  # if max log softmax corresponds to correct class add 1
                    val_correct += 1
                valAcc = val_correct / val_size
            if save_weights and valAcc > bestValAcc:
                bestValAcc = valAcc  # remember the best validation accuracy seen so far
                torch.save(self.model.state_dict(),
                           f'../model_weights/{name}.pt'
                           )  # save if we do better than current best

            end = time.time()
            lossVal.append([(end - start) / 60,
                            np.mean(epoch_train_loss),
                            np.mean(epoch_val_loss),
                            train_correct / train_size, val_correct / val_size
                            ])  # save values for reporting
            print('epoch time:', (end - start) / 60, 'min', 'epoch:',
                  '{0}/{1}'.format(i, self.epochs), 'train accuracy:',
                  train_correct / train_size, ', val accuracy:',
                  val_correct / val_size)
            print(
                f'Train loss: {np.mean(epoch_train_loss)}     Val loss: {np.mean(epoch_val_loss)}'
            )
        if 'model_train_results' not in os.listdir('../'):
            os.mkdir('../model_train_results')

        pd.DataFrame(lossVal,
                     columns=[
                         'epoch_time', 'mean_train_loss', 'mean_val_loss',
                         'train_acc', 'val_acc'
                     ]).to_csv('../model_train_results/' + name + '.csv',
                               index=False)  # add epoch length
Code Example #24
def main(_):
    #PART1############################################PREPARE DATA FOR TRAINING############################################
    #1.create vocab
    #_build_vocab(FLAGS.data_en_path, FLAGS.vocabulary_en_path, FLAGS.vocabulary_size_en)
    _build_vocab_en(FLAGS.word2vec_model_path, FLAGS.vocabulary_en_path,
                    FLAGS.vocabulary_size_en)
    _build_vocab(FLAGS.data_cn_path, FLAGS.vocabulary_cn_path,
                 FLAGS.vocabulary_size_cn)
    #2.load vocab
    vocab_cn, vocab_en = load_vocab_as_dict(FLAGS.vocabulary_cn_path,
                                            FLAGS.vocabulary_en_path)
    vocab_en_index2word = dict(
        [val, key] for key, val in vocab_en.items())  #get reverse order.
    #3.load data
    train, valid = load_data(FLAGS.data_folder,
                             FLAGS.data_cn_path,
                             FLAGS.data_en_path,
                             FLAGS.data_en_processed_path,
                             vocab_cn,
                             vocab_en,
                             FLAGS.data_cn_valid_path,
                             FLAGS.data_en_valid_path,
                             FLAGS.sequence_length,
                             test_mode=FLAGS.test_mode)
    trainX, trainY_input, trainY_output = train
    testX, testY_input, testY_output = valid
    #4. print sample data
    print("trainX:", trainX[0:10])
    print("trainY_input:", trainY_input[0:10])
    print("trainY_output:", trainY_output[0:10])
    print("testX:", testX[0:10])
    print("testY_input:", testY_input[0:10])
    print("testY_output:", testY_output[0:10])
    sequence_length_batch = [FLAGS.sequence_length] * FLAGS.batch_size
    # PART1############################################PREPARE DATA FOR TRAINING#############################################
    # PART2############################################TRAINING#############################################################
    # 2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Instantiate Model
        model = seq2seq_attention_model(
            len(vocab_cn),
            FLAGS.learning_rate,
            FLAGS.batch_size,
            FLAGS.decay_steps,
            FLAGS.decay_rate,
            FLAGS.sequence_length,
            len(vocab_en),
            FLAGS.embed_size,
            FLAGS.hidden_size,
            sequence_length_batch,
            FLAGS.is_training,
            decoder_sent_length=FLAGS.decoder_sent_length,
            l2_lambda=FLAGS.l2_lambda)
        # Initialize Save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if FLAGS.use_embedding:  # load pre-trained word embedding
                assign_pretrained_word_embedding(
                    sess,
                    vocab_en_index2word,
                    model,
                    word2vec_model_path=FLAGS.word2vec_model_path)
        curr_epoch = sess.run(model.epoch_step)
        # 3.feed data & training
        number_of_training_data = len(trainX)
        print("number_of_training_data:", number_of_training_data)
        previous_eval_loss = 10000
        best_eval_loss = 10000
        batch_size = FLAGS.batch_size
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                if epoch == 0 and counter == 0:  #print sample to have a look
                    print("trainX[start:end]:", trainX[start:end])
                feed_dict = {
                    model.input_x: trainX[start:end],
                    model.dropout_keep_prob: 0.5
                }
                feed_dict[model.decoder_input] = trainY_input[start:end]
                feed_dict[model.input_y_label] = trainY_output[start:end]
                curr_loss, curr_acc, _ = sess.run(
                    [model.loss_val, model.accuracy, model.train_op],
                    feed_dict)
                loss, counter, acc = loss + curr_loss, counter + 1, acc + curr_acc
                if counter % 50 == 0:
                    print(
                        "seq2seq_with_attention==>Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f"
                        % (epoch, counter, math.exp(loss / float(counter)) if
                           (loss / float(counter)) < 20 else 10000.000,
                           acc / float(counter)))
                ##VALIDATION VALIDATION VALIDATION PART######################################################################################################
                if start % (FLAGS.validate_step * FLAGS.batch_size) == 0:
                    eval_loss, _ = do_eval(sess, model, testX, testY_input,
                                           testY_output, batch_size)
                    print(
                        "seq2seq_with_attention.validation.part. previous_eval_loss:",
                        math.exp(previous_eval_loss) if previous_eval_loss < 20
                        else 10000.000, ";current_eval_loss:",
                        math.exp(eval_loss) if eval_loss < 20 else 10000.000)
                    if eval_loss > previous_eval_loss:  # if loss is not decreasing
                        # reduce the learning rate by a factor of 0.5
                        print(
                            "seq2seq_with_attention==>validation.part.going to reduce the learning rate."
                        )
                        learning_rate1 = sess.run(model.learning_rate)
                        lrr = sess.run([model.learning_rate_decay_half_op])
                        learning_rate2 = sess.run(model.learning_rate)
                        print(
                            "seq2seq_with_attention==>validation.part.learning_rate1:",
                            learning_rate1, " ;learning_rate2:",
                            learning_rate2)
                    else:  # loss is decreasing
                        if eval_loss < best_eval_loss:
                            print(
                                "seq2seq_with_attention==>going to save the model.eval_loss:",
                                math.exp(eval_loss) if eval_loss < 20 else
                                10000.000, ";best_eval_loss:",
                                math.exp(best_eval_loss)
                                if best_eval_loss < 20 else 10000.000)
                            # save model to checkpoint
                            save_path = FLAGS.ckpt_dir + "model.ckpt"
                            saver.save(sess, save_path, global_step=epoch)
                            best_eval_loss = eval_loss
                    previous_eval_loss = eval_loss
                    ##VALIDATION VALIDATION VALIDATION PART######################################################################################################

            # epoch increment
            print("going to increment epoch counter....")
            sess.run(model.epoch_increment)

        # 5. Finally, evaluate on the test set and report test accuracy
        #test_loss, test_acc = do_eval(sess, model, testX, testY, batch_size)
    pass
Code Example #25
import csv
import os

import tensorflow as tf

import data_util
from config import Config
from text_cnn import TextCNN

test_data_file = "data/test/tokenized_reviews.txt"
test_label_file = "data/test/labels.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/checkpoints/cnn"
result_file = "./data/cnn_result.csv"
checkpoint_prefix = os.path.join(checkpoint_dir, "cnn")
max_vocab_size = 5e5
vocab = data_util.Vocab(vocab_file, max_vocab_size)
# load test data
test_docs, seq_len, max_len, test_labels = data_util.load_data(
    test_data_file, test_label_file, vocab)
config = Config(max_vocab_size, max_len)
model = TextCNN(config)
model.build()
sess_config = tf.ConfigProto()
sess_config.gpu_options.allow_growth = True
sess_config.gpu_options.per_process_gpu_memory_fraction = 0.9
sess = tf.Session(config=sess_config)
init = tf.global_variables_initializer()
sess.run(init)
ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
if ckpt and ckpt.model_checkpoint_path:
    model.restore(sess, ckpt.model_checkpoint_path)
else:
    print("no checkpoint saved")
    exit()
Code Example #26
def main():
    parser = OptionParser()
    parser.add_option('-c',
                      '--checkpoint',
                      dest='checkpoint',
                      default='',
                      help='last checkpoint to load for testing')
    parser.add_option('-n',
                      '--model_name',
                      dest='model_name',
                      default='RotationalInvarianceModel',
                      help='model name for output')
    parser.add_option('-b',
                      '--batch_size',
                      dest='batch_size',
                      default=1000,
                      help='batch size for training')
    parser.add_option('-i',
                      '--input_db',
                      dest='input_db',
                      default='characters_test.sqlite',
                      help='input database path')
    options, args = parser.parse_args()

    if not os.path.isfile(options.input_db):
        logger.warning('%s does not exist' % (options.input_db))
        return

    if not os.path.isfile(options.checkpoint + '.index'):
        logger.warning('%s checkpoint does not exist' % (options.checkpoint))
        return

    logger.info('preparing data...')
    data, label = load_data(options.input_db, CHARACTERS)
    rotations = label[:, -1:]
    label = label[:, :-1]
    model = RotationalInvarianceModel(64, 3, 10, model_name=options.model_name)

    logger.info('starting tf Session...')
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        model.load(sess, options.checkpoint)
        try:
            batch_size = int(options.batch_size)
            num_batches = len(data) // batch_size  # integer number of full batches

            accuracy = 0
            error = 0
            for n in range(num_batches):
                batch_data = data[n * batch_size:(n + 1) * batch_size, :]
                batch_label = label[n * batch_size:(n + 1) * batch_size, :]
                batch_rotation = rotations[n * batch_size:(n + 1) * batch_size, :]

                classes, rotate = model.predict(sess, batch_data)
                accuracy += compute_accuracy(classes, batch_label)
                error += compute_error(rotate, batch_rotation)
            accuracy /= num_batches
            error /= num_batches
            logger.info('Test Accuracy: %f | Test Rotation Error: %f' %
                        (accuracy, error))
        except KeyboardInterrupt:
            logger.info('stop testing.')
Code Example #27
num_layers = 1
summary_len = 100
beam_depth = 4
state_size = 50
mode = "train"
doc_file = "data/modified_train_article.txt"
sum_file = "data/modified_train_abstract.txt"
vocab_file = "data/vocab"
checkpoint_dir = "./save/quasi/checkpoints"
checkpoint_prefix = os.path.join(checkpoint_dir, "baseline")
dev_doc_file = "data/val_article.txt"
dev_sum_file = "data/val_abstract.txt"
# load source and target data
docs, sums, vocab = load_data(doc_file,
                              sum_file,
                              vocab_file,
                              max_vocab_size,
                              debug=debug,
                              max_num_tokens=max_num_tokens)
dev_docs, dev_sums = load_valid_data(dev_doc_file,
                                     dev_sum_file,
                                     vocab,
                                     max_num_tokens,
                                     debug=debug)
vocab_size = vocab.size()


# self, vocab_size, embedding_size, state_size, num_layers,
#                  decoder_vocab_size, attention_hidden_size, mode, beam_depth,
#                  learning_rate, max_iter=100, attention_mode="Bahdanau"):
def load_glove(glove_file, vocab, embedding_size):
    print("load pretrained glove from : {}".format(glove_file))
Code Example #28
def main(_):
    #if FLAGS.use_pingyin:
    vocabulary_word2index, vocabulary_index2word, vocabulary_label2index, vocabulary_index2label = create_vocabulary(
        FLAGS.traning_data_path,
        FLAGS.vocab_size,
        name_scope=FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    vocab_size = len(vocabulary_word2index)
    print("cnn_model.vocab_size:", vocab_size)
    num_classes = len(vocabulary_index2label)
    print("num_classes:", num_classes)
    train, valid, test, true_label_percent = load_data(
        FLAGS.traning_data_path,
        vocabulary_word2index,
        vocabulary_label2index,
        FLAGS.sentence_len,
        FLAGS.model_name,
        tokenize_style=FLAGS.tokenize_style)
    trainX1, trainX2, trainBlueScores, trainY = train
    validX1, validX2, validBlueScores, validY = valid
    testX1, testX2, testBlueScores, testY = test
    length_data_mining_features = len(trainBlueScores[0])
    print("length_data_mining_features:", length_data_mining_features)
    #print some message for debug purpose
    print("model_name:", FLAGS.model_name, ";length of training data:",
          len(trainX1), ";length of validation data:", len(testX1),
          ";true_label_percent:", true_label_percent, ";tokenize_style:",
          FLAGS.tokenize_style, ";vocabulary size:", vocab_size)
    print("train_x1:", trainX1[0], ";train_x2:", trainX2[0])
    print("data mining features.length:", len(trainBlueScores[0]),
          "data_mining_features:", trainBlueScores[0], ";train_y:", trainY[0])
    #2.create session.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        #Instantiate Model
        textCNN = DualBilstmCnnModel(
            filter_sizes,
            FLAGS.num_filters,
            num_classes,
            FLAGS.learning_rate,
            FLAGS.batch_size,
            FLAGS.decay_steps,
            FLAGS.decay_rate,
            FLAGS.sentence_len,
            vocab_size,
            FLAGS.embed_size,
            FLAGS.hidden_size,
            FLAGS.is_training,
            model=FLAGS.model_name,
            similiarity_strategy=FLAGS.similiarity_strategy,
            top_k=FLAGS.top_k,
            max_pooling_style=FLAGS.max_pooling_style,
            length_data_mining_features=length_data_mining_features)
        #Initialize Save
        saver = tf.train.Saver()
        if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint.")
            saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
            if FLAGS.decay_lr_flag:
                #trainX1, trainX2, trainY = shuffle_data(trainX1, trainX2, trainY)
                for i in range(2):  # decay learning rate if necessary.
                    print(i, "Going to decay learning rate by half.")
                    sess.run(textCNN.learning_rate_decay_half_op)
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())
            if not os.path.exists(FLAGS.ckpt_dir):
                os.makedirs(FLAGS.ckpt_dir)

            if FLAGS.use_pretrained_embedding:  #load pre-trained word embedding
                print("===>>>going to use pretrained word embeddings...")
                assign_pretrained_word_embedding(sess, vocabulary_index2word,
                                                 vocab_size, textCNN,
                                                 FLAGS.word2vec_model_path)
        curr_epoch = sess.run(textCNN.epoch_step)
        #3.feed data & training
        number_of_training_data = len(trainX1)
        batch_size = FLAGS.batch_size
        iteration = 0
        best_acc = 0.60
        best_f1_score = 0.20
        weights_dict = init_weights_dict(
            vocabulary_label2index)  #init weights dict.
        for epoch in range(curr_epoch, FLAGS.num_epochs):
            print("Auto.Going to shuffle data")
            trainX1, trainX2, trainBlueScores, trainY = shuffle_data(
                trainX1, trainX2, trainBlueScores, trainY)
            loss, eval_acc, counter = 0.0, 0.0, 0
            for start, end in zip(
                    range(0, number_of_training_data, batch_size),
                    range(batch_size, number_of_training_data, batch_size)):
                iteration = iteration + 1
                input_x1, input_x2, input_bluescores, input_y = generate_batch_training_data(
                    trainX1, trainX2, trainBlueScores, trainY,
                    number_of_training_data, batch_size)
                #input_x1=trainX1[start:end]
                #input_x2=trainX2[start:end]
                #input_bluescores=trainBlueScores[start:end]
                #input_y=trainY[start:end]
                weights = get_weights_for_current_batch(input_y, weights_dict)

                feed_dict = {
                    textCNN.input_x1: input_x1,
                    textCNN.input_x2: input_x2,
                    textCNN.input_bluescores: input_bluescores,
                    textCNN.input_y: input_y,
                    textCNN.weights: np.array(weights),
                    textCNN.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    textCNN.iter: iteration,
                    textCNN.tst: not FLAGS.is_training
                }
                curr_loss, curr_acc, lr, _, _ = sess.run([
                    textCNN.loss_val, textCNN.accuracy, textCNN.learning_rate,
                    textCNN.update_ema, textCNN.train_op
                ], feed_dict)
                loss, eval_acc, counter = loss + curr_loss, eval_acc + curr_acc, counter + 1
                if counter % 100 == 0:
                    print(
                        "Epoch %d\tBatch %d\tTrain Loss:%.3f\tAcc:%.3f\tLearning rate:%.5f"
                        % (epoch, counter, loss / float(counter),
                           eval_acc / float(counter), lr))
                #middle checkpoint
                #if start!=0 and start%(500*FLAGS.batch_size)==0: # eval every 3000 steps.
                #eval_loss, acc,f1_score, precision, recall,_ = do_eval(sess, textCNN, validX1, validX2, validY,iteration)
                #print("【Validation】Epoch %d Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f" % (epoch, acc,eval_loss, f1_score, precision, recall))
                # save model to checkpoint
                #save_path = FLAGS.ckpt_dir + "model.ckpt"
                #saver.save(sess, save_path, global_step=epoch)
            #epoch increment
            print("going to increment epoch counter....")
            sess.run(textCNN.epoch_increment)

            # 4.validation
            print(epoch, FLAGS.validate_every,
                  (epoch % FLAGS.validate_every == 0))

            if epoch % FLAGS.validate_every == 0:
                eval_loss, eval_accc, f1_scoree, precision, recall, weights_label = do_eval(
                    sess, textCNN, validX1, validX2, validBlueScores, validY,
                    iteration, vocabulary_index2word)
                weights_dict = get_weights_label_as_standard_dict(
                    weights_label)
                print("label accuracy(used for label weight):==========>>>>",
                      weights_dict)
                print(
                    "【Validation】Epoch %d\t Loss:%.3f\tAcc %.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f"
                    % (epoch, eval_loss, eval_accc, f1_scoree, precision,
                       recall))
                #save model to checkpoint
                if eval_accc * 1.05 > best_acc and f1_scoree > best_f1_score:
                    save_path = FLAGS.ckpt_dir + "model.ckpt"
                    print("going to save model. eval_f1_score:", f1_scoree,
                          ";previous best f1 score:", best_f1_score,
                          ";eval_acc", str(eval_accc), ";previous best_acc:",
                          str(best_acc))
                    saver.save(sess, save_path, global_step=epoch)
                    best_acc = eval_accc
                    best_f1_score = f1_scoree

                if FLAGS.decay_lr_flag and (epoch != 0 and
                                            (epoch == 1 or epoch == 3
                                             or epoch == 5 or epoch == 8)):
                    #TODO print("Auto.Restoring Variables from Checkpoint.")
                    #TODO saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))

                    for i in range(2):  # decay learning rate if necessary.
                        print(i, "Going to decay learning rate by half.")
                        sess.run(textCNN.learning_rate_decay_half_op)

        # 5. Finally, evaluate on the test set and report test accuracy
        test_loss, acc_t, f1_score_t, precision, recall, weights_label = do_eval(
            sess, textCNN, testX1, testX2, testBlueScores, testY, iteration,
            vocabulary_index2word)
        print(
            "Test Loss:%.3f\tAcc:%.3f\tF1 Score:%.3f\tPrecision:%.3f\tRecall:%.3f:"
            % (test_loss, acc_t, f1_score_t, precision, recall))
    pass
Code Example #29
def train():
    logging.info("Preparing summarization data.")
    docid, sumid, doc_dict, sum_dict = \
        data_util.load_data(
            FLAGS.data_dir + "/train/train.article.txt",
            FLAGS.data_dir + "/train/train.title.txt",
            FLAGS.data_dir + "/doc_dict.txt",
            FLAGS.data_dir + "/sum_dict.txt",
            FLAGS.doc_vocab_size, FLAGS.sum_vocab_size)

    val_docid, val_sumid = \
        data_util.load_valid_data(
            FLAGS.data_dir + "/train/valid.article.filter.txt",
            FLAGS.data_dir + "/train/valid.title.filter.txt",
            doc_dict, sum_dict)

    with tf.Session() as sess:
        # Create model.
        logging.info("Creating %d layers of %d units." %
                     (FLAGS.num_layers, FLAGS.size))
        train_writer = tf.summary.FileWriter(FLAGS.tfboard, sess.graph)
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        logging.info("Create buckets.")
        dev_set = create_bucket(val_docid, val_sumid)
        train_set = create_bucket(docid, sumid)

        train_bucket_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        for (s_size, t_size), nsample in zip(_buckets, train_bucket_sizes):
            logging.info("Train set bucket ({}, {}) has {} samples.".format(
                s_size, t_size, nsample))

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = sess.run(model.global_step)

        while current_step <= FLAGS.max_iter:
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, encoder_len, decoder_len = \
                model.get_batch(train_set, bucket_id)
            step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs,
                                      encoder_len, decoder_len, False,
                                      train_writer)

            step_time += (time.time() - start_time) / \
                FLAGS.steps_per_validation
            loss += step_loss * FLAGS.batch_size / np.sum(decoder_len) \
                / FLAGS.steps_per_validation
            current_step += 1

            # Once in a while, we save checkpoint.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "model.ckpt")
                model.saver.save(sess,
                                 checkpoint_path,
                                 global_step=model.global_step)

            # Once in a while, we print statistics and run evals.
            if current_step % FLAGS.steps_per_validation == 0:
                # Print statistics for the previous epoch.
                perplexity = np.exp(float(loss))
                logging.info("global step %d step-time %.2f ppl %.2f" %
                             (model.global_step.eval(), step_time, perplexity))

                step_time, loss = 0.0, 0.0
                # Run evals on development set and print their perplexity.
                for bucket_id in range(len(_buckets)):
                    if len(dev_set[bucket_id]) == 0:
                        logging.info("  eval: empty bucket %d" % (bucket_id))
                        continue
                    encoder_inputs, decoder_inputs, encoder_len, decoder_len =\
                        model.get_batch(dev_set, bucket_id)
                    eval_loss, _ = model.step(sess, encoder_inputs,
                                              decoder_inputs, encoder_len,
                                              decoder_len, True)
                    eval_loss = eval_loss * FLAGS.batch_size \
                        / np.sum(decoder_len)
                    eval_ppx = np.exp(float(eval_loss))
                    logging.info("  eval: bucket %d ppl %.2f" %
                                 (bucket_id, eval_ppx))
                sys.stdout.flush()
Code Example #30
def main():
    train_x, train_y, val_x, val_y, test_x, test_y, vocab_size = load_data()

    label_size = 10
    learning_rate = 0.01
    batch_size = 128
    decay_steps = 20000
    decay_rate = 0.8
    ckpt_dir = "fast_text_checkpoint/"
    sentence_len = 200
    embed_size = 100
    is_training = True
    num_epochs = 15
    validate_every = 1


    print("start padding...")

    train_x = pad_sequences(train_x, maxlen=sentence_len, value = 0)
    val_x = pad_sequences(val_x, maxlen=sentence_len, value = 0)
    test_x = pad_sequences(test_x, maxlen=sentence_len, value=0)
    print("end padding...")

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:

        fast_text = fastText(label_size=label_size,
                             learning_rate=learning_rate,
                             batch_size=batch_size,
                             decay_step=decay_steps,
                             decay_rate=decay_rate,
                             sentence_len=sentence_len,
                             vocab_size=vocab_size,
                             embed_size=embed_size,
                             is_training=is_training)

        saver = tf.train.Saver()
        if os.path.exists(ckpt_dir + "checkpoint"):
            print("Restoring Variables from Checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(ckpt_dir))
        else:
            print('Initializing Variables')
            sess.run(tf.global_variables_initializer())

        curr_epoch = sess.run(fast_text.epoch_step)

        number_of_training_data = len(train_x)
        batch_size = batch_size

        for epoch in range(curr_epoch, num_epochs):
            loss, acc, counter = 0.0, 0.0, 0
            for start, end in zip(range(0, number_of_training_data, batch_size), range(batch_size, number_of_training_data, batch_size)):

                if epoch == 0 and counter == 0:
                    print("trainX[start:end]:",train_x[start:end].shape)
                    print("trainY[start:end]:",train_y[start:end].shape)


                curr_loss, curr_acc, _ = sess.run([fast_text.loss_val, fast_text.accuracy, fast_text.train_op],
                                                  feed_dict= \
                                                      {   fast_text.sentence : train_x[start : end],
                                                          fast_text.labels : train_y[start : end]}
                                                  )
                loss, acc, counter = loss + curr_loss, acc + curr_acc, counter + 1

                if counter % 500 == 0:
                    print(epoch)
                    print(counter)
                    print(loss)
                    print(acc)
                    print("Epoch %d\tBatch %d\tTrain Loss:%.3f\tTrain Accuracy:%.3f" % (epoch, counter, loss / float(counter), acc / float(counter)))

            print("going to increment epoch counter....")
            sess.run(fast_text.epoch_increment)

            print(epoch, validate_every, (epoch % validate_every == 0))

            if epoch % validate_every == 0:
                eval_loss, eval_acc = do_eval(sess, fast_text, val_x, val_y, batch_size)
                print("Epoch %d Validation Loss:%.3f\tValidation Accuracy: %.3f" % (epoch, eval_loss, eval_acc))

                # save model to checkpoint
                save_path = ckpt_dir + "model.ckpt"
                saver.save(sess, save_path, global_step=fast_text.epoch_step)  # fast_text.epoch_step

        test_loss, test_acc = do_eval(sess, fast_text, test_x, test_y, batch_size)
        print("test Loss:%.3f\ttest Accuracy: %.3f" % (test_loss, test_acc))
    return
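Note: example #30 calls a do_eval helper that is not included here. A minimal sketch, assuming it simply averages the model's loss and accuracy ops over the evaluation data in batches (the tensor names are taken from the training loop above):

def do_eval(sess, fast_text, eval_x, eval_y, batch_size):
    total_loss, total_acc, n_batches = 0.0, 0.0, 0
    for start, end in zip(range(0, len(eval_x), batch_size),
                          range(batch_size, len(eval_x), batch_size)):
        curr_loss, curr_acc = sess.run(
            [fast_text.loss_val, fast_text.accuracy],
            feed_dict={fast_text.sentence: eval_x[start:end],
                       fast_text.labels: eval_y[start:end]})
        total_loss += curr_loss
        total_acc += curr_acc
        n_batches += 1
    return total_loss / max(n_batches, 1), total_acc / max(n_batches, 1)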