Example #1
def load_data_nonclass(filepath_news, filepath_stock):
    # Read the news corpus and the raw stock prices, then scale the prices
    newslist = preprocess.read_news(filepath_news)
    sentences = preprocess.news_to_sentences(newslist)
    prices = preprocess.read_price(filepath_stock)
    prices = scaler.fit_transform(prices)
    news = preprocess.sentences_to_nparray(sentences)
    # news = scaler_news.fit_transform(news)
    # Build (history, target) pairs over a look_back window of past prices
    hisprice, y = preprocess.data_process(prices, look_back)
    # Drop the first look_back news vectors so they align with the price windows
    return news[look_back:], hisprice, y
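
A minimal usage sketch, assuming the module-level `preprocess`, `scaler`, and `look_back` objects referenced above are already set up; the file paths below are placeholders:

# Hypothetical invocation; the paths are placeholders
news, hisprice, y = load_data_nonclass("data/news.txt", "data/prices.csv")
print(news.shape, hisprice.shape, y.shape)
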
def inference(model):
    print("Loading test data...")
    start_time = time.time()
    x_test, y_test = data_process(valid_dir, config.seq_length)

    # Create session and restore model checkpoint
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=saver_dir)

    print('Testing...')
    loss_test, acc_test = evaluate(session, x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    # Run prediction over the test set in batches
    batch_size = 128
    data_len = len(x_test)
    num_batch = int((data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)
    y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)

    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        feed_dict = {
            model.input_x: x_test[start_id:end_id],
            model.keep_prob: 1.0
        }
        y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls,
                                                  feed_dict=feed_dict)

    # Evaluate
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=label_category))

    # Print confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
Example #3
def get_dataloader(args):
    # Build a test-mode DataLoader and attach it to args.data
    data = None
    setattr(args, 'mode', 'test')
    if args.raw_file:
        source = data_process(filelist=[args.raw_file],
                              word2index=args.src_word2index,
                              lower=args.lower)
        del args.src_word2index
        max_src_len = max(len(seq) for seq in source)

        data = {'source': source, 'max_src_len': max_src_len}

    dataset, batch_size = get_data(args=args, data=data)
    dataset = DataLoader(dataset=dataset,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=0,
                         pin_memory=True)
    setattr(args, 'data', dataset)
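
A hedged example of driving `get_dataloader` from a test entry point. The `argparse.Namespace` fields other than `raw_file`, `src_word2index`, and `lower` are not referenced above, and the pre-built vocabulary is an assumption:

import argparse

args = argparse.Namespace(raw_file="test.src",
                          src_word2index=src_word2index,  # vocabulary built during preprocessing (assumed)
                          lower=True)
get_dataloader(args)  # sets args.mode = 'test' and attaches the DataLoader to args.data
for batch in args.data:
    pass  # feed each batch to the trained model here
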
def train(model):
    """Train model: split train data, train model, model save and result print"""

    # Configure TensorBoard summaries
    print('Configuring TensorBoard and Saver ...')
    if not os.path.exists(tensorboard_dir):
        os.mkdir(tensorboard_dir)
    tf.summary.scalar('loss', model.loss)
    tf.summary.scalar('accuracy', model.acc)
    merged_summary = tf.summary.merge_all()
    summary_writer = tf.summary.FileWriter(tensorboard_dir)

    # Configure the checkpoint saver
    saver = tf.train.Saver()
    if not os.path.exists(saver_dir):
        os.mkdir(saver_dir)

    # Loading training data and validation data
    print('Loading training data and validation data ...')
    start_time = time.time()
    x_train, y_train = data_process(train_dir, config.max_length)
    x_valid, y_valid = data_process(valid_dir, config.max_length)
    time_dif = get_time_dif(start_time)
    print('Loading data ok!')
    print('Time usage:', time_dif)

    # Create session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    summary_writer.add_graph(session.graph)

    # Bookkeeping for training and early stopping
    total_batch = 0
    best_val_acc = 0.
    last_improved = 0
    early_stop_batch = 1000

    print('Training and evaluating ...')
    start_time = time.time()
    is_early_stop = False
    for epoch in range(config.num_epochs):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, config.batch_size)

        for x_batch, y_batch in batch_train:
            feed_dict = {
                model.input_x: x_batch,
                model.input_y: y_batch,
                model.dropout_keep_prob: config.dropout_keep_prob
            }

            # Write a TensorBoard summary every save_per_batch batches
            if total_batch % config.save_per_batch == 0:
                graph = session.run(merged_summary, feed_dict=feed_dict)
                summary_writer.add_summary(graph, total_batch)

            # Evaluate and report metrics every print_per_batch batches
            if total_batch % config.print_per_batch == 0:
                feed_dict[model.dropout_keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc],
                                                    feed_dict=feed_dict)
                print()
                loss_val, acc_val = evaluate(session, x_valid, y_valid)

                # save best model by acc
                if acc_val > best_val_acc:
                    best_val_acc = acc_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=saver_dir)
                    improved_str = '//improved'
                else:
                    improved_str = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(
                    msg.format(total_batch, loss_train, acc_train, loss_val,
                               acc_val, time_dif, improved_str))

            # Run one optimization step
            session.run(model.optim, feed_dict=feed_dict)
            total_batch += 1

            # Early stopping: stop if no improvement for early_stop_batch batches
            if total_batch - last_improved > early_stop_batch:
                print("No optimization for a long time, auto-stopping...")
                is_early_stop = True
                break
        if is_early_stop:
            break
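
A possible entry point for the routine above; `TCNNConfig` and `TextCNN` are assumed names for the config and model classes whose attributes (`max_length`, `batch_size`, `num_epochs`, `loss`, `acc`, `optim`, ...) the function references:

if __name__ == '__main__':
    config = TCNNConfig()    # assumed config class
    model = TextCNN(config)  # assumed model exposing input_x, input_y, dropout_keep_prob, loss, acc, optim
    train(model)
    # inference(model) from Example #1 could then restore the saved checkpoint for testing
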
Example #5
def train(chm, model_name, data_path, generations, window_size, smooth_size,
          missing, n_cores, verbose):

    if verbose:
        print("Preprocessing data...")

    # ------------------ Config ------------------
    model_name += "_chm_" + chm
    model_repo = join_paths("./" + instance_name, "models", verb=False)
    model_repo = join_paths(model_repo, model_name, verb=False)
    model_path = model_repo + "/" + model_name + ".pkl"

    train1_paths = [
        data_path + "/chm" + chm + "/simulation_output/train1/gen_" +
        str(gen) + "/" for gen in generations
    ]
    train2_paths = [
        data_path + "/chm" + chm + "/simulation_output/train2/gen_" +
        str(gen) + "/" for gen in generations
    ]
    val_paths = [
        data_path + "/chm" + chm + "/simulation_output/val/gen_" + str(gen) +
        "/" for gen in generations
    ]

    position_map_file = data_path + "/chm" + chm + "/positions.txt"
    reference_map_file = data_path + "/chm" + chm + "/references.txt"
    population_map_file = data_path + "/populations.txt"

    # ------------------ Process data ------------------
    # gather feature data files (binary representation of variants)
    X_fname = "mat_vcf_2d.npy"
    X_train1_files = [p + X_fname for p in train1_paths]
    X_train2_files = [p + X_fname for p in train2_paths]
    X_val_files = [p + X_fname for p in val_paths]

    # gather label data files (population)
    labels_fname = "mat_map.npy"
    labels_train1_files = [p + labels_fname for p in train1_paths]
    labels_train2_files = [p + labels_fname for p in train2_paths]
    labels_val_files = [p + labels_fname for p in val_paths]

    # load the data
    train_val_files = [
        X_train1_files, labels_train1_files, X_train2_files,
        labels_train2_files, X_val_files, labels_val_files
    ]
    X_train1_raw, labels_train1_raw, X_train2_raw, labels_train2_raw, X_val_raw, labels_val_raw = [
        load_np_data(f) for f in train_val_files
    ]

    # optionally include generation-0 data
    if gen_0:
        if verbose:
            print("Including generation 0...")

        # load the generation-0 data
        gen_0_sets = ["train1", "train2"]
        X_train1_raw_gen_0, y_train1_raw_gen_0, X_train2_raw_gen_0, y_train2_raw_gen_0 = get_gen_0(
            data_path + "/chm" + chm, population_map_file, gen_0_sets)

        # append it to both training sets
        X_train1_raw = np.concatenate([X_train1_raw, X_train1_raw_gen_0])
        labels_train1_raw = np.concatenate(
            [labels_train1_raw, y_train1_raw_gen_0])
        X_train2_raw = np.concatenate([X_train2_raw, X_train2_raw_gen_0])
        labels_train2_raw = np.concatenate(
            [labels_train2_raw, y_train2_raw_gen_0])

        # free the temporary generation-0 arrays
        del X_train1_raw_gen_0, y_train1_raw_gen_0, X_train2_raw_gen_0, y_train2_raw_gen_0

    # reshape according to window size
    X_train1, labels_window_train1 = data_process(X_train1_raw,
                                                  labels_train1_raw,
                                                  window_size, missing)
    X_train2, labels_window_train2 = data_process(X_train2_raw,
                                                  labels_train2_raw,
                                                  window_size, missing)
    X_val, labels_window_val = data_process(X_val_raw, labels_val_raw,
                                            window_size, missing)

    del X_train1_raw, X_train2_raw, X_val_raw, labels_train1_raw, labels_train2_raw, labels_val_raw

    # necessary arguments for model
    snp_pos = np.loadtxt(position_map_file, delimiter='\n').astype("int")
    snp_ref = np.loadtxt(reference_map_file, delimiter='\n', dtype=str)
    pop_order = np.genfromtxt(population_map_file, dtype="str")
    chm_len = len(snp_pos)
    num_anc = len(pop_order)

    # ------------------ Train model ------------------
    # init, train, evaluate and save model
    if verbose:
        print("Initializing XGMix model and training...")
    model = XGMIX(chm_len,
                  window_size,
                  smooth_size,
                  num_anc,
                  snp_pos,
                  snp_ref,
                  pop_order,
                  calibrate=calibrate,
                  cores=n_cores)
    model.train(X_train1,
                labels_window_train1,
                X_train2,
                labels_window_train2,
                X_val,
                labels_window_val,
                retrain_base=retrain_base,
                verbose=verbose)

    # evaluate the model on the validation set and save it to disk
    analysis_path = join_paths(model_repo, "analysis", verb=False)
    CM(labels_window_val.ravel(),
       model.predict(X_val).ravel(), pop_order, analysis_path, verbose)
    with open(model_path, "wb") as f:
        pickle.dump(model, f)

    return model
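
A hypothetical call, assuming the module-level names the function relies on (`instance_name`, `gen_0`, `calibrate`, `retrain_base`, `join_paths`, `load_np_data`, `get_gen_0`, `XGMIX`, `CM`) are defined; the arguments below are placeholders, not values from the source:

model = train(chm="20",
              model_name="xgmix",
              data_path="./demo_data",
              generations=[2, 4, 8],
              window_size=500,
              smooth_size=75,
              missing=0.0,
              n_cores=4,
              verbose=True)
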