def polish_pickle(feature_params, test_accuracy):
    """Re-save a trained-model pickle with ``feature_params`` stored in it.

    The file name is derived by ``tr.make_pickle_filename`` from the
    'trained_models' prefix, ``feature_params`` and ``test_accuracy``.  The
    loaded payload gets ``feature_params`` written under the key
    'feature_params', is saved back to the same file, and is then re-read so
    the stored value can be printed.
    """
    pickle_path = tr.make_pickle_filename(
        'trained_models', feature_params, test_accuracy)

    payload = tr.load_data(pickle_path)
    payload['feature_params'] = feature_params
    tr.save_data(payload, pickle_path)

    # Read the file back so we print what is actually on disk.
    reloaded = tr.load_data(pickle_path)
    print('feature_params:', reloaded['feature_params'].str())
Esempio n. 2
0
def best_feature(temporal_rel):
    """Compare every feature used in isolation against all features combined.

    For each feature name a random forest with 100 trees is trained on data
    loaded with only that feature, and F1, recall and precision are computed
    on the test part of the split.  A final run without a feature restriction
    is recorded under the label "all".  The F1 scores are handed to ``plot``
    and the recall and precision lists are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``.  ``None`` selects the
            "best_feature_weighted.jpg" output name; any other value is
            embedded in the file name via ``str()``.
    """
    features = [
        "pos", "stem", "aspect", "tense", "distance", "similarity", "polarity",
        "modality"
    ]

    accuracies = []
    recall = []
    precision = []

    def evaluate(label, **load_kwargs):
        """Train on one feature selection and record its scores under label."""
        X, y = load_data(True, temporal_rel, **load_kwargs)
        X_train, X_test, y_train, y_test = split(X, y)

        rf = RandomForestClassifier(n_jobs=2, n_estimators=100)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        accuracies.append({label: f1_score(y_test, y_pred)})
        recall.append({label: recall_score(y_test, y_pred)})
        precision.append({label: precision_score(y_test, y_pred)})

    for feature in features:
        evaluate(feature, features=[feature])
        print("Done with feature")

    # Reference run: no feature restriction.
    evaluate("all")
    features.append("all")

    # Every entry is a one-item dict.  list() is required because dict views
    # cannot be indexed on Python 3.
    data = [list(entry.values())[0] for entry in accuracies]

    if temporal_rel is None:
        filename = "best_feature_weighted.jpg"
    else:
        filename = "best_feature_" + str(temporal_rel) + ".jpg"
    plot(filename, "feature", "f1_score", data, features)

    # Single-argument print calls give the same output on Python 2 and 3.
    print(recall)
    print(precision)
Esempio n. 3
0
def get_distance_data(data, temporal_rel):
    """Append one outcome record per test sample to ``data``.

    A random forest (100 trees) is trained on the training part of the split
    and used to predict the test part.  For every test sample whose true and
    predicted labels fall into one of the four 0/1 confusion cases, a
    single-item dict ``{distance: [TP?, TN?, FP?, FN?]}`` is appended to
    ``data``.  Nothing is returned; ``data`` is modified in place.
    """
    X, y, distance = load_data(True, temporal_rel, distance=True)
    X_train, X_test, y_train, y_test, distance_train, distance_test = split(
        X, y, distance)

    forest = RandomForestClassifier(n_jobs=2, n_estimators=100)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)

    # Flag order: [TruePositive?, TrueNegative?, FalsePositive?, FalseNegative?]
    for idx in range(len(X_test)):
        actual = y_test[idx]
        predicted = y_pred[idx]

        if actual == predicted:
            if actual == 1:
                flags = [True, False, False, False]   # true positive
            elif actual == 0:
                flags = [False, True, False, False]   # true negative
            else:
                continue
        elif predicted == 1:
            flags = [False, False, True, False]       # false positive
        elif predicted == 0:
            flags = [False, False, False, True]       # false negative
        else:
            continue

        data.append({distance_test[idx]: flags})
Esempio n. 4
0
def main():
    """Parse the command line and print G2P predictions for the given words.

    Every argument is an optional positional (``nargs='?'``) with a default.
    The parsed namespace is forwarded as keyword arguments to ``load_data``
    and ``load_model``; the model checkpoint is read from
    ``<result_dir>/<exp_name>/mdl.ckpt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('pron_path',
                        default='./data/prondict_ice.txt',
                        nargs='?')
    parser.add_argument(
        'words',
        default=['adolfsdóttir', 'skynsemi', 'uppvaxtarskilyrði'],
        nargs='?')
    parser.add_argument('exp_name', default='g2p_ice', nargs='?')
    # Command-line values arrive as strings; convert the numeric ones so they
    # have the same type as their defaults.
    parser.add_argument('emb_dim', default=500, type=int, nargs='?')
    parser.add_argument('hidden_dim', default=500, type=int, nargs='?')
    # NOTE(review): a value supplied for `cuda` or `data_splits` is still a
    # plain string (so 'False' is truthy) — confirm how callers use them.
    parser.add_argument('cuda', default=True, nargs='?')
    parser.add_argument('seed', default=1337, type=int, nargs='?')
    parser.add_argument('result_dir', default='./results', nargs='?')
    parser.add_argument('data_splits', default=(0.9, 0.05, 0.05), nargs='?')
    args = parser.parse_args()

    exp_dir = os.path.join(args.result_dir, args.exp_name)
    ckp_path = os.path.join(exp_dir, 'mdl.ckpt')

    full_ds, _ = load_data(args.pron_path, args.data_splits, **vars(args))
    model = load_model(full_ds.num_graphemes, full_ds.num_phonemes, ckp_path,
                       **vars(args))

    # With nargs='?' a word given on the command line is one string, not a
    # list.  Wrap it so the loop does not iterate over its characters.
    words = [args.words] if isinstance(args.words, str) else args.words
    for word in words:
        print(word)
        predict(model, word, full_ds)
Esempio n. 5
0
def get_data(dataset, data_dir, batch_size, test_batch_size):
    '''
    Build the ImageNet train/validation loaders and the loss criterion.

    The training data is read from ``<data_dir>/train`` and the validation
    data from ``<data_dir>/validation``.  The incoming ``dataset`` argument is
    not read.  Returns ``(train_loader, val_loader, criterion)``.
    '''
    train_root = os.path.join(data_dir, 'train')
    val_root = os.path.join(data_dir, 'validation')
    print('train_dir is ', train_root)

    train_set, val_set, train_sampler, test_sampler = load_data(
        train_root, val_root, False, get_world_size() > 1)

    # Settings shared by both loaders: one worker, pinned memory (for CUDA).
    shared = dict(num_workers=1, pin_memory=True)
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=batch_size, sampler=train_sampler, **shared)
    val_loader = torch.utils.data.DataLoader(
        val_set, batch_size=test_batch_size, sampler=test_sampler, **shared)

    return train_loader, val_loader, torch.nn.CrossEntropyLoss()
def eval_mode_batch(output_tags, confidences, cities):
    """Evaluate tagged articles and print precision, recall and F1 per tag.

    Args:
        output_tags: Passed to ``load_data``, which yields the tagged
            sentences and their identifier lines.
        confidences: One entry of tag confidences per tagged sentence.
        cities: Forwarded unchanged to ``predict_mode``.

    Raises:
        AssertionError: If ``confidences`` and the tagged data differ in
            length.
    """
    tagged_data, identifier = load_data(output_tags)
    num_tags = len(int2tags) - 1

    assert len(tagged_data) == len(confidences)
    for i, entry in enumerate(tagged_data):
        # Only the first num_tags comma-separated fields are gold entities.
        gold_ents = identifier[i].split(',')[:num_tags]  # Throw away title

        output_pred_line, _, _ = predict_mode(entry[0], entry[1],
                                              confidences[i], cities)
        predictions = output_pred_line.split(" ### ")

        # Evaluate the predictions.
        evaluateArticle(predictions, gold_ents)

    print("------------\nEvaluation Stats: (Precision, Recall, F1):")
    for tag in GOLD:
        correct, predicted, gold = CORRECT[tag], PRED[tag], GOLD[tag]
        # float() forces true division on Python 2; an empty denominator
        # yields 0.0 instead of raising ZeroDivisionError.
        prec = float(correct) / predicted if predicted else 0.0
        rec = float(correct) / gold if gold else 0.0
        f1 = (2 * prec * rec) / (prec + rec) if (prec + rec) else 0.0
        # Joined by hand so the line is identical on Python 2 and 3.
        print(" ".join(
            str(value)
            for value in (tag, prec, rec, f1, "########", correct, predicted,
                          gold)))
def main(_):
    """Evaluate a popularity-based ranking on the DEV and TEST splits.

    Items are ordered with ``sort_items_by_popularity``; the item at position
    ``i`` of that ordering receives the score ``len(items) - i``.  The scores
    are evaluated with ``random_eval`` (100 random items) on both splits and
    the resulting metrics are printed.
    """
    _, dev, test, samples, _, _, data = load_data(
        FLAGS.pop_prep_dir, FLAGS.pop_dataset, FLAGS.pop_prep_name,
        FLAGS.pop_debug)
    sorted_items = sort_items_by_popularity(samples)
    print_most_popular_items(data, sorted_items)

    n_items = len(sorted_items)
    pop_ranking = np.ones((1, n_items))
    for position, (item_idx, _) in enumerate(sorted_items):
        pop_ranking[0, item_idx] = n_items - position

    def describe(metrics):
        """Render a metric dict as 'name: value' pairs with two decimals."""
        return " ".join(f"{k}: {v:.2f}" for k, v in metrics.items())

    random_items = 100
    for title, eval_split in (("DEV", dev), ("TEST", test)):
        rank_all, rank_random = random_eval(pop_ranking,
                                            eval_split,
                                            samples,
                                            num_rand=random_items)
        metric_all = rank_to_metric_dict(rank_all)
        metric_random = rank_to_metric_dict(rank_random)

        print(f"Result for {title.upper()}")
        print(f"Random items {random_items}: " + describe(metric_random))
        print("All items: " + describe(metric_all))
Esempio n. 8
0
def check_model():
    """Plot the weights and layer outputs of the stored CNN for one sample.

    Loads the graph from "../model/two_d_cnn_proj.ckpt", writes the weight
    plots of both convolution layers, then feeds training sample 1000 through
    the network and writes the output plot of each convolution layer.  The
    name of the sample's object is printed at the end.
    """
    # Creating the session; the image-size and class-prediction entries of
    # the returned tuple are not needed here.
    (session, _img_length, _img_height, _y_pred_cls, x, weights1, weights2,
     conv1, conv2) = load_graph(True, "../model/two_d_cnn_proj.ckpt")

    # Object names; class ids are 0..28 with 22 left out.
    object_names = object_names_func()
    class_ids = [idx for idx in range(29) if idx != 22]

    # Plotting the weights
    for weights, target in ((weights1, '../model/weights1.png'),
                            (weights2, '../model/weights2.png')):
        plot_conv_weights(session.run(weights), 0, target)

    # Select one image after reading in the data
    train_inputs, train_outputs, _test_inputs, _test_outputs = load_data(
        "../data")

    sample = 1000
    image = train_inputs[sample]
    test_object = object_names[class_ids[np.argmax(train_outputs[sample])]]
    feed_dict = {x: [image]}

    # Calculate and plot the output values of layer 1, then layer 2
    for layer, template in ((conv1, '../model/{}_1.png'),
                            (conv2, '../model/{}_2.png')):
        values = session.run(layer, feed_dict=feed_dict)
        plot_conv_layer(values, template.format(test_object))

    print("Object = {}".format(test_object))
Esempio n. 9
0
def eval_mode_batch(output_tags, confidences, cities):
    """Evaluate tagged articles and print precision, recall and F1 per tag.

    Args:
        output_tags: Passed to ``load_data``, which yields the tagged
            sentences and their identifier lines.
        confidences: One entry of tag confidences per tagged sentence.
        cities: Forwarded unchanged to ``predict_mode``.

    Raises:
        AssertionError: If ``confidences`` and the tagged data differ in
            length.
    """
    tagged_data, identifier = load_data(output_tags)
    num_tags = len(int2tags) - 1

    assert len(tagged_data) == len(confidences)
    for i, entry in enumerate(tagged_data):
        # Only the first num_tags comma-separated fields are gold entities.
        gold_ents = identifier[i].split(',')[:num_tags]  # Throw away title

        output_pred_line, _, _ = predict_mode(entry[0], entry[1],
                                              confidences[i], cities)
        predictions = output_pred_line.split(" ### ")

        # Evaluate the predictions.
        evaluateArticle(predictions, gold_ents)

    print("------------\nEvaluation Stats: (Precision, Recall, F1):")
    for tag in GOLD:
        correct, predicted, gold = CORRECT[tag], PRED[tag], GOLD[tag]
        # float() forces true division on Python 2; an empty denominator
        # yields 0.0 instead of raising ZeroDivisionError.
        prec = float(correct) / predicted if predicted else 0.0
        rec = float(correct) / gold if gold else 0.0
        f1 = (2 * prec * rec) / (prec + rec) if (prec + rec) else 0.0
        # Joined by hand so the line is identical on Python 2 and 3.
        print(" ".join(
            str(value)
            for value in (tag, prec, rec, f1, "########", correct, predicted,
                          gold)))
Esempio n. 10
0
def run(data_name, num_train, num_test, Phi,
        depth, widths, lc_w_range, shift_w_range,
        optim_name, optim_args,
        num_epochs, batch_size, chkpt_freq):
    """Train a copula network and report ground-truth log-likelihoods first.

    The stored log-likelihoods are loaded but then replaced by values
    computed from a Clayton copula with parameter 5, whose means are printed.
    Afterwards a ``Copula`` built from ``Phi(depth, widths, lc_w_range,
    shift_w_range)`` is handed to ``expt`` together with the data and the
    optimisation settings.
    """
    run_id = get_info()
    identifier_id = '%s%s' % (identifier, run_id)
    train_data, test_data = load_data(data_name, num_train, num_test)
    # Still loaded as before, although both values are overwritten below.
    train_ll, test_ll = load_log_ll(data_name, num_train, num_test)

    print('Computing ground truth manually because tagged log likelihood is wrong')
    from phi_listing import ClaytonPhi
    reference = Copula(ClaytonPhi(torch.tensor(5.)))
    train_ll = -torch.log(reference(train_data, mode='pdf'))
    test_ll = -torch.log(reference(test_data, mode='pdf'))

    train_mean = torch.mean(train_ll)
    test_mean = torch.mean(test_ll)

    print('train_ll', train_mean)
    print('test_ll', test_mean)

    print('Train ideal ll:', train_mean)
    print('Test ideal ll:', test_mean)

    net = Copula(Phi(depth, widths, lc_w_range, shift_w_range))
    expt(train_data, test_data, net, optim_name,
         optim_args, identifier_id, num_epochs, batch_size, chkpt_freq)
Esempio n. 11
0
def dummy():
    """Scan the training batches for sequences whose mask sums below 35.

    Writes the input and output dimensions of the loaded training data into
    ``train_config``, builds an ``RNNModel`` in training mode and walks over
    all batches.  For every short sequence mask the sum is printed and the
    mean squared error of the first two padded input/target pairs is
    computed; the results are discarded.
    """
    config = train_config

    data_train = load_data(config, 'train')
    config['input_dim'] = data_train.input_[0].shape[-1]
    config['output_dim'] = data_train.target[0].shape[-1]

    data_train.reshuffle()

    _model_class, placeholders = get_model_and_placeholders(config)
    rnn_model = RNNModel(config, placeholders, mode='training')

    # Loop through all training batches
    for batch in data_train.all_batches():
        # The feed dict is built as before even though it is not used here.
        rnn_model.get_feed_dict(batch)

        for mask in batch.mask:
            mask_total = np.sum(mask)
            if not (mask_total < 35):
                continue

            print('found it {0}'.format(mask_total))

            inputs, targets = batch.get_padded_data()
            mean_squared_error(inputs[0], targets[0])
            mean_squared_error(inputs[1], targets[1])
Esempio n. 12
0
def different_number_of_trees(temporal_rel,
                              start=5,
                              end=800,
                              steps=20,
                              rerunning=5):
    """Plot the mean F1 score for different numbers of trees.

    A random forest is trained for every tree count in
    ``range(start, end, steps)``.  Scores for small forests vary a lot, so
    the whole sweep is repeated ``rerunning`` times and averaged per tree
    count.  The averaged F1 scores are passed to ``plot``; the averaged
    recall and precision lists are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``.  ``None`` selects the
            "..._weighted.jpg" output name; any other value is embedded in
            the file name via ``str()``.
        start: First tree count.
        end: Exclusive upper bound for the tree count.
        steps: Increment between tree counts.
        rerunning: Number of repetitions to average over.
    """
    X, y = load_data(True, temporal_rel)
    X_train, X_test, y_train, y_test = split(X, y)

    # Materialised so plot() receives a real list on Python 3 as well.
    tree_counts = list(range(start, end, steps))

    # One row per repetition, one column per tree count.
    f1_runs, recall_runs, precision_runs = [], [], []
    for _ in range(rerunning):
        f1_row, recall_row, precision_row = [], [], []
        for n_trees in tree_counts:
            rf = RandomForestClassifier(n_jobs=2, n_estimators=n_trees)
            rf.fit(X_train, y_train)

            y_pred = rf.predict(X_test)

            f1_row.append(f1_score(y_test, y_pred))
            recall_row.append(recall_score(y_test, y_pred))
            precision_row.append(precision_score(y_test, y_pred))

        f1_runs.append(f1_row)
        recall_runs.append(recall_row)
        precision_runs.append(precision_row)

    # Column-wise mean; zip(*rows) is empty when there were no repetitions,
    # so no row has to be indexed.
    final_accuracies = [np.mean(column) for column in zip(*f1_runs)]
    final_recall = [np.mean(column) for column in zip(*recall_runs)]
    final_precision = [np.mean(column) for column in zip(*precision_runs)]

    if temporal_rel is None:
        filename = "different_number_of_trees_weighted.jpg"
    else:
        filename = "different_number_of_trees_" + str(temporal_rel) + ".jpg"
    plot(filename, "number_of_trees", "f1_score", final_accuracies,
         tree_counts)

    # Single-argument print calls give the same output on Python 2 and 3.
    print(final_recall)
    print(final_precision)
Esempio n. 13
0
def main(repo_path):
    """Score the stored model on the prepared test set and save the accuracy.

    Reads ``data/prepared/test.csv`` and ``model/model.joblib`` below
    ``repo_path`` and writes ``{"accuracy": <score>}`` as JSON to
    ``metrics/accuracy.json``.
    """
    features, labels = load_data(repo_path / "data/prepared/test.csv")
    classifier = load(repo_path / "model/model.joblib")

    score = accuracy_score(labels, classifier.predict(features))

    report_file = repo_path / "metrics/accuracy.json"
    report_file.write_text(json.dumps({"accuracy": score}))
Esempio n. 14
0
def learning_rate(temporal_rel, k=20, new=False):
    """Plot the F1 score against a growing amount of training data.

    The training split is divided into ``k`` equally sized pieces.  For
    ``i = 1..k`` a random forest is fitted on the first ``i`` pieces and
    scored on the test split.  The F1 scores are plotted against the number
    of training samples used; recall and precision are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``.  ``None`` selects the
            "learning_rate_weighted.jpg" output name; any other value is
            embedded in the file name via ``str()``.
        k: Number of pieces the training split is divided into.
        new: Forwarded to ``load_data`` as its first argument.

    Raises:
        ValueError: If the training split holds fewer than ``k`` samples.
    """
    X, y = load_data(new, temporal_rel)

    X_train, X_test, y_train, y_test = split(X, y)

    # Floor division keeps the piece length an int on Python 3 too.
    len_piece = len(X_train) // k
    if len_piece == 0:
        raise ValueError("training split is too small to cut into k pieces")

    # Sample count of each cumulative training set: 1 piece, 2 pieces, ...
    data_count = [(i + 1) * len_piece for i in range(k)]

    # Calculate the scores for each partial sum
    rf = RandomForestClassifier(n_jobs=2, n_estimators=1000)
    accuracies = []
    recall = []
    precision = []

    for count in data_count:
        # Take a prefix of the *training* split.  Slicing the full data set
        # could hand test samples to the model, and re-concatenating pieces
        # duplicated the first piece while leaving out the newest one.
        rf.fit(X_train[:count], y_train[:count])
        y_pred = rf.predict(X_test)
        accuracies.append(f1_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))

    if temporal_rel is None:
        filename = "learning_rate_weighted.jpg"
    else:
        filename = "learning_rate_" + str(temporal_rel) + ".jpg"
    plot(filename, "data_count", "f1_score", accuracies, data_count)

    # Single-argument print calls give the same output on Python 2 and 3.
    print(recall)
    print(precision)
Esempio n. 15
0
def main():
    """Load and standardize the data, then print the network output width.

    Creates ``RESPATH`` when it does not exist yet.  The output width is the
    second dimension of the audio data returned by ``train.load_data``.
    """
    weight_path = get_args().weight_path  # read but not used further here

    if not os.path.exists(RESPATH):
        os.makedirs(RESPATH)

    video, audio = train.load_data(DATAPATH)
    net_out = audio.shape[1]
    video, _audio_norm, _audio_means, _audio_stds = standardize_data(
        video, audio)
    print(net_out)
def union_vs_intersected_relations():
    """Compare F1 scores for "union" and "intersected" annotations.

    Per the original description, "union" uses all relations and
    "intersected" only those the annotators have in common.  A random forest
    with 100 trees is trained per annotation set and its F1 score on the
    corresponding test split is printed.
    """
    X_union, y_union = load_data(new=True, annotations="union")
    X_intersected, y_intersected = load_data(new=True,
                                             annotations="intersected")

    X_u_train, X_u_test, y_u_train, y_u_test = split(X_union, y_union)
    X_i_train, X_i_test, y_i_train, y_i_test = split(X_intersected,
                                                     y_intersected)

    rf_u = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf_u.fit(X_u_train, y_u_train)

    rf_i = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf_i.fit(X_i_train, y_i_train)

    y_u_pred = rf_u.predict(X_u_test)
    y_i_pred = rf_i.predict(X_i_test)

    print("Union: " + str(f1_score(y_u_test, y_u_pred)))
    # Score the predictions: comparing y_i_test with itself always gives 1.0.
    print("Intersected: " + str(f1_score(y_i_test, y_i_pred)))
def best_feature(temporal_rel):
    """Compare every feature used in isolation against all features combined.

    For each feature name a random forest with 100 trees is trained on data
    loaded with only that feature, and F1, recall and precision are computed
    on the test part of the split.  A final run without a feature restriction
    is recorded under the label "all".  The F1 scores are handed to ``plot``
    and the recall and precision lists are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``.  ``None`` selects the
            "best_feature_weighted.jpg" output name; any other value is
            embedded in the file name via ``str()``.
    """
    features = ["pos", "stem", "aspect", "tense", "distance", "similarity",
                "polarity", "modality"]

    accuracies = []
    recall = []
    precision = []

    def evaluate(label, **load_kwargs):
        """Train on one feature selection and record its scores under label."""
        X, y = load_data(True, temporal_rel, **load_kwargs)
        X_train, X_test, y_train, y_test = split(X, y)

        rf = RandomForestClassifier(n_jobs=2, n_estimators=100)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        accuracies.append({label: f1_score(y_test, y_pred)})
        recall.append({label: recall_score(y_test, y_pred)})
        precision.append({label: precision_score(y_test, y_pred)})

    for feature in features:
        evaluate(feature, features=[feature])
        print("Done with feature")

    # Reference run: no feature restriction.
    evaluate("all")
    features.append("all")

    # Every entry is a one-item dict.  list() is required because dict views
    # cannot be indexed on Python 3.
    data = [list(entry.values())[0] for entry in accuracies]

    if temporal_rel is None:
        filename = "best_feature_weighted.jpg"
    else:
        filename = "best_feature_" + str(temporal_rel) + ".jpg"
    plot(filename, "feature", "f1_score", data, features)

    # Single-argument print calls give the same output on Python 2 and 3.
    print(recall)
    print(precision)
Esempio n. 18
0
def mean_std(train_dir, val_dir):
    """Print the mean and std that ``process`` reports for both loaders.

    The loaders come from ``load_data(train_dir, val_dir)``; the training
    loader is processed first, then the validation loader.
    """
    train_loader, val_loader = load_data(train_dir, val_dir)

    train_mean, train_std = process(train_loader)
    val_mean, val_std = process(val_loader)

    report = (
        ('mean_train: ', train_mean),
        ('std_train: ', train_std),
        ('mean_val: ', val_mean),
        ('std_val: ', val_std),
    )
    for label, value in report:
        print(label, value)
def learning_rate(temporal_rel, k=20, new=False):
    """Plot the F1 score against a growing amount of training data.

    The training split is divided into ``k`` equally sized pieces.  For
    ``i = 1..k`` a random forest is fitted on the first ``i`` pieces and
    scored on the test split.  The F1 scores are plotted against the number
    of training samples used; recall and precision are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``.  ``None`` selects the
            "learning_rate_weighted.jpg" output name; any other value is
            embedded in the file name via ``str()``.
        k: Number of pieces the training split is divided into.
        new: Forwarded to ``load_data`` as its first argument.

    Raises:
        ValueError: If the training split holds fewer than ``k`` samples.
    """
    X, y = load_data(new, temporal_rel)

    X_train, X_test, y_train, y_test = split(X, y)

    # Floor division keeps the piece length an int on Python 3 too.
    len_piece = len(X_train) // k
    if len_piece == 0:
        raise ValueError("training split is too small to cut into k pieces")

    # Sample count of each cumulative training set: 1 piece, 2 pieces, ...
    data_count = [(i + 1) * len_piece for i in range(k)]

    # Calculate the scores for each partial sum
    rf = RandomForestClassifier(n_jobs=2, n_estimators=1000)
    accuracies = []
    recall = []
    precision = []

    for count in data_count:
        # Take a prefix of the *training* split.  Slicing the full data set
        # could hand test samples to the model, and re-concatenating pieces
        # duplicated the first piece while leaving out the newest one.
        rf.fit(X_train[:count], y_train[:count])
        y_pred = rf.predict(X_test)
        accuracies.append(f1_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))

    if temporal_rel is None:
        filename = "learning_rate_weighted.jpg"
    else:
        filename = "learning_rate_" + str(temporal_rel) + ".jpg"
    plot(filename, "data_count", "f1_score", accuracies, data_count)

    # Single-argument print calls give the same output on Python 2 and 3.
    print(recall)
    print(precision)
Esempio n. 20
0
def union_vs_intersected_relations():
    """Compare F1 scores for "union" and "intersected" annotations.

    Per the original description, "union" uses all relations and
    "intersected" only those the annotators have in common.  A random forest
    with 100 trees is trained per annotation set and its F1 score on the
    corresponding test split is printed.
    """
    X_union, y_union = load_data(new=True, annotations="union")
    X_intersected, y_intersected = load_data(new=True,
                                             annotations="intersected")

    X_u_train, X_u_test, y_u_train, y_u_test = split(X_union, y_union)
    X_i_train, X_i_test, y_i_train, y_i_test = split(X_intersected,
                                                     y_intersected)

    rf_u = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf_u.fit(X_u_train, y_u_train)

    rf_i = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf_i.fit(X_i_train, y_i_train)

    y_u_pred = rf_u.predict(X_u_test)
    y_i_pred = rf_i.predict(X_i_test)

    print("Union: " + str(f1_score(y_u_test, y_u_pred)))
    # Score the predictions: comparing y_i_test with itself always gives 1.0.
    print("Intersected: " + str(f1_score(y_i_test, y_i_pred)))
def different_number_of_trees(temporal_rel, start=5, end=800, steps=20, rerunning=5):
    """Plot the mean F1 score for different numbers of trees.

    A random forest is trained for every tree count in
    ``range(start, end, steps)``.  Scores for small forests vary a lot, so
    the whole sweep is repeated ``rerunning`` times and averaged per tree
    count.  The averaged F1 scores are passed to ``plot``; the averaged
    recall and precision lists are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``.  ``None`` selects the
            "..._weighted.jpg" output name; any other value is embedded in
            the file name via ``str()``.
        start: First tree count.
        end: Exclusive upper bound for the tree count.
        steps: Increment between tree counts.
        rerunning: Number of repetitions to average over.
    """
    X, y = load_data(True, temporal_rel)
    X_train, X_test, y_train, y_test = split(X, y)

    # Materialised so plot() receives a real list on Python 3 as well.
    tree_counts = list(range(start, end, steps))

    # One row per repetition, one column per tree count.
    f1_runs, recall_runs, precision_runs = [], [], []
    for _ in range(rerunning):
        f1_row, recall_row, precision_row = [], [], []
        for n_trees in tree_counts:
            rf = RandomForestClassifier(n_jobs=2, n_estimators=n_trees)
            rf.fit(X_train, y_train)

            y_pred = rf.predict(X_test)

            f1_row.append(f1_score(y_test, y_pred))
            recall_row.append(recall_score(y_test, y_pred))
            precision_row.append(precision_score(y_test, y_pred))

        f1_runs.append(f1_row)
        recall_runs.append(recall_row)
        precision_runs.append(precision_row)

    # Column-wise mean; zip(*rows) is empty when there were no repetitions,
    # so no row has to be indexed.
    final_accuracies = [np.mean(column) for column in zip(*f1_runs)]
    final_recall = [np.mean(column) for column in zip(*recall_runs)]
    final_precision = [np.mean(column) for column in zip(*precision_runs)]

    if temporal_rel is None:
        filename = "different_number_of_trees_weighted.jpg"
    else:
        filename = "different_number_of_trees_" + str(temporal_rel) + ".jpg"
    plot(filename, "number_of_trees", "f1_score", final_accuracies,
         tree_counts)

    # Single-argument print calls give the same output on Python 2 and 3.
    print(final_recall)
    print(final_precision)
Esempio n. 22
0
def main(trained_model,
         test_file,
         viterbi,
         output_tags="output.tag",
         output_predictions="output.pred"):
    """Tag the sentences of ``test_file`` and evaluate the written tags.

    The output file alternates between an identifier line and the tagged
    sentence that belongs to it (``token_tag`` pairs joined by spaces).

    Args:
        trained_model: A value containing "crf" (handled by ``crf.predict``),
            a path to a pickled model tuple, or the already unpacked list
            ``[clf, previous_n, next_n, word_vocab, other_features]``.
        test_file: Passed to ``load_data`` to obtain sentences and
            identifiers.
        viterbi: Forwarded to ``predict_tags_n``.
        output_tags: File the tagged sentences are written to.
        output_predictions: File used by ``predict_mode_batch`` when
            evaluation is switched off.

    Raises:
        AssertionError: If tags and confidences of a sentence differ in
            length.
    """
    # The parameter is test_file; the previous name `testing_file` is not
    # defined in this function.
    test_data, identifier = load_data(test_file)

    evaluate = True
    use_crf = "crf" in trained_model

    # extract features
    if not use_crf:
        if not isinstance(trained_model, list):
            # NOTE: pickle.load executes code from the file; only load model
            # files from a trusted source.
            with open(trained_model, 'rb') as frb:
                clf, previous_n, next_n, word_vocab, other_features = pickle.load(
                    frb)
        else:
            clf, previous_n, next_n, word_vocab, other_features = trained_model

    # time.clock() no longer exists on Python 3.8+.
    tic = time.time()

    confidences = []
    with open(output_tags, 'w') as fw:
        for i in range(len(test_data) + len(identifier)):
            # Floor division: a float index raises TypeError on Python 3.
            idx, is_sentence_line = divmod(i, 2)
            if is_sentence_line:
                tokens = test_data[idx][0]
                if use_crf:
                    y, tmp_conf = crf.predict(tokens, trained_model)
                    fw.write(" ".join(
                        tokens[j] + "_" + y[j] for j in range(len(tokens))))
                else:
                    y, tmp_conf = predict_tags_n(viterbi, previous_n, next_n,
                                                 clf, tokens, word_vocab,
                                                 other_features)
                    fw.write(" ".join(
                        tokens[j] + "_" + int2tags[int(y[j])]
                        for j in range(len(tokens))))
                assert len(y) == len(tmp_conf)
                confidences.append(tmp_conf)
            else:
                fw.write(identifier[idx])
            fw.write("\n")

    print(time.time() - tic)

    # Run only after the with-block has closed (and flushed) the tag file,
    # because both functions read it back from disk.
    if evaluate:
        eval_mode_batch(output_tags, confidences, helper.cities)
    else:
        predict_mode_batch(output_tags, output_predictions, helper.cities)
def predict_mode_batch(output_tags, output_predictions, cities):
    """Write identifier lines and predictions alternately to a file.

    Args:
        output_tags: Passed to ``load_data`` to obtain the tagged sentences
            and their identifier lines.
        output_predictions: Path of the file to write.
        cities: Forwarded unchanged to ``predict_mode``.
    """
    tagged_data, identifier = load_data(output_tags)

    # The context manager closes the file even if a prediction fails.
    with open(output_predictions, 'w') as f:
        for i in range(len(tagged_data) + len(identifier)):
            # Floor division: a float index raises TypeError on Python 3.
            idx = i // 2
            if i % 2 == 1:
                entry = tagged_data[idx]
                # NOTE(review): other callers in this file pass tag
                # confidences to predict_mode and unpack a 3-tuple from it —
                # confirm this call's signature and return type.
                f.write(predict_mode(entry[0], entry[1], cities))
            else:
                f.write(identifier[idx])
            f.write("\n")
Esempio n. 24
0
def main():
    """Predict with stored weights on both splits and save the results.

    Creates ``RESPATH`` if needed, standardizes the loaded data, builds the
    model with as many outputs as the training targets have columns, loads
    the weights from the path given on the command line, and hands targets
    and predictions of both splits to ``train.savedata``.
    """
    weight_path = get_args().weight_path
    if not os.path.exists(RESPATH):
        os.makedirs(RESPATH)

    (Xtr, Ytr), (Xte, Yte) = train.load_data(DATAPATH)
    net_out = Ytr.shape[1]

    # The standardized targets are not needed for prediction.
    Xtr, _, Xte, _, Y_means, Y_stds = train.standardize_data(
        Xtr, Ytr, Xte, Yte)

    model = train.build_model(net_out)
    model.compile(loss='mse', optimizer='adam')
    model.load_weights(weight_path)

    Ytr_pred, Yte_pred = train.predict(model, Xtr, Xte, Y_means, Y_stds)
    train.savedata(Ytr, Ytr_pred, Yte, Yte_pred, respath=RESPATH)
Esempio n. 25
0
def predict_mode_batch(output_tags, output_predictions, cities):
    """Write identifier lines and predictions alternately to a file.

    Args:
        output_tags: Passed to ``load_data`` to obtain the tagged sentences
            and their identifier lines.
        output_predictions: Path of the file to write.
        cities: Forwarded unchanged to ``predict_mode``.
    """
    tagged_data, identifier = load_data(output_tags)

    with open(output_predictions, 'w') as f:
        for i in range(len(tagged_data) + len(identifier)):
            # Floor division: a float index raises TypeError on Python 3.
            idx = i // 2
            if i % 2 == 1:
                entry = tagged_data[idx]
                # NOTE(review): other callers in this file pass tag
                # confidences to predict_mode and unpack a 3-tuple from it —
                # confirm this call's signature and return type.
                f.write(predict_mode(entry[0], entry[1], cities))
            else:
                f.write(identifier[idx])
            f.write('\n')
Esempio n. 26
0
 def test_apply_model(self):
   """Apply a stored checkpoint to the bundled test data and print results.

   The checkpoint and the data file are located relative to this file.  The
   predictions and the targets of the first loaded item are printed; nothing
   is asserted (a correlation check is left disabled).
   """
   here = os.path.dirname(__file__)
   checkpoint_path = os.path.join(here, 'va0.1', 't0f50.ckpt')
   file_pattern = os.path.join(here, 'testdata', 'Con100_η0.1N300L5_100')

   predictions = train.apply_model(checkpoint_path=checkpoint_path,
                                   file_pattern=file_pattern,
                                   time_index=0)
   targets = train.load_data(file_pattern, 0)[0].targets
   # correlation_value = np.corrcoef(predictions[0], targets)[0, 1]
   print('predictions:',predictions)
   print('targets:',targets)
Esempio n. 27
0
def main():
    """Predict audio from video with stored weights and save the result.

    Creates ``RESPATH`` if needed, standardizes the loaded data, builds the
    model with as many outputs as the audio data has columns, loads the
    weights from the path given on the command line, and stores the
    predictions as ``aud_pred.npy`` inside ``RESPATH``.
    """
    weight_path = get_args().weight_path
    if not os.path.exists(RESPATH):
        os.makedirs(RESPATH)

    video, audio = train.load_data(DATAPATH)
    net_out = audio.shape[1]
    # The standardized audio itself is not needed for prediction.
    video, _audio_norm, audio_means, audio_stds = standardize_data(
        video, audio)

    model = train.build_model(net_out)
    model.compile(loss='mse', optimizer='adam')
    model.load_weights(weight_path)

    aud_pred = train.predict(model, video, audio_means, audio_stds)
    np.save(join(RESPATH, 'aud_pred.npy'), aud_pred)
Esempio n. 28
0
def run(data_name, num_train, num_test, Phi, depth, widths, lc_w_range,
        shift_w_range, optim_name, optim_args, num_epochs, batch_size,
        chkpt_freq):
    """Train a copula network after printing the stored ideal log-likelihoods.

    The means of the log-likelihoods returned by ``load_log_ll`` are printed.
    Then a ``Copula`` built from ``Phi(depth, widths, lc_w_range,
    shift_w_range)`` is handed to ``expt`` together with the data and the
    optimisation settings.
    """
    run_id = get_info()
    identifier_id = '%s%s' % (identifier, run_id)

    train_data, test_data = load_data(data_name, num_train, num_test)
    ideal_train_ll, ideal_test_ll = load_log_ll(data_name, num_train,
                                                num_test)

    print('Train ideal ll:', torch.mean(ideal_train_ll))
    print('Test ideal ll:', torch.mean(ideal_test_ll))

    net = Copula(Phi(depth, widths, lc_w_range, shift_w_range))
    expt(train_data, test_data, net, optim_name, optim_args, identifier_id,
         num_epochs, batch_size, chkpt_freq)
Esempio n. 29
0
def main():
    """Sample 26 tokens from the stored couplet model and print each one.

    The model is fed a window of 13 positions of the sequence generated so
    far.  Each new token is drawn from the renormalised output distribution
    and printed as soon as it is chosen.
    """
    _, vocabulary = load_data()
    model = models.load_model('model.couplets.h5')

    window = 13
    total = 2 * window
    sequence = np.zeros((1, total), dtype='uint32')

    for pos in range(total):
        begin = max(pos - window, 0)
        context = sequence[:, begin:begin + window]

        # float64 plus renormalisation before the multinomial draw.
        probs = model.predict(context, verbose=0).astype('float64')
        probs = probs / probs.sum()

        draw = np.random.multinomial(1, probs[0], 1)
        token_id = np.argmax(draw)

        sequence[0, pos] = token_id
        print(vocabulary[token_id])
def eval_mode_batch(output_tags, confidences, cities):
    """Count correct, guessed and gold entities per tag and print scores.

    For every tagged sentence the predicted entities are compared with the
    gold entities taken from the identifier line.  When the two lists differ
    in length, the identifier and gold entities are printed, the function
    waits for Enter, and the sentence is skipped.

    Args:
        output_tags: Passed to ``load_data`` to obtain the tagged sentences
            and their identifier lines.
        confidences: One entry of tag confidences per tagged sentence.
        cities: Forwarded unchanged to ``predict_mode``.

    Raises:
        AssertionError: If ``confidences`` and the tagged data differ in
            length.
    """
    # raw_input only exists on Python 2; input is the Python 3 equivalent.
    try:
        pause = raw_input
    except NameError:
        pause = input

    tagged_data, identifier = load_data(output_tags)
    num_tags = len(int2tags) - 1

    correct = [0] * num_tags
    guessed = [0] * num_tags
    gold_correct = [0] * num_tags

    # Set to True to inspect the entity at index 3 interactively.
    debug_city = False

    assert len(tagged_data) == len(confidences)
    for i, entry in enumerate(tagged_data):
        ident = identifier[i]
        gold_ents = ident.split(',')[:num_tags]  # Throw away title

        output_pred_line, _, _ = predict_mode(entry[0], entry[1],
                                              confidences[i], cities)
        predictions = output_pred_line.split(" ### ")

        if len(gold_ents) != len(predictions):
            # Concatenated by hand so the output matches on Python 2 and 3.
            print('ident ' + str(ident))
            print('gold_ents ' + str(gold_ents))
            pause()
            continue

        for index, gold in enumerate(gold_ents):
            match = evaluatePrediction(predictions[index], gold)
            if debug_city and index == 3:
                print('predictions[index] ' + str(predictions[index]))
                print('gold_ents[index] ' + str(gold))
                print('match ' + str(match))
                pause()

            if match == 'skip':
                continue
            gold_correct[index] += 1
            if match == "no_predict":
                continue
            if match == 1:
                correct[index] += 1
            guessed[index] += 1

    helper.printScores(correct, guessed, gold_correct, False)
Esempio n. 31
0
def yield_chars():
    """Generate characters sampled from the latest model, capitalising after '.'.

    The text and vocabularies come from ``load_data``; the model is built
    with ``build_model`` and populated by ``load_latest_model``.  Sampling
    starts at a random index of the text.  After a '.' has been emitted, the
    next character whose upper-case form differs from itself is yielded in
    upper case; every other character is yielded unchanged.
    """
    text, char_indices, indices_char = load_data()
    model = build_model(indices_char)
    load_latest_model(model)

    seed_position = random.randint(0, len(text) - MAXLEN - 1)
    sampled = sample_from_model(text, seed_position, char_indices, model,
                                indices_char)

    capitalise_pending = False
    for ch in sampled:
        if capitalise_pending and ch.upper() != ch:
            capitalise_pending = False
            yield ch.upper()
        else:
            # Characters without a distinct upper-case form (spaces, digits,
            # already-capital letters) pass through and leave the flag set.
            yield ch
        if ch == '.':
            capitalise_pending = True
def eval_mode_batch(output_tags, confidences, cities):
    tagged_data, identifier = load_data(output_tags)
    num_tags = len(int2tags) - 1

    correct = [0] * num_tags
    guessed = [0] * num_tags
    gold_correct = [0] * num_tags

    assert len(tagged_data) == len(confidences)
    for i in range(len(tagged_data)):
        sentence = tagged_data[i][0]
        tags = tagged_data[i][1]
        tag_confs = confidences[i]
        ident = identifier[i]

        gold_ents = ident.split(',')[:num_tags]  #Throw away title

        output_pred_line, entity_confidences, entity_cnts = predict_mode(
            sentence, tags, tag_confs, cities)
        predictions = output_pred_line.split(" ### ")

        if not len(gold_ents) == len(predictions):
            print 'ident', ident
            print 'gold_ents', gold_ents
            raw_input()
            continue

        for index in range(len(gold_ents)):
            match = evaluatePrediction(predictions[index], gold_ents[index])
            debugCity = False
            if index == 3 and debugCity:
                print 'predictions[index]', predictions[index]
                print 'gold_ents[index]', gold_ents[index]
                print 'match', match
                raw_input()
            if match == 'skip':
                continue
            else:
                gold_correct[index] += 1
                if match == "no_predict":
                    continue
                if match == 1:
                    correct[index] += 1
                guessed[index] += 1

    helper.printScores(correct, guessed, gold_correct, False)
Esempio n. 33
0
def yield_chars():
    """Stream model-sampled characters, upper-casing the letter after a '.'.

    Loads the text and index mappings with ``load_data``, builds the model
    and restores its latest weights, then samples from a random start index.
    Once a '.' has been seen, the first following character that has a
    distinct upper-case form is emitted upper-cased; all other characters are
    emitted as sampled.
    """
    text, char_indices, indices_char = load_data()
    model = build_model(indices_char)
    load_latest_model(model)

    last_valid_start = len(text) - MAXLEN - 1
    start_index = random.randint(0, last_valid_start)

    sentence_ended = False
    for char in sample_from_model(text, start_index, char_indices, model,
                                  indices_char):
        if not sentence_ended or char.upper() == char:
            # Either no capital is due, or this character cannot be
            # capitalised; the pending flag (if any) stays untouched.
            yield char
        else:
            yield char.upper()
            sentence_ended = False
        if char == '.':
            sentence_ended = True
Esempio n. 34
0
def evaluate_model(load_model):
    """Load a saved AbstractiveSummariser and print its validation loss.

    Args:
        load_model: path handed to ``torch.load`` to obtain the state dict.

    The CUDA device is taken from ``X_SGE_CUDA_DEVICE`` when that variable
    is set, otherwise device '0' is selected.  Validation data and summaries
    are loaded with ``load_data`` / ``load_summary`` and scored by
    ``evaluate2`` under ``torch.no_grad()``.
    """
    args = {
        'max_pos_embed': 512,
        'max_num_sentences': 32,
        'max_summary_length': 96,
        'model_data_dir': "/home/alta/summary/pm574/summariser0/lib/model_data/",
    }
    val_batch_size = 200  # okay for K80 & RTX 2080 ti

    cuda_device = os.environ.get('X_SGE_CUDA_DEVICE')
    if cuda_device is not None:  # to run on CUED stack machine
        print('running on the stack...')
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device
    else:
        print('running locally...')
        # choose the device (GPU) here
        os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    device = 'cuda'

    val_data = load_data(args, 'val')
    val_summary = load_summary(args, 'val')

    abs_sum = AbstractiveSummariser(args, device=device)
    state_dict = torch.load(load_model)
    abs_sum.load_state_dict(state_dict)
    abs_sum.eval()  # switch to evaluation mode
    print("evaluate model: {}".format(load_model))

    vocab_size = abs_sum.decoder.linear_decoder.out_features

    with torch.no_grad():
        avg_val_loss = evaluate2(abs_sum, val_data, val_summary,
                                 val_batch_size, vocab_size, args, device)

    rule = "============================================================================================"
    print(rule)
    print("MODEL = {}".format(load_model))
    print("VLOSS = {}".format(avg_val_loss))
    print(rule)
Esempio n. 35
0
def predict(dataset, model_file):
    """
    Loads a model file and predicts on the test set

    Args:
        dataset: passed to ``load_data``; the third element of its result is
            used as the ``(test_set_x, test_set_y)`` pair.
        model_file: path of the pickled classifier.

    Prints the first ten predicted values and the test error (as a
    percentage); nothing is returned.
    """
    # Pickle streams are binary, so the file is opened in 'rb' mode, and the
    # ``with`` block guarantees the handle is closed after loading.
    # NOTE: unpickling runs arbitrary code; only load trusted model files.
    with open(model_file, 'rb') as model_fh:
        classifier = cPickle.load(model_fh)

    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)
    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    # Pull the raw array out of the shared variable to feed the functions.
    test_set_x = test_set_x.get_value()

    predicted_values = predict_model(test_set_x)
    print("Predicted values for the first 10 examples in the test test:")
    print(predicted_values[:10])

    test_model = theano.function(inputs=[classifier.input],
                                 outputs=classifier.errors(test_set_y))
    test_error = test_model(test_set_x)
    print("Test error is %f %%" % (test_error * 100))
Esempio n. 36
0
def predict(dataset, model_file):
    """
    Loads a model file and predicts on the test set

    Args:
        dataset: forwarded to ``load_data``; element 2 of the returned
            sequence supplies ``(test_set_x, test_set_y)``.
        model_file: path of the pickled classifier to evaluate.

    The first ten predictions and the test error percentage are printed;
    the function returns nothing.
    """
    # Open in binary mode (pickle data is bytes) and close the handle
    # deterministically instead of leaking it.
    # NOTE: pickle can execute arbitrary code; model_file must be trusted.
    with open(model_file, 'rb') as pickled_model:
        classifier = cPickle.load(pickled_model)

    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)
    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    # The compiled functions take the underlying array, not the shared
    # variable wrapper.
    test_set_x = test_set_x.get_value()

    predicted_values = predict_model(test_set_x)
    print("Predicted values for the first 10 examples in the test test:")
    print(predicted_values[:10])

    test_model = theano.function(inputs=[classifier.input],
                                 outputs=classifier.errors(test_set_y))
    test_error = test_model(test_set_x)
    print("Test error is %f %%" % (test_error * 100))
Esempio n. 37
0
def load_initial_train(setlist,
                       version,
                       fraction,
                       save=False,
                       hys_dir=HYS_DIR,
                       train_set_dir=TRAIN_SET_DIR,
                       grid_dir=GRID_DIR):
    """Load the initial training grids and pair every result with its row index.

    Args:
        setlist, version, fraction: forwarded to ``train.load_data``.
        save: when true, write the grids and the indexed results as CSV files
            into ``train_set_dir``.
        hys_dir: forwarded positionally to ``train.load_data``.
        train_set_dir: output directory used when ``save`` is true.
        grid_dir: forwarded to ``train.load_data`` as ``grid_dir``.

    Returns:
        ``(grids, indexed_results)`` where ``indexed_results`` has one row per
        grid: column 0 holds the row index, column 1 the loaded result.
    """
    if VERBOSE:
        print('Loading initial training data')

    grids, results = train.load_data(
        setlist, hys_dir, version, fraction, grid_dir=grid_dir)

    row_count = grids.shape[0]
    # Both columns are filled right below, so no zero-initialisation needed.
    indexed_results = np.empty((row_count, 2))
    indexed_results[:, 0] = np.arange(row_count)
    indexed_results[:, 1] = results

    if save:
        grids_csv = os.path.join(train_set_dir, 'train_grids00.csv')
        np.savetxt(grids_csv, grids, fmt='%i', delimiter=',')
        results_csv = os.path.join(train_set_dir, 'train_results00.csv')
        np.savetxt(results_csv, indexed_results, delimiter=',')

    return (grids, indexed_results)
def main(trained_model,testing_file,viterbi,output_tags="output.tag", output_predictions="output.pred"):
    """Tag the test file with a trained model, write the tags, then evaluate.

    Args:
        trained_model: either something containing "crf" (handed to
            ``crf.predict``), a path to a pickled
            ``(clf, previous_n, next_n, word_vocab, other_features)`` tuple,
            or that 5-element list itself.
        testing_file: passed to ``load_data`` to obtain the test sentences
            and their identifier lines.
        viterbi: forwarded to ``predict_tags_n``.
        output_tags: path of the file receiving alternating identifier and
            ``word_tag`` lines.
        output_predictions: path forwarded to ``predict_mode_batch`` (only
            reached when the local ``evaluate`` switch is off).
    """
    test_data, identifier = load_data(testing_file)

    evaluate = True

    use_crf = "crf" in trained_model

    ## extract features
    if not use_crf:
        if not isinstance(trained_model, list):
            # NOTE: unpickling executes arbitrary code; the model file must
            # come from a trusted source.
            with open(trained_model, "rb") as model_file:
                clf, previous_n, next_n, word_vocab, other_features = pickle.load(model_file)
        else:
            clf, previous_n, next_n, word_vocab, other_features = trained_model

    confidences = []
    # The context manager closes the tag file even if prediction fails, and
    # before eval_mode_batch / predict_mode_batch read it back.
    with open(output_tags, 'w') as f:
        # Even steps write the identifier line, odd steps the tagged sentence
        # of the same example.
        for i in range(len(test_data) + len(identifier)):
            # Floor division keeps the index an int on Python 3 as well.
            example = i // 2
            if i % 2 == 1:
                words = test_data[example][0]
                if use_crf:
                    y, tmp_conf = crf.predict(words, trained_model)
                    tagged = [words[j] + "_" + y[j] for j in range(len(words))]
                else:
                    y, tmp_conf = predict_tags_n(viterbi, previous_n, next_n, clf, words, word_vocab, other_features)
                    tagged = [words[j] + "_" + int2tags[int(y[j])] for j in range(len(words))]
                f.write(" ".join(tagged))
                assert(len(y) == len(tmp_conf))
                confidences.append(tmp_conf)
                f.write("\n")
            else:
                f.write(identifier[example])
                f.write("\n")

    if evaluate:
        eval_mode_batch(output_tags, confidences, helper.cities)
    else:
        predict_mode_batch(output_tags, output_predictions, helper.cities)
    return
def get_distance_data(data, temporal_rel):
    """Append one ``{distance: outcome}`` dict per test sample to ``data``.

    A random forest is trained on the training split of
    ``load_data(True, temporal_rel, distance=True)`` and evaluated on the
    test split.  For every test sample the outcome is a four-element list
    ``[true_positive, true_negative, false_positive, false_negative]`` with
    exactly one ``True``.  Samples whose labels are neither 0 nor 1 in the
    relevant position are not recorded.  Nothing is returned; ``data`` is
    extended in place.
    """
    X, y, distance = load_data(True, temporal_rel, distance=True)
    splits = split(X, y, distance)
    X_train, X_test, y_train, y_test, distance_train, distance_test = splits

    forest = RandomForestClassifier(n_jobs=2, n_estimators=100)
    forest.fit(X_train, y_train)
    y_pred = forest.predict(X_test)

    for i in range(len(X_test)):
        actual = y_test[i]
        predicted = y_pred[i]

        if actual == predicted:
            if actual == 1:
                outcome = [True, False, False, False]   # true positive
            elif actual == 0:
                outcome = [False, True, False, False]   # true negative
            else:
                continue
        else:
            if predicted == 1:
                outcome = [False, False, True, False]   # false positive
            elif predicted == 0:
                outcome = [False, False, False, True]   # false negative
            else:
                continue

        data.append({distance_test[i]: outcome})
Esempio n. 40
0
def exp_raw(dtype):
  """Build a conv + dense regression network and train it for 10 epochs.

  The network takes a float32 4-D input of shape (None, 3, 256, 256), runs
  it through six 3x3 convolutions with pooling / batch-norm stages, and
  ends in three 1024-unit dense layers plus an 8-unit output layer.  The
  loss is the square root of the mean squared error between that output and
  the ``psp`` target matrix.

  Args:
    dtype: value assigned to ``theano.config.floatX`` while the dense layers
      are constructed; the setting is put back to 'float32' afterwards.

  Returns:
    The compiled Theano training function ``ftrain(input, psp)`` which
    returns ``[loss, prediction]`` and applies the ``adam`` updates.
  """
  shp = (None, 3, 256, 256)
  input_var = T.tensor4('input_var', dtype = 'float32')
  psp = T.dmatrix("psp")
  network = OrderedDict()
  network['input'] = lasagne.layers.InputLayer(shape = shp, input_var = input_var)
  # network = make_vgg16(network, 'model/vgg16_weights_from_caffe.h5')
  # First conv and segmentation part
  network['conv1_1'] = lasagne.layers.Conv2DLayer(network['input'],
    num_filters = 64, filter_size = (3, 3),nonlinearity = lasagne.nonlinearities.rectify,
    W=lasagne.init.GlorotUniform())
  network['conv1_2'] = lasagne.layers.Conv2DLayer(network['conv1_1'],
    num_filters = 64, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_1'] = lasagne.layers.MaxPool2DLayer(network['conv1_2'], pool_size = (2, 2))
  network['norm1_1'] = lasagne.layers.BatchNormLayer(network['pool1_1'])

  network['conv1_3'] = lasagne.layers.Conv2DLayer(network['norm1_1'],
    num_filters = 128, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['conv1_4'] = lasagne.layers.Conv2DLayer(network['conv1_3'],
    num_filters = 128, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_2'] = lasagne.layers.MaxPool2DLayer(network['conv1_4'], pool_size = (2, 2))
  network['norm1_2'] = lasagne.layers.BatchNormLayer(network['pool1_2'])

  network['conv1_5'] = lasagne.layers.Conv2DLayer(network['norm1_2'],
    num_filters = 256, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_3'] = lasagne.layers.MaxPool2DLayer(network['conv1_5'], pool_size = (2, 2))

  network['conv1_6'] = lasagne.layers.Conv2DLayer(network['pool1_3'],
    num_filters = 256, filter_size = (3, 3), nonlinearity = lasagne.nonlinearities.rectify)
  network['pool1_4'] = lasagne.layers.MaxPool2DLayer(network['conv1_6'], pool_size = (2, 2))

  # Perspective Transform
  network['norm2'] = lasagne.layers.BatchNormLayer(network['pool1_4'])
  # network['cast'] = CastingLayer(network['norm2'], dtype)
  # The dense head is built with floatX switched to the requested dtype; the
  # finally clause puts the global setting back even if construction fails.
  theano.config.floatX = dtype
  try:
    network['pfc2_1'] = lasagne.layers.DenseLayer(
      lasagne.layers.dropout(network['norm2'], p = 0.05),
      num_units = 1024, nonlinearity = lasagne.nonlinearities.rectify)
    network['pfc2_2'] = lasagne.layers.DenseLayer(
      lasagne.layers.dropout(network['pfc2_1'], p=0.05),
      num_units = 1024, nonlinearity = lasagne.nonlinearities.rectify)
    network['pfc2_3'] = lasagne.layers.DenseLayer(
      lasagne.layers.dropout(network['pfc2_2'], p=0.05),
      num_units = 1024, nonlinearity = lasagne.nonlinearities.rectify)
    # loss target 2
    network['pfc_out'] = lasagne.layers.DenseLayer(
      lasagne.layers.dropout(network['pfc2_3'], p = 0.05),
      num_units = 8, nonlinearity = lasagne.nonlinearities.rectify)
  finally:
    theano.config.floatX = 'float32'

  predict = lasagne.layers.get_output(network['pfc_out'])
  loss = T.sqrt(lasagne.objectives.squared_error(predict, psp).mean())
  paras = lasagne.layers.get_all_params(network['pfc_out'], trainable = True)
  # One shared 1e-4 scalar per trainable parameter is handed to adam
  # (presumably per-parameter learning rates -- confirm against adam's
  # signature).
  updates = adam(loss, paras, [theano.shared(np.float32(0.0001)) for i in range(len(paras))])
  ftrain = theano.function([input_var, psp], [loss, predict], updates = updates)

  def get_inputs(meta, batch, path):
    """Return (images, targets) for the ids in ``batch``."""
    # batchidx = [keys[i] for i in batch]
    images = np.array([read_image(path + 'patch/' + idx + '.jpg', shape = (256, 256))
      for idx in batch]).astype(np.float32)
    # NOTE(review): the mask images are loaded but never used below.
    seg = np.array([read_image(path + 'pmask/' + idx + '.jpg', shape = (256, 256))
      for idx in batch]).astype(np.float32)
    dat = [meta[key] for key in batch]
    # Target = first 8 flattened values of the first element of each record.
    Ps = np.array([np.array(dat[i][0]).flatten()[0 : 8] for i in range(len(batch))])
    for P in Ps:
      # Rescale the last two target components (offset by 1e-3, times 1e4).
      P[6 : 8] = (P[6 : 8] + 1e-3) * 1e4
    return images, Ps

  path = '/home/yancz/text_generator/data/real/'
  dat, meta = load_data(path, 10000, False)
  for epoch in range(10):
    epoch_loss = 0
    trs = 0
    for batch in iterate_minibatch(dat['train'], 32, len(dat['train'])):
      inputs = get_inputs(meta, batch, path)
      l, valp = ftrain(*inputs)
      log(l)
      print(valp)
      epoch_loss += l
      trs += 1
    # Average over the batches actually seen; an empty epoch stays at 0.
    if trs:
      epoch_loss /= trs
    # Report the epoch average (previously the last batch loss was logged
    # and the computed average was discarded).
    log('loss ' + str(epoch) + ' ' + str(epoch_loss))
  return ftrain
Esempio n. 41
0
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

from matlab_port.utils import partition_data, shuffle_data
from train import load_data

# Load the dataset, shuffle it, and hold out part of it for testing
# (partition_data is called with split=.8).
features, labels = load_data('data/ex4data1_conv.mat', 'numpy')
X, y = shuffle_data(features, labels)
X, y, X_test, y_test = partition_data(X, y, split=.8)

# Fit a logistic regression classifier with C=.1 on the training part.
classifier = LogisticRegression(C=.1)
classifier.fit(X, y)

# Accuracy = number of matching predictions / number of test labels.
z = classifier.predict(X_test)
n_correct = sum(y_test == z)
accuracy = n_correct / y_test.size
print("Test accuracy: {acc:.1%}".format(acc=accuracy))
Esempio n. 42
0
import joblib
from os import path
from train import load_data

from sklearn.metrics import accuracy_score

# File names of the persisted transformer and model.
TRANFORMER_NAME = 'tf_std_default_v0.1.pkl'
MODEL_NAME = 'model_mlp_default_v0.1.pkl'

# Locations.  The models folder is given relative to the working directory;
# DIR_NAME is computed but not used in this section.
DIR_NAME = path.dirname(__file__)
MODELS_FOLDER = path.join('.', 'models')
EXPERIMENT_NAME = path.join(MODELS_FOLDER, 'exp_01_default')

X, y = load_data()

# load models
transformer_path = path.join(EXPERIMENT_NAME, TRANFORMER_NAME)
model_path = path.join(EXPERIMENT_NAME, MODEL_NAME)
tf = joblib.load(transformer_path)
model = joblib.load(model_path)

# Transform the inputs with the loaded transformer, predict, and score
# against the loaded labels.
X_tf = tf.transform(X)
y_hat = model.predict(X_tf)

score = accuracy_score(y, y_hat)
print('accuracy score {}'.format(score))

# EXTRA_QUERY='(adulterated | scandal | countries | fake)'

# Shooter queries
# EXTRA_QUERY='( injured | wounded | victim )'
# EXTRA_QUERY='( suspect | shooter | identified | arrested | charged )'

if __name__ == "__main__":

    # Command line: <train file> <save file> <extra query>
    trainFile = sys.argv[1]
    saveFile = sys.argv[2]
    extra_query = sys.argv[3]

    # load data and process identifiers
    articles, identifiers = load_data(trainFile)
    identifiers_tmp = []
    titles = []
    for e in identifiers:
        e = e.split(",")
        for i in range(NUM_ENTITIES):
            # Numeric entity values are spelled out as words; anything that
            # cannot be converted (non-numeric text, missing field) is left
            # untouched.
            try:
                e[i] = int(e[i])
                e[i] = inflect_engine.number_to_words(e[i])
            except Exception:
                # Best effort on purpose, but unlike a bare ``except`` this
                # no longer swallows KeyboardInterrupt / SystemExit.
                pass
        # The first NUM_ENTITIES fields are the entities; the remaining
        # fields are joined back together as the title.
        identifiers_tmp.append(e[:NUM_ENTITIES])
        titles.append(",".join(e[NUM_ENTITIES:]))
    identifiers = identifiers_tmp

    # download related files
# Training settings; all five are passed to generate_basename below.
batch_size, nb_epoch_mask, nb_epoch_mask_exist = 16, 5, 4
dropout_mask, dropout_exist_mask = .2, .15

basename = generate_basename(
    batch_size,
    nb_epoch_mask,
    nb_epoch_mask_exist,
    dropout_mask,
    dropout_exist_mask,
)

# File name placeholders, all starting out empty.
weight_load = mask_filename = mask_exist_filename = ''
weight_mask_filename = weight_mask_exist_filename = ''

(imgs_train,
 imgs_mask_train,
 imgs_patient_train,
 imgs_test) = load_data(data_path)

def multiply_mask(mask_filename, mask_exist_filename):
    """Zero out predicted masks whose existence flag is 0.

    Both arguments are paths loaded with ``np.load``.  For every index whose
    entry in the existence array equals 0, ``[index, 0]`` of the mask array
    is set to 0.  The (modified) mask array is returned.
    """
    masks = np.load(mask_filename)
    exists = np.load(mask_exist_filename)
    for row, flag in enumerate(exists):
        if flag == 0:
            masks[row, 0] = 0
    return masks

# Four independent, initially empty score lists: mask and mask-existence
# scores, each for train and val.
score_mask_train, score_mask_val = [], []
score_exist_mask_train, score_exist_mask_val = [], []

for n in range(10):
Esempio n. 45
0
def test(num_features, model_name):
    df = train.load_data()
    test_with_data(num_features, model_name, df)
    print "reload helper"
    reload(helper)
    helper.load_constants()
    print "end load helper"

    retrain =  True
    if retrain:
        num_blocks = 1
        ## num_blocks = 5
        training_file = "../data/tagged_data/EMA/train.tag"
        dev_file      = "../data/tagged_data/EMA/dev.tag"
        test_file      = "../data/tagged_data/EMA/test.tag"

        trained_model = "trained_model_crf.EMA.p"
        print "load files"
        train_data, train_identifier = train.load_data(training_file)
        test_data, test_identifier = train.load_data(dev_file)
        print "End load files"
        prev_n = 2
        next_n = 2
        print "Start Feature extract on train set"
        trainX, trainY = featureExtract(train_data,train_identifier, prev_n, next_n )
        print "Done Feature extract on train set"
        #trainX, trainY = featureExtract(dev_data, prev_n, next_n)
        print "Start Feature extract on test set"
        testX, testY = featureExtract(test_data, test_identifier, prev_n, next_n)
        print "Done Feature extract on test set"
        #testX, testY = featureExtract(train_data[split_index:], prev_n, next_n)
        trainer = trainModel(1)