Example #1
def test_error_distribution3(
        # alignment files
        group_1, group_2, group_3,  # these arguments should be strings that are used as the file suffix
        # which data to use
        strand, motif_start_positions, preprocess, events_per_pos, feature_set, title,
        # training params
        learning_algorithm, train_test_split, iterations, epochs, max_samples, batch_size,
        # model params
        learning_rate, L1_reg, L2_reg, hidden_dim, model_type, model_dir=None, extra_args=None,
        # output params
        out_path="./"):
    raise NotImplementedError  # function is stubbed out; everything below is currently unreachable
    # checks and file IO
    assert(len(motif_start_positions) >= 3)
    out_file = open(out_path + title + ".tsv", 'w')

    scores = []

    # get the whole dataset for the 3 groups
    list_of_datasets = []
    add_to_list = list_of_datasets.append

    collect_data_vectors_args = {
        "events_per_pos": events_per_pos,
        "portion": train_test_split,
        "strand": strand,
        "max_samples": max_samples,
        "feature_set": feature_set,
        "kmer_length": 6,
        "split_dataset": False,
    }
    # makes a list of tuples [(dataset_1, labels_1),..., (dataset_3, labels_3)] for group 1-3
    for n, group in enumerate((group_1, group_2, group_3)):
        dataset, labels = collect_data_vectors2(label=n,
                                                files=group,
                                                motif_starts=motif_start_positions[n],
                                                dataset_title=title + "_group{}".format(n),
                                                **collect_data_vectors_args)
        add_to_list((dataset, labels))
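
Because `split_dataset` is set to `False` here, each call to `collect_data_vectors2` is expected to hand back a single `(vectors, labels)` pair per group, rather than the train/cross-train/test triple used by the two functions below. A minimal sketch of that structure with toy NumPy arrays (shapes and label values are illustrative assumptions, not values from the project):

import numpy as np

# Toy stand-in for the structure documented above:
# [(dataset_1, labels_1), (dataset_2, labels_2), (dataset_3, labels_3)]
list_of_datasets = [
    (np.random.rand(5, 4), np.zeros(5, dtype=int)),    # group 0
    (np.random.rand(8, 4), np.ones(8, dtype=int)),     # group 1
    (np.random.rand(6, 4), np.full(6, 2, dtype=int)),  # group 2
]

for n, (vectors, labels) in enumerate(list_of_datasets):
    print("group {}: {} vectors, label {}".format(n, len(vectors), labels[0]))
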
Example #2
def classify_with_network3(
        # alignment files
        group_1, group_2, group_3,  # these arguments should be strings that are used as the file suffix
        # which data to use
        strand, motif_start_positions, preprocess, events_per_pos, feature_set, title,
        # training params
        learning_algorithm, train_test_split, iterations, epochs, max_samples, batch_size,
        # model params
        learning_rate, L1_reg, L2_reg, hidden_dim, model_type, model_dir=None, extra_args=None,
        # output params
        out_path="./"):
    # checks and file IO
    assert(len(motif_start_positions) >= 3)
    out_file = open(out_path + title + ".tsv", 'w')
    if model_dir is not None:
        print("looking for model in {}".format(os.path.abspath(model_dir)))
        model_file = find_model_path(os.path.abspath(model_dir), title)
    else:
        model_file = None
    # bin to hold accuracies for each iteration
    scores = []

    collect_data_vectors_args = {
        "events_per_pos": events_per_pos,
        "portion": train_test_split,
        "strand": strand,
        "max_samples": max_samples,
        "feature_set": feature_set,
        "kmer_length": 6
    }

    for i in xrange(iterations):
        list_of_datasets = []  # [((g1, g1l), (xg1, xg1l), (tg1, tg1l)), ... ]
        add_to_list = list_of_datasets.append
        for n, group in enumerate((group_1, group_2, group_3)):
            train_set, xtrain_set, test_set = collect_data_vectors2(label=n,
                                                                    files=group,
                                                                    motif_starts=motif_start_positions[n],
                                                                    dataset_title=title + "_group{}".format(n),
                                                                    **collect_data_vectors_args)
            add_to_list((train_set, xtrain_set, test_set))

        # unpack to make things easier, list_of_datasets[group][set_idx][vector/labels]
        c_train, c_tr_labels = list_of_datasets[0][0][0], list_of_datasets[0][0][1]
        c_xtr, c_xtr_targets = list_of_datasets[0][1][0], list_of_datasets[0][1][1]
        c_test, c_test_targets = list_of_datasets[0][2][0], list_of_datasets[0][2][1]

        mc_train, mc_tr_labels = list_of_datasets[1][0][0], list_of_datasets[1][0][1]
        mc_xtr, mc_xtr_targets = list_of_datasets[1][1][0], list_of_datasets[1][1][1]
        mc_test, mc_test_targets = list_of_datasets[1][2][0], list_of_datasets[1][2][1]

        hmc_train, hmc_tr_labels = list_of_datasets[2][0][0], list_of_datasets[2][0][1]
        hmc_xtr, hmc_xtr_targets = list_of_datasets[2][1][0], list_of_datasets[2][1][1]
        hmc_test, hmc_test_targets = list_of_datasets[2][2][0], list_of_datasets[2][2][1]

        nb_c_train, nb_c_xtr, nb_c_test = len(c_train), len(c_xtr), len(c_test)
        nb_mc_train, nb_mc_xtr, nb_mc_test = len(mc_train), len(mc_xtr), len(mc_test)
        nb_hmc_train, nb_hmc_xtr, nb_hmc_test = len(hmc_train), len(hmc_xtr), len(hmc_test)

        assert(nb_c_train > 0 and nb_mc_train > 0 and nb_hmc_train > 0), "got zero training vectors"

        # level training events so that the model gets equal exposure
        tr_level = np.min([nb_c_train, nb_mc_train, nb_hmc_train])
        xtr_level = np.min([nb_c_xtr, nb_mc_xtr, nb_hmc_xtr])
        test_level = np.min([nb_c_test, nb_mc_test, nb_hmc_test])

        # log how many vectors we got
        print("{motif}: got {C} C, {mC} mC, and {hmC} hmC, training vectors, leveled to {level}"
              .format(motif=title, C=nb_c_train, mC=nb_mc_train, hmC=nb_hmc_train, level=tr_level), file=sys.stderr)
        print("{motif}: got {xC} C, {xmC} mC, and {xhmC} hmC, cross-training vectors, leveled to {xlevel}"
              .format(motif=title, xC=nb_c_xtr, xmC=nb_mc_xtr, xhmC=nb_hmc_xtr, xlevel=xtr_level), file=sys.stderr)
        print("{motif}: got {xC} C, {xmC} mC, and {xhmC} hmC, test vectors, leveled to {tstLevel}"
              .format(motif=title, xC=len(c_test), xmC=len(mc_test), xhmC=len(hmc_test),
                      tstLevel=test_level), file=sys.stderr)

        # stack the data into one object
        # training data
        training_data = stack_and_level_datasets3(c_train, mc_train, hmc_train, tr_level)
        training_labels = append_and_level_labels3(c_tr_labels, mc_tr_labels, hmc_tr_labels, tr_level)

        # cross training
        xtrain_data = stack_and_level_datasets3(c_xtr, mc_xtr, hmc_xtr, xtr_level)
        xtrain_targets = append_and_level_labels3(c_xtr_targets, mc_xtr_targets, hmc_xtr_targets, xtr_level)

        # test
        test_data = stack_and_level_datasets3(c_test, mc_test, hmc_test, test_level)
        test_targets = append_and_level_labels3(c_test_targets, mc_test_targets, hmc_test_targets, test_level)

        prc_train, prc_xtrain, prc_test = preprocess_data(training_vectors=training_data,
                                                          xtrain_vectors=xtrain_data,
                                                          test_vectors=test_data,
                                                          preprocess=preprocess)

        #if evaluate is True:
        #    all_test_data = np.vstack((xtrain_data, test_data))
        #    all_test_targets = np.append(xtrain_targets, test_targets)
        #    errors, probs = evaluate_network(all_test_data, all_test_targets, model_dir, model_type, batch_size, extra_args)
        #    return

        # shuffle data
        X, y = shuffle_and_maintain_labels(prc_train, training_labels)

        working_directory_path = "{outpath}/{title}_Models/".format(outpath=out_path, title=title)
        if not os.path.exists(working_directory_path):
            os.makedirs(working_directory_path)
        trained_model_dir = "{workingdirpath}{iteration}/".format(workingdirpath=working_directory_path,
                                                                  iteration=i)

        training_routine_args = {
            "motif": title,
            "train_data": X,
            "labels": y,
            "xTrain_data": prc_xtrain,
            "xTrain_targets": xtrain_targets,
            "learning_rate": learning_rate,
            "L1_reg": L1_reg,
            "L2_reg": L2_reg,
            "epochs": epochs,
            "batch_size": batch_size,
            "hidden_dim": hidden_dim,
            "model_type": model_type,
            "model_file": model_file,
            "trained_model_dir": trained_model_dir,
            "extra_args": extra_args
        }

        if learning_algorithm == "annealing":
            net, summary = mini_batch_sgd_with_annealing(**training_routine_args)
        else:
            net, summary = mini_batch_sgd(**training_routine_args)

        errors, probs = predict(prc_test, test_targets, training_routine_args['batch_size'], net,
                                model_file=summary['best_model'])
        accuracy = 1 - np.mean(errors)
        probs = zip(probs, test_targets)

        print("{0}:{1}:{2} test accuracy.".format(title, i, (accuracy * 100)))
        out_file.write("{}\n".format(accuracy))
        scores.append(accuracy)

        with open("{}test_probs.pkl".format(trained_model_dir), 'wb') as probs_file:
            cPickle.dump(probs, probs_file)

    print(">{motif}\t{accuracy}".format(motif=title, accuracy=np.mean(scores), end="\n"), file=out_file)

    return net
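
The `stack_and_level_datasets3` / `append_and_level_labels3` helpers are not shown on this page; the calls above suggest they downsample each class to the shared `level` (the size of the smallest class) and stack the results, so the model sees every class equally often. A rough sketch of that idea, assuming plain NumPy arrays (the names and the truncate-to-first-`level` strategy are assumptions):

import numpy as np

def stack_and_level(datasets, level):
    # Keep only the first `level` vectors from every class, then stack them
    # into one matrix so each class contributes the same number of rows.
    return np.vstack([d[:level] for d in datasets])

def append_and_level(label_arrays, level):
    # Matching label vector: the first `level` labels from each class, concatenated.
    return np.concatenate([l[:level] for l in label_arrays])

# Example with three unevenly sized classes:
c, mc, hmc = np.random.rand(10, 4), np.random.rand(7, 4), np.random.rand(12, 4)
level = min(len(c), len(mc), len(hmc))                        # 7
data = stack_and_level([c, mc, hmc], level)                   # shape (21, 4)
labels = append_and_level([np.zeros(10), np.ones(7), np.full(12, 2)], level)
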
Example #3
def classify_with_network2(
        # alignment files
        group_1, group_2, group_3,
        # which data to use
        strand, motif_start_positions, preprocess, events_per_pos, feature_set, title,
        # training params
        learning_algorithm, train_test_split, iterations, epochs, max_samples, batch_size,
        # model params
        learning_rate, L1_reg, L2_reg, hidden_dim, model_type, model_dir=None, extra_args=None,
        # output params
        out_path="./"):
    print("2 way classification")
    assert(len(motif_start_positions) >= 2)
    out_file = open(out_path + title + ".tsv", 'w')
    if model_dir is not None:
        print("looking for model in {}".format(model_dir))
        model_file = find_model_path(model_dir, title)
    else:
        model_file = None

    # bin to hold accuracies for each iteration
    scores = []

    collect_data_vectors_args = {
        "events_per_pos": events_per_pos,
        "portion": train_test_split,
        "strand": strand,
        "max_samples": max_samples,
        "feature_set": feature_set,
        "kmer_length": 6
    }

    for i in xrange(iterations):
        list_of_datasets = []  # [((g1, g1l), (xg1, xg1l), (tg1, tg1l)), ... ]
        add_to_list = list_of_datasets.append
        for n, group in enumerate((group_1, group_2)):
            train_set, xtrain_set, test_set = collect_data_vectors2(label=n,
                                                                    files=group,
                                                                    motif_starts=motif_start_positions[n],
                                                                    dataset_title=title + "_group{}".format(n),
                                                                    **collect_data_vectors_args)
            add_to_list((train_set, xtrain_set, test_set))
        # unpack list
        g1_train, g1_tr_labels = list_of_datasets[0][0][0], list_of_datasets[0][0][1]
        g1_xtr, g1_xtr_targets = list_of_datasets[0][1][0], list_of_datasets[0][1][1]
        g1_test, g1_test_targets = list_of_datasets[0][2][0], list_of_datasets[0][2][1]

        g2_train, g2_tr_labels = list_of_datasets[1][0][0], list_of_datasets[1][0][1]
        g2_xtr, g2_xtr_targets = list_of_datasets[1][1][0], list_of_datasets[1][1][1]
        g2_test, g2_test_targets = list_of_datasets[1][2][0], list_of_datasets[1][2][1]

        nb_g1_train, nb_g1_xtr, nb_g1_test = len(g1_train), len(g1_xtr), len(g1_test)
        nb_g2_train, nb_g2_xtr, nb_g2_test = len(g2_train), len(g2_xtr), len(g2_test)
        assert(nb_g1_train > 0 and nb_g2_train > 0), "got {0} group 1 training and " \
                                                     "{1} group 2 training vectors".format(nb_g1_train, nb_g2_train)

        # level training and cross-training events so that the model gets equal exposure
        tr_level = np.min([nb_g1_train, nb_g2_train])
        xtr_level = np.min([nb_g1_xtr, nb_g2_xtr])
        test_level = np.min([nb_g1_test, nb_g2_test])
        print("{motif}: got {g1} group 1 and {g2} group 2 training vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_train, g2=nb_g2_train, level=tr_level))
        print("{motif}: got {g1} group 1 and {g2} group 2 cross-training vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_xtr, g2=nb_g2_xtr, level=xtr_level))
        print("{motif}: got {g1} group 1 and {g2} group 2 test vectors, leveled to {level}"
              .format(motif=title, g1=nb_g1_test, g2=nb_g2_test, level=test_level))

        training_data = stack_and_level_datasets2(g1_train, g2_train, tr_level)
        training_labels = append_and_level_labels2(g1_tr_labels, g2_tr_labels, tr_level)

        xtrain_data = stack_and_level_datasets2(g1_xtr, g2_xtr, xtr_level)
        xtrain_targets = append_and_level_labels2(g1_xtr_targets, g2_xtr_targets, xtr_level)

        test_data = stack_and_level_datasets2(g1_test, g2_test, test_level)
        test_targets = append_and_level_labels2(g1_test_targets, g2_test_targets, test_level)

        prc_train, prc_xtrain, prc_test = preprocess_data(training_vectors=training_data,
                                                          xtrain_vectors=xtrain_data,
                                                          test_vectors=test_data,
                                                          preprocess=preprocess)

        # evaluate

        X, y = shuffle_and_maintain_labels(prc_train, training_labels)

        trained_model_dir = "{0}{1}_Models/".format(out_path, title)

        training_routine_args = {
            "motif": title,
            "train_data": X,
            "labels": y,
            "xTrain_data": prc_xtrain,
            "xTrain_targets": xtrain_targets,
            "learning_rate": learning_rate,
            "L1_reg": L1_reg,
            "L2_reg": L2_reg,
            "epochs": epochs,
            "batch_size": batch_size,
            "hidden_dim": hidden_dim,
            "model_type": model_type,
            "model_file": model_file,
            "trained_model_dir": trained_model_dir,
            "extra_args": extra_args
        }

        if learning_algorithm == "annealing":
            net, summary = mini_batch_sgd_with_annealing(**training_routine_args)
        else:
            net, summary = mini_batch_sgd(**training_routine_args)

        errors, probs = predict(prc_test, test_targets, training_routine_args['batch_size'], net,
                                model_file=summary['best_model'])
        accuracy = 1 - np.mean(errors)
        print("{0}: {1} test accuracy.".format(title, (accuracy * 100)))
        out_file.write("{}\n".format(accuracy))
        scores.append(accuracy)

        with open("{}test_probs.pkl".format(trained_model_dir), 'wb') as probs_file:
            cPickle.dump(probs, probs_file)

    print(">{motif}\t{accuracy}".format(motif=title, accuracy=np.mean(scores), end="\n"), file=out_file)
    return net
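
`shuffle_and_maintain_labels` is also not shown on this page; judging from how it is used above, it shuffles the training vectors while keeping each label attached to its vector. A common way to do that with NumPy, offered only as a sketch of the idea (the function name with the `_sketch` suffix is hypothetical):

import numpy as np

def shuffle_and_maintain_labels_sketch(vectors, labels):
    # One shared permutation applied to both arrays keeps every
    # vector/label pairing intact while randomizing the order.
    vectors, labels = np.asarray(vectors), np.asarray(labels)
    perm = np.random.permutation(len(vectors))
    return vectors[perm], labels[perm]

# Usage:
X, y = shuffle_and_maintain_labels_sketch(np.arange(12).reshape(6, 2),
                                          np.array([0, 0, 0, 1, 1, 1]))
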