Example #1
0
def compute_bootstrap(data_name):
    """Estimate, via bootstrap resampling, the mean and variance of the best
    validation performance as a function of the number of hyperparameter
    assignments tried.

    For each classifier in the dataset, draws ``num_samples`` bootstrap samples
    of size ``n`` (for every ``n`` up to the number of recorded runs) and
    records the resulting mean/std of the sample maximum.

    Parameters
    ----------
    data_name : str
        Dataset identifier understood by ``load_data.from_file``.

    Returns
    -------
    (dict, dict)
        ``(classifier -> list of bootstrap means, classifier -> list of
        bootstrap stds)``, each list indexed by sample size - 1.
    """
    data = load_data.from_file(data_name, return_avg_time=True)

    # NOTE(review): 6919 looks like a specific training-set size key into the
    # loaded data — confirm against load_data's output format.
    classifier_to_performance = data[0][6919]

    num_samples = 50000  # number of bootstrap resamples per sample size

    c_to_means = {}
    c_to_vars = {}
    for classifier, performances in classifier_to_performance.items():
        bootstrap_means = []
        bootstrap_vars = []
        for n in range(len(performances)):
            cur_mean, cur_std = draw_bootstrap_samples(
                performances, n + 1, num_samples)
            bootstrap_means.append(cur_mean)
            bootstrap_vars.append(cur_std)

        c_to_means[classifier] = bootstrap_means
        c_to_vars[classifier] = bootstrap_vars

    return c_to_means, c_to_vars
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True, length_audit=True):
    """Load data and build feature sets for model training.

    Parameters
    ----------
    test : bool
        If True, force first-order features (smaller/faster run).
    order : int
        Feature order, forwarded to ``shared_setup``.
    learn_options : dict
        Mutable options dict; updated in place (e.g. ``all_genes``).
    data_file : str
        Path/identifier passed to ``load_data.from_file``.
    pam_audit, length_audit : bool
        Forwarded to ``feat.featurize_data`` sanity checks.

    Returns
    -------
    (Y, feature_sets, target_genes, learn_options, num_proc)
    """
    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options, "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        # (fixed typo: "learn_otions" -> "learn_options")
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
        # Python 3 print function (was a Py2 print statement).
        print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
        for i in range(Xdf.shape[0]):
            Xdf['30mer'].iloc[i] = util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"])
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:])  # chop the first nucleotide

    # `in` replaces the Python2-only dict.has_key().
    if 'left_right_guide_ind' in learn_options and learn_options['left_right_guide_ind'] is not None:
        seq_start, seq_end, expected_length = learn_options['left_right_guide_ind']
        Xdf['30mer'] = Xdf['30mer'].apply(lambda seq: seq[seq_start:seq_end])

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit=pam_audit, length_audit=length_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
Example #3
0
def compute_sample_maxes(data_name="sst2",
                         with_replacement=True,
                         return_avg_time=False):
    """Compute the expected-max curve for every (data size, classifier) pair.

    Parameters
    ----------
    data_name : str
        Dataset identifier understood by ``load_data.from_file``.
    with_replacement : bool
        Forwarded to ``sample_max`` (sampling scheme).
    return_avg_time : bool
        If True, ``load_data.from_file`` also returns average times,
        which are passed through to the caller.

    Returns
    -------
    dict, or (dict, avg_time) when ``return_avg_time`` is True
        ``sample_maxes[data_size][classifier]`` -> output of ``sample_max``.
    """
    data = load_data.from_file(data_name, return_avg_time=return_avg_time)
    avg_time = None
    if return_avg_time:
        avg_time = data[1]
        data = data[0]

    # Nested comprehension replaces the loop with its redundant
    # `if data_size not in sample_maxes` guard (keys are unique).
    sample_maxes = {
        data_size: {
            classifier: sample_max(runs, with_replacement)
            for classifier, runs in data[data_size].items()
        }
        for data_size in data
    }

    if return_avg_time:
        return sample_maxes, avg_time
    return sample_maxes
Example #4
0
def main():
    """Load one dataset, then draw and save a 2x2 grid of per-classifier
    plots for every data size in it.
    """
    data_name = "sst5"  # also available: "ag_news", "imdb", "hatespeech_10k"

    unformatted_data = load_data.from_file(data_name, True)[1]

    data = format_data(unformatted_data)

    # (removed leftover `import pdb; pdb.set_trace()` debugging artifact,
    # which halted execution here on every run)

    classifiers = expected_max_cond_n.get_classifiers(data)
    for data_size in data:

        fig = plt.figure()

        # NOTE(review): the 2x2 grid assumes at most 4 classifiers — confirm.
        for counter, classifier in enumerate(classifiers, start=1):
            cur_ax = fig.add_subplot(2, 2, counter)

            one_plot(data[data_size][classifier], classifier, cur_ax)

        save_plot(data_size, classifiers, data_name)
Example #5
0
def setup(test=False, order=1, learn_options=None, data_file=None):
    """Fill in ``learn_options`` defaults, load data, and build feature sets.

    Parameters
    ----------
    test : bool
        If True, configure for a quick test run and force order 1.
    order : int
        Feature order; stored into ``learn_options["order"]``.
    learn_options : dict
        Mutable options dict; defaults are filled in place.
    data_file : str
        Passed to ``load_data.from_file``.

    Returns
    -------
    (Y, feature_sets, target_genes, learn_options, num_proc)
    """
    # Multiprocessing settings must exist before configure() reads them.
    learn_options.setdefault('num_proc', None)
    learn_options.setdefault('num_thread_per_proc', None)

    num_proc = local_multiprocessing.configure(
        TEST=test,
        num_proc=learn_options["num_proc"],
        num_thread_per_proc=learn_options["num_thread_per_proc"])
    learn_options["num_proc"] = num_proc

    learn_options["order"] = order  # gets used many places in code, not just here

    # Remaining defaults, consolidated from the original repetitive
    # `if key not in ...` chain (which also checked num_proc /
    # num_thread_per_proc a second time, redundantly).
    defaults = {
        "cv": "gene",               # leave-one-gene-out CV when unspecified
        "normalize_features": True,
        "weighted": None,
        "all pairs": False,
        "include_known_pairs": False,
        "include_gene_guide_feature": 0,  # used as window size, so 0 is none
        # these default to True to match experiments before they were options:
        "gc_features": True,
        "nuc_features": True,
        "train_genes": None,
        "test_genes": None,
        "seed": 1,
        "flipV1target": False,
        "num_genes_remove_train": None,
        "include_microhomology": False,
    }
    for key, value in defaults.items():
        learn_options.setdefault(key, value)

    assert "testing_non_binary_target_name" in learn_options, "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        # (fixed typo: "learn_otions" -> "learn_options")
        raise Exception(
            'learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]'
        )

    Xdf, Y, gene_position, target_genes = load_data.from_file(
        data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
def setup(test=False, order=1, learn_options=None, data_file=None):
    """Fill in ``learn_options`` defaults, load data, and build feature sets.

    NOTE(review): this duplicates an earlier ``setup`` definition in this
    file; whichever is defined last wins at import time — confirm intent.

    Parameters
    ----------
    test : bool
        If True, configure for a quick test run and force order 1.
    order : int
        Feature order; stored into ``learn_options["order"]``.
    learn_options : dict
        Mutable options dict; defaults are filled in place.
    data_file : str
        Passed to ``load_data.from_file``.

    Returns
    -------
    (Y, feature_sets, target_genes, learn_options, num_proc)
    """
    # These must be present before configure() reads them.
    if 'num_proc' not in learn_options:
        learn_options['num_proc'] = None
    if 'num_thread_per_proc' not in learn_options:
        learn_options['num_thread_per_proc'] = None

    num_proc = local_multiprocessing.configure(
        TEST=test,
        num_proc=learn_options["num_proc"],
        num_thread_per_proc=learn_options["num_thread_per_proc"])
    learn_options["num_proc"] = num_proc

    learn_options["order"] = order  # gets used many places in code, not just here

    # Defaults consolidated from the original repetitive chain, which also
    # re-checked num_proc / num_thread_per_proc a second time, redundantly.
    for key, value in (
            ("cv", "gene"),                     # leave-one-gene-out CV
            ("normalize_features", True),
            ("weighted", None),
            ("all pairs", False),
            ("include_known_pairs", False),
            ("include_gene_guide_feature", 0),  # window size; 0 is none
            # default True to match experiments before they were options:
            ("gc_features", True),
            ("nuc_features", True),
            ("train_genes", None),
            ("test_genes", None),
            ("seed", 1),
            ("flipV1target", False),
            ("num_genes_remove_train", None),
            ("include_microhomology", False)):
        if key not in learn_options:
            learn_options[key] = value

    assert "testing_non_binary_target_name" in learn_options, "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        # (fixed typo: "learn_otions" -> "learn_options")
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
Example #7
0
    # Deterministic splitting for comparability: training data is the
    # odd-numbered samples, test data the even-numbered samples (1-based).

    # Training data is odd numbered samples.
    X_train = X[::2]
    y_train = y[::2]

    # Test data is even numbered samples.
    X_test = X[1::2]
    y_test = y[1::2]

    # Sanity checks.
    assert np.shape(np.vstack([X_train, X_test])) == np.shape(X)
    assert np.isclose(len(y), len(np.append(y_train, y_test)))

    return X_train, X_test, y_train, y_test


if __name__ == "__main__":

    from load_data import from_file

    X, y = from_file("ds-1.txt")
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print(X[:10])
    print()
    print(X_train[:5])
    print()
    print(X_test[:5])
Example #8
0
def get_data():
    """Load every dataset named in the module-level ``data_name`` iterable.

    Returns an ``OrderedDict`` mapping each name to the result of
    ``load_data.from_file(name, return_avg_time=False)``, preserving the
    iteration order of ``data_name``.
    """
    return collections.OrderedDict(
        (name, load_data.from_file(name, return_avg_time=False))
        for name in data_name)