def compute_bootstrap(data_name):
    data = load_data.from_file(data_name, return_avg_time=True)
    classifier_to_performance = data[0][6919]  # performance dict for the 6919-example data size

    # Closed-form expected-max estimates, kept just for comparison with the bootstrap.
    closed_form_mean_var, avg_times = compute_sample_maxes(data_name, True, True)

    c_to_means = {}
    c_to_vars = {}
    for num_samples in [50000]:
        for classifier in classifier_to_performance:
            bootstrap_means = []
            bootstrap_vars = []
            # Estimate the expected best-of-n score (and its spread) for each n.
            for n in range(len(classifier_to_performance[classifier])):
                cur_mean, cur_std = draw_bootstrap_samples(
                    classifier_to_performance[classifier], n + 1, num_samples)
                bootstrap_means.append(cur_mean)
                bootstrap_vars.append(cur_std)
            c_to_means[classifier] = bootstrap_means
            c_to_vars[classifier] = bootstrap_vars

    return c_to_means, c_to_vars, closed_form_mean_var, avg_times
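# The helper `draw_bootstrap_samples` used above is not shown in this section. Below is a
# minimal sketch (an assumption, not necessarily the repository's implementation) of what it
# plausibly does: draw `num_samples` bootstrap resamples of size `n` with replacement and
# return the mean and standard deviation of the per-resample maxima.
import numpy as np


def draw_bootstrap_samples(scores, n, num_samples):
    """SKETCH: bootstrap estimate of the expected best-of-n validation score."""
    scores = np.asarray(scores, dtype=float)
    # Shape (num_samples, n): each row is one bootstrap resample of n runs.
    resamples = np.random.choice(scores, size=(num_samples, n), replace=True)
    maxes = resamples.max(axis=1)
    return float(maxes.mean()), float(maxes.std())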
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True, length_audit=True):
    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options.keys(), \
        "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
        print("WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)")
        for i in range(Xdf.shape[0]):
            Xdf['30mer'].iloc[i] = util.convert_to_thirty_one(
                Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"])
        # to_keep = Xdf['30mer'].isnull() == False
        # Xdf = Xdf[to_keep]
        # gene_position = gene_position[to_keep]
        # Y = Y[to_keep]
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:])  # chop the first nucleotide

    if 'left_right_guide_ind' in learn_options and learn_options['left_right_guide_ind'] is not None:
        seq_start, seq_end, expected_length = learn_options['left_right_guide_ind']
        Xdf['30mer'] = Xdf['30mer'].apply(lambda seq: seq[seq_start:seq_end])

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position,
                                       pam_audit=pam_audit, length_audit=length_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
def compute_sample_maxes(data_name="sst2", with_replacement=True, return_avg_time=False):
    data = load_data.from_file(data_name, return_avg_time=return_avg_time)
    if return_avg_time:
        avg_time = data[1]
        data = data[0]

    sample_maxes = {}
    for data_size in data:
        if data_size not in sample_maxes:
            sample_maxes[data_size] = {}
        for classifier in data[data_size]:
            sample_maxes[data_size][classifier] = sample_max(
                data[data_size][classifier], with_replacement)

    if return_avg_time:
        return sample_maxes, avg_time
    else:
        return sample_maxes
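# `sample_max` is referenced above but not defined in this section. A minimal sketch of one
# plausible implementation (an assumption, not the repository's code): for each n, compute the
# expected maximum of n validation scores drawn with replacement from the empirical
# distribution, via E[max] = sum_i v_i * (F(v_i)^n - F(v_{i-1})^n) over the sorted scores.
# The without-replacement case is omitted here.
import numpy as np


def sample_max(scores, with_replacement=True):
    """SKETCH: expected best-of-n score for n = 1 .. len(scores), sampling with replacement."""
    scores = np.sort(np.asarray(scores, dtype=float))
    m = len(scores)
    cdf = np.arange(1, m + 1) / m  # empirical P(V <= v_i)
    expected_maxes = []
    for n in range(1, m + 1):
        # Probability that v_i is the maximum of n i.i.d. draws.
        prob_is_max = cdf ** n - np.concatenate(([0.0], cdf[:-1])) ** n
        expected_maxes.append(float(np.sum(prob_is_max * scores)))
    return expected_maxes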
def main():
    data_name = "sst5"  # other datasets used elsewhere: "ag_news", "imdb", "hatespeech_10k"
    unformatted_data = load_data.from_file(data_name, True)[1]
    data = format_data(unformatted_data)

    classifiers = expected_max_cond_n.get_classifiers(data)
    for data_size in data:
        fig = plt.figure()
        counter = 0
        # One subplot per classifier in a 2x2 grid.
        for classifier in classifiers:
            counter += 1
            cur_ax = fig.add_subplot(2, 2, counter)
            one_plot(data[data_size][classifier], classifier, cur_ax)
        save_plot(data_size, classifiers, data_name)
def setup(test=False, order=1, learn_options=None, data_file=None):
    if 'num_proc' not in learn_options.keys():
        learn_options['num_proc'] = None
    if 'num_thread_per_proc' not in learn_options.keys():
        learn_options['num_thread_per_proc'] = None

    num_proc = local_multiprocessing.configure(TEST=test, num_proc=learn_options["num_proc"],
                                               num_thread_per_proc=learn_options["num_thread_per_proc"])
    learn_options["num_proc"] = num_proc

    learn_options["order"] = order  # gets used many places in code, not just here

    if "cv" not in learn_options.keys():
        # if no CV preference is specified, use leave-one-gene-out
        learn_options["cv"] = "gene"

    if "normalize_features" not in learn_options.keys():
        # if no preference is specified, normalize features
        learn_options["normalize_features"] = True

    if "weighted" not in learn_options.keys():
        learn_options['weighted'] = None

    if "all pairs" not in learn_options.keys():
        learn_options["all pairs"] = False

    if "include_known_pairs" not in learn_options.keys():
        learn_options["include_known_pairs"] = False

    if "include_gene_guide_feature" not in learn_options.keys():
        learn_options["include_gene_guide_feature"] = 0  # used as window size, so 0 is none

    # these should default to True to match experiments before they were options:
    if "gc_features" not in learn_options.keys():
        learn_options["gc_features"] = True
    if "nuc_features" not in learn_options.keys():
        learn_options["nuc_features"] = True

    if 'train_genes' not in learn_options.keys():
        learn_options["train_genes"] = None
    if 'test_genes' not in learn_options.keys():
        learn_options["test_genes"] = None

    if 'seed' not in learn_options:
        learn_options['seed'] = 1
    if "flipV1target" not in learn_options:
        learn_options["flipV1target"] = False
    if 'num_genes_remove_train' not in learn_options:
        learn_options['num_genes_remove_train'] = None
    if "include_microhomology" not in learn_options:
        learn_options["include_microhomology"] = False

    assert "testing_non_binary_target_name" in learn_options.keys(), \
        "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
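# Hypothetical usage sketch for setup(); "path/to/guide_data.csv" is a placeholder, not a file
# shipped with the repository, and only the option that setup() asserts on is shown.
if __name__ == "__main__":
    learn_options = {
        # Required by setup(); must be one of 'ranks', 'raw', or 'thrs'.
        "testing_non_binary_target_name": "ranks",
    }
    Y, feature_sets, target_genes, learn_options, num_proc = setup(
        test=True,
        order=1,
        learn_options=learn_options,
        data_file="path/to/guide_data.csv",
    )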
import numpy as np


def train_test_split(X, y):
    # Deterministic splitting for comparability: training data takes the even-indexed
    # samples (0, 2, 4, ...), test data the odd-indexed samples (1, 3, 5, ...).
    X_train = X[::2]
    y_train = y[::2]

    X_test = X[1::2]
    y_test = y[1::2]

    # Sanity checks.
    assert np.shape(np.vstack([X_train, X_test])) == np.shape(X)
    assert np.isclose(len(y), len(np.append(y_train, y_test)))

    return X_train, X_test, y_train, y_test


if __name__ == "__main__":
    from load_data import from_file

    X, y = from_file("ds-1.txt")
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print(X[:10])
    print()
    print(X_train[:5])
    print()
    print(X_test[:5])
def get_data():
    # NOTE: relies on a module-level `data_name` iterable of dataset names.
    data = collections.OrderedDict()
    for cur_name in data_name:
        data[cur_name] = load_data.from_file(cur_name, return_avg_time=False)
    return data
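# SKETCH (assumption): get_data() expects a module-level `data_name` iterable plus
# `collections` and `load_data` imports at the top of its file. The dataset names below are
# just the ones that appear elsewhere in this section, used as a hypothetical example.
data_name = ["sst2", "sst5"]

if __name__ == "__main__":
    all_data = get_data()
    for name, payload in all_data.items():
        print(name, type(payload))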