def snp_association_tests(args):
    """Test each SNP for association with the selected principal components.

    For every 1-based component number in ``args.components`` the file
    ``<workdir>/analysis/snp_pc_<pc>_association_tests.tsv`` is written with
    one tab-separated ``chrom, pos, p-value`` row per entry of the data
    model's ``snp_feature_map``.  Progress is printed for SNP indices
    1, 2, 4, 8, ...

    Args:
        args: parsed command-line namespace; ``workdir`` and ``components``
            are read.

    Raises:
        ValueError: if a requested component is not in ``1..n_pcs``.
            Without this check a value below 1 would silently wrap around
            to a column counted from the end of the projection matrix.
    """
    workdir = args.workdir
    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    # The loaded summary is not used below; the call is kept so the
    # function still reads the same files as before.
    deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projections = model[PROJECTION_KEY]

    # Components are the columns of the projection matrix (one is selected
    # below with projections[:, pc - 1]), so their count is shape[1].
    n_pcs = projections.shape[1]
    components = list(args.components)
    for pc in components:
        if not 1 <= pc <= n_pcs:
            raise ValueError(
                "Component %s is out of range; expected a value between "
                "1 and %s." % (pc, n_pcs))

    data_model = read_features(workdir)
    n_iter = estimate_lr_iter(len(data_model.class_labels))
    # fit_intercept is off; per the original author's note the intercept is
    # set to the class ratios inside likelihood_ratio_test (not visible here).
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    for pc in components:
        flname = os.path.join(analysis_dir,
                              "snp_pc_%s_association_tests.tsv" % pc)
        # Component numbers are 1-based; the matrix columns are 0-based.
        pc_projections = projections[:, pc - 1]
        with open(flname, "w") as fl:
            next_output = 1
            snp_items = data_model.snp_feature_map.iteritems()
            for i, (snp_label, feature_idx) in enumerate(snp_items):
                chrom, pos = snp_label
                snp_features = data_model.feature_matrix[:, feature_idx]
                n_copies, class_labels, imputed_projections = \
                    generate_training_set(snp_features, pc_projections)
                imputed_projections = imputed_projections.reshape(-1, 1)

                # since we make multiple copies of the original samples,
                # we need to scale the log loss so that it is correct for
                # the original sample size
                try:
                    p_value = likelihood_ratio_test(
                        imputed_projections,
                        class_labels,
                        lr,
                        g_scaling_factor=1.0 / n_copies)
                # in case of underflow or overflow in a badly-behaving model
                except ValueError:
                    p_value = 1.0

                # Report progress at exponentially spaced SNP indices.
                if i == next_output:
                    print("%s SNP %s and PC %s has p-value %s"
                          % (i, snp_label, pc, p_value))
                    next_output *= 2

                fl.write("\t".join([chrom, pos, str(p_value)]))
                fl.write("\n")
def run_lrtest_gt_dep(data_model, project_summary, args, stats_dir, class_labels):
    """Write per-SNP likelihood-ratio p-values with genotypes as the labels.

    For every entry of ``data_model.snp_feature_map`` the SNP's genotype
    columns are upsampled together with ``class_labels``; the genotypes
    (reduced to one dimension with ``argmax``) serve as the labels and the
    one-hot encoded populations as the features of the test.  SNPs with a
    single observed genotype get a p-value of 1.0 without running the test.

    Output goes to ``<stats_dir>/snp_lrtests_gt.tsv`` with the header
    ``chrom, pos, snp_p_value``; p-values are formatted with ``%.2E``.
    Progress is printed for SNP indices 1, 2, 4, 8, ...

    Args:
        data_model: object providing ``feature_matrix`` and
            ``snp_feature_map``.
        project_summary: unused; kept for interface compatibility.
        args: unused; kept for interface compatibility.
        stats_dir: directory the output file is written to.
        class_labels: per-sample population labels.
    """
    genotypes = data_model.feature_matrix
    n_iter = estimate_lr_iter(len(class_labels))
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)
    encoder = OneHotEncoder(sparse=False)

    # since we make multiple copies of the original samples,
    # we need to scale the log loss so that it is correct for
    # the original sample size
    # NOTE(review): assumes upsample_features makes 3 copies -- confirm.
    n_copies = 3
    scaling_factor = 1.0 / n_copies

    with open(os.path.join(stats_dir, "snp_lrtests_gt.tsv"), "w") as fl:
        fl.write("\t".join(["chrom", "pos", "snp_p_value"]))
        fl.write("\n")

        next_output = 1
        snp_items = data_model.snp_feature_map.iteritems()
        for i, (pos_label, feature_idx) in enumerate(snp_items):
            chrom, pos = pos_label

            pops, snp_genotypes = upsample_features(
                class_labels, genotypes[:, feature_idx])

            # as we're using the genotypes as the labels,
            # they need to be one dimensional
            snp_genotypes = snp_genotypes.argmax(axis=1)

            # With a single genotype class there is nothing to test.
            p_value = 1.0
            if len(set(snp_genotypes)) > 1:
                # likewise, the pops need to be 2D and one-hot encoded
                pops = encoder.fit_transform(pops.reshape(-1, 1))
                p_value = likelihood_ratio_test(
                    pops,
                    snp_genotypes,
                    lr,
                    g_scaling_factor=scaling_factor)

            # Report progress at exponentially spaced SNP indices.
            if i == next_output:
                print("%s Position %s has p-value %s"
                      % (i, pos_label, p_value))
                next_output *= 2

            fl.write("\t".join([chrom, pos, "%.2E" % p_value]))
            fl.write("\n")
def run_likelihood_ratio_tests(features, project_summary, args, stats_dir):
    """Write a likelihood-ratio test p-value for every SNP.

    One tab-separated ``chrom, pos, p-value`` row per entry of
    ``features.snp_feature_map`` is written to
    ``<stats_dir>/snp_likelihood_ratio_tests.tsv``.  Progress is printed
    for SNP indices 1, 2, 4, 8, ...

    Args:
        features: object providing ``class_labels``, ``feature_matrix`` and
            ``snp_feature_map``.
        project_summary: its ``feature_encoding`` is passed to
            ``generate_training_set`` when the adjusted training set is used.
        args: namespace; ``intercept`` ("free-parameter" makes the
            classifier fit an intercept, "class-probabilities" is forwarded
            as ``set_intercept=True``) and ``training_set`` ("adjusted"
            selects the generated training set) are read.
        stats_dir: directory the output file is written to.

    Raises:
        ValueError: if there are not exactly two distinct class labels.
    """
    if len(set(features.class_labels)) != 2:
        raise ValueError("LR Test currently only supports 2 populations.")

    n_iter = estimate_lr_iter(len(features.class_labels))
    fit_intercept = args.intercept == "free-parameter"
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=fit_intercept)

    # None of these depend on the SNP, so compute them once.
    labels = np.array(features.class_labels)
    use_adjusted_training_set = args.training_set == "adjusted"
    set_intercept_to_class_prob = args.intercept == "class-probabilities"

    flname = "snp_likelihood_ratio_tests.tsv"
    with open(os.path.join(stats_dir, flname), "w") as fl:
        next_output = 1
        snp_items = features.snp_feature_map.iteritems()
        for i, (snp_label, feature_idx) in enumerate(snp_items):
            chrom, pos = snp_label
            snp_features = features.feature_matrix[:, feature_idx]

            if use_adjusted_training_set:
                training_labels, training_features = generate_training_set(
                    project_summary.feature_encoding,
                    labels,
                    snp_features)
            else:
                training_labels = labels
                training_features = snp_features

            # The test receives (training, original) pairs for both the
            # features and the labels.
            p_value = likelihood_ratio_test(
                (training_features, snp_features),
                (training_labels, labels),
                lr,
                set_intercept=set_intercept_to_class_prob)

            # Report progress at exponentially spaced SNP indices.
            if i == next_output:
                print("%s SNP %s has p-value %s" % (i, snp_label, p_value))
                next_output *= 2

            fl.write("\t".join([chrom, pos, str(p_value)]))
            fl.write("\n")
def pop_association_tests(args):
    """Test each principal component for association with the class labels.

    For every column of the stored PCA projections a likelihood-ratio test
    against the class labels is run, the classifier is then refit on the
    same column to obtain its accuracy and confusion matrix, and a
    ``component, p-value, accuracy`` row (component numbers start at 1) is
    written to ``<workdir>/analysis/population_pca_association_tests.tsv``.
    The same values plus the confusion matrix are printed.

    Args:
        args: parsed command-line namespace; ``workdir`` is read.
    """
    workdir = args.workdir
    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    # The loaded summary is not used below; the call is kept so the
    # function still reads the same files as before.
    deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projections = model[PROJECTION_KEY]
    data_model = read_features(workdir)

    n_iter = estimate_lr_iter(len(data_model.sample_labels))
    # Here the classifier fits its own intercept (fit_intercept=True) and
    # likelihood_ratio_test is told not to set it (set_intercept=False).
    # The iteration count is ten times the estimate; it is cast to int
    # because multiplying by 10. yields a float and n_iter is a count.
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=int(n_iter * 10.),
                       fit_intercept=True)

    pvalues_fl = os.path.join(analysis_dir,
                              "population_pca_association_tests.tsv")
    class_labels = np.array(data_model.class_labels)

    with open(pvalues_fl, "w") as fl:
        for i in xrange(projections.shape[1]):
            # A single component as an (n_samples, 1) feature matrix.
            features = projections[:, i].reshape(-1, 1)

            p_value = likelihood_ratio_test(features,
                                            class_labels,
                                            lr,
                                            set_intercept=False)

            # Refit on the same data to report training accuracy and the
            # confusion matrix alongside the p-value.
            lr.fit(features, class_labels)
            pred_labels = lr.predict(features)
            acc = 100. * accuracy_score(class_labels, pred_labels)
            cm = confusion_matrix(class_labels, pred_labels)

            print("%s %s %s" % (i + 1, p_value, acc))
            print(cm)
            print("")

            fl.write("%s\t%s\t%s\n" % (i + 1, p_value, acc))
def run_likelihood_ratio_tests(args):
    """Run a likelihood-ratio test for every SNP in a work directory.

    The class labels are the tested variables; when ``args.variables_fl``
    is given, the null-model variables and the sample subset come from
    that file.  One tab-separated ``chrom, pos, p-value`` row per entry of
    the data model's ``snp_feature_map`` is written to
    ``<workdir>/OUTPUT_DIR/OUTPUT_FLNAME``.  Progress is printed for SNP
    indices 1, 2, 4, 8, ...

    Exits the process with status 1 if ``args.workdir`` does not exist.

    Args:
        args: parsed command-line namespace; ``workdir`` and
            ``variables_fl`` are read.
    """
    workdir = args.workdir
    if not os.path.exists(workdir):
        print("Work directory '%s' does not exist." % workdir)
        sys.exit(1)

    stats_dir = os.path.join(workdir, OUTPUT_DIR)
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    # Loaded but not used further in this function.
    project_summary = deserialize(
        os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    data_model = read_features(workdir)
    genotypes = data_model.feature_matrix

    # NOTE(review): the iteration estimate uses all samples, even when a
    # subset is selected below -- confirm this is intended.
    n_iter = estimate_lr_iter(len(data_model.class_labels))
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    # Class labels as a single-column matrix of tested variables.
    testing_variables = np.array(data_model.class_labels).reshape(-1, 1)
    null_variables = None

    if args.variables_fl:
        selected_sample_ids, null_variables = parse_variables_file(
            args.variables_fl)
        selected_indices = select_samples(data_model, selected_sample_ids)

        # select subset and re-order
        genotypes = genotypes[selected_indices, :]
        testing_variables = testing_variables[selected_indices, :]

    n_copies = 3
    # since we make multiple copies of the original samples,
    # we need to scale the log loss so that it is correct for
    # the original sample size
    scaling_factor = 1.0 / n_copies

    testing_features, null_features = prepare_model_variables(
        n_copies, testing_variables, null_variables)

    # The previous labels are handed back to prepare_class_labels on each
    # iteration -- presumably so it can reuse the array; confirm.
    class_labels = None

    out_path = os.path.join(stats_dir, OUTPUT_FLNAME)
    with open(out_path, "w") as out:
        report_at = 1
        snp_items = data_model.snp_feature_map.iteritems()
        for idx, (pos_label, feature_idx) in enumerate(snp_items):
            chrom, pos = pos_label

            class_labels = prepare_class_labels(
                n_copies, genotypes[:, feature_idx], class_labels)

            p_value = likelihood_ratio_test(testing_features,
                                            class_labels,
                                            lr,
                                            features_null=null_features,
                                            g_scaling_factor=scaling_factor)

            # Report progress at exponentially spaced SNP indices.
            if idx == report_at:
                print("%s Position %s has p-value %s"
                      % (idx, pos_label, p_value))
                report_at *= 2

            out.write("\t".join([chrom, pos, str(p_value)]))
            out.write("\n")
def run_lrtest_pop_dep(features, project_summary, args, stats_dir, class_labels):
    """Write per-SNP likelihood-ratio p-values for the given class labels.

    One tab-separated ``chrom, pos, p-value`` row (p-value formatted with
    ``%.2E``) per entry of ``features.snp_feature_map`` is written to
    ``<stats_dir>/snp_lrtests_pop.tsv``.  Progress is printed for SNP
    indices 1, 2, 4, 8, ...

    Args:
        features: object providing ``feature_matrix`` and
            ``snp_feature_map``.
        project_summary: unused; kept for interface compatibility.
        args: namespace; the following attributes are read:
            ``intercept`` -- "free-parameter" makes the classifier fit an
            intercept, "class-probabilities" is forwarded as
            ``set_intercept=True``;
            ``adjustment`` -- anything but "none" upsamples the features
            and labels; "training-set" additionally passes
            (upsampled, original) pairs to the test; "scaling-factor"
            scales the test with 1/3;
            ``remove_empty_columns`` -- drop all-zero feature columns.
        stats_dir: directory the output file is written to.
        class_labels: per-sample class labels.
    """
    n_iter = estimate_lr_iter(len(class_labels))
    class_labels = np.array(class_labels)

    intercept_mode = args.intercept
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=(intercept_mode == "free-parameter"))
    set_intercept_to_class_prob = intercept_mode == "class-probabilities"

    adjustment = args.adjustment
    drop_empty_columns = args.remove_empty_columns
    if adjustment == "scaling-factor":
        scaling_factor = 1.0 / 3.0
    else:
        scaling_factor = 1.0

    out_path = os.path.join(stats_dir, "snp_lrtests_pop.tsv")
    with open(out_path, "w") as out:
        report_at = 1
        snp_items = features.snp_feature_map.iteritems()
        for idx, (snp_label, feature_idx) in enumerate(snp_items):
            chrom, pos = snp_label

            test_features = features.feature_matrix[:, feature_idx]
            if adjustment != "none":
                test_labels, test_features = upsample_features(
                    class_labels, test_features)
            else:
                test_labels = class_labels

            # remove columns that are all zeros since these
            # aren't true degrees of freedom. prevents
            # under-estimating significance
            if drop_empty_columns:
                empty = np.all(test_features == 0., axis=0)
                test_features = test_features[:, ~empty]

            if adjustment == "training-set":
                # NOTE(review): the original-feature half is not column
                # masked, unlike the upsampled half -- confirm the widths
                # still match when remove_empty_columns is set.
                test_features = (test_features,
                                 features.feature_matrix[:, feature_idx])
                test_labels = (test_labels, class_labels)

            p_value = likelihood_ratio_test(
                test_features,
                test_labels,
                lr,
                set_intercept=set_intercept_to_class_prob,
                g_scaling_factor=scaling_factor)

            # Report progress at exponentially spaced SNP indices.
            if idx == report_at:
                print("%s SNP %s has p-value %s" % (idx, snp_label, p_value))
                report_at *= 2

            out.write("\t".join([chrom, pos, "%.2E" % p_value]))
            out.write("\n")