def cluster_samples(args):
    """Cluster samples on selected PCA components and write membership lists.

    Runs k-means on the chosen (1-based) principal components and writes
    one line per cluster -- the cluster id followed by its sample names --
    under <workdir>/analysis/.
    """
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    # loaded as in the sibling commands; the value itself is not used here
    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projected = model[PROJECTION_KEY]

    # user-facing component ids are 1-based; columns are 0-based
    column_indices = [c - 1 for c in args.components]
    selected = projected[:, column_indices]

    features = read_features(workdir)

    _, labels, inertia = k_means(selected, args.n_clusters, n_jobs=-2)

    out_flname = os.path.join(analysis_dir,
                              "clusters_%s.tsv" % args.n_clusters)

    # group sample names by their assigned cluster id
    clusters = defaultdict(list)
    for sample_name, cluster_id in zip(features.sample_labels, labels):
        clusters[cluster_id].append(sample_name)

    # NOTE(review): the file carries a .tsv extension but fields are
    # comma-separated -- confirm which format downstream consumers expect
    with open(out_flname, "w") as fl:
        for cluster_id, members in clusters.iteritems():
            fl.write(str(cluster_id))
            fl.write(",")
            fl.write(",".join(members))
            fl.write("\n")
def analyze_rankings(args):
    """Write and plot SNP-ranking overlap curves for the given LR method.

    NOTE(review): a second `analyze_rankings` is defined later in this
    file and will shadow this one at import time -- confirm which
    definition is intended to be live.
    """
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    analysis_dir = os.path.join(workdir, "analysis")
    for out_dir in (figures_dir, analysis_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    all_snps = read_snps(workdir, args.method)

    n_models, common_feature_percentages = similarity_curves(
        args.thresholds, all_snps, project_summary.filtered_positions)

    # tabular output
    analysis_flname = os.path.join(
        analysis_dir, "snp_ranking_overlaps_" + args.method + ".tsv")
    write_similarity_curves(analysis_flname, args.thresholds, n_models,
                            common_feature_percentages)

    # figure output (extension added by the plotting helper)
    flname_base = os.path.join(figures_dir,
                               "snp_ranking_overlaps_" + args.method)
    plot_similarity_curves(flname_base, args.thresholds, n_models,
                           common_feature_percentages)
def analyze_rankings(args):
    """Random-forest variant of the ranking-overlap analysis.

    Takes `args` as a dict (unlike the attribute-style variant defined
    earlier in this file, which this definition shadows -- confirm which
    is intended). Also plots sampled-SNP count curves.
    """
    workdir = args["workdir"]

    figures_dir = os.path.join(workdir, "figures")
    analysis_dir = os.path.join(workdir, "analysis")
    for out_dir in (figures_dir, analysis_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    all_snps = read_rf_snps(workdir)

    thresholds = args["thresholds"]
    n_models, common_feature_percentages = similarity_curves(
        thresholds, all_snps, project_summary.filtered_positions)

    # tabular overlap output
    write_similarity_curves(
        os.path.join(analysis_dir, "snp_ranking_overlaps_rf.tsv"),
        thresholds, n_models, common_feature_percentages)

    # overlap figure
    plot_similarity_curves(
        os.path.join(figures_dir, "snp_ranking_overlaps_rf"),
        thresholds, n_models, common_feature_percentages)

    # sampled-SNP count curves
    n_models, common_feature_counts, snp1_feature_counts, \
        snp2_feature_counts = sampled_snps_curves(all_snps)
    plot_sampled_snps_curves(
        os.path.join(figures_dir, "snp_counts"),
        n_models, common_feature_counts,
        snp1_feature_counts, snp2_feature_counts)
def output_coordinates(args):
    """Write per-sample PCA coordinates for selected components to a TSV.

    Each row is: sample name, population index, population name, then one
    column per selected (1-based) principal component.
    """
    workdir = args.workdir

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projected = model[PROJECTION_KEY]

    # user-facing component ids are 1-based; columns are 0-based
    selected = projected[:, [c - 1 for c in args.selected_components]]

    features = read_features(workdir)

    with open(args.output_fl, "w") as fl:
        header = ["sample", "population_index", "population_name"]
        header.extend(str(c) for c in args.selected_components)
        fl.write("\t".join(header))
        fl.write("\n")

        for i, sample in enumerate(features.sample_labels):
            pop_idx = features.class_labels[i]
            pop_name = project_summary.population_names[pop_idx]
            row = [sample, str(pop_idx), pop_name]
            row.extend(str(v) for v in selected[i, :])
            fl.write("\t".join(row))
            fl.write("\n")
def sweep_clusters(args): workdir = args.workdir figures_dir = os.path.join(workdir, "figures") if not os.path.exists(figures_dir): os.makedirs(figures_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projected = model[PROJECTION_KEY] components = map(lambda idx: idx - 1, args.components) selected = projected[:, components] features = read_features(workdir) inertia_values = [] for k in args.n_clusters: print "Clustering with %s clusters" % k _, _, inertia = k_means(selected, k, n_jobs=-2) inertia_values.append(inertia) plt.plot(args.n_clusters, inertia_values, "k.-") plt.xlabel("Number of Clusters", fontsize=16) plt.ylabel("Inertia", fontsize=16) fig_flname = os.path.join(figures_dir, "cluster_inertia") for dim in args.components: fig_flname += "_%s" % dim fig_flname += ".png" plt.savefig(fig_flname, DPI=300)
def snp_association_tests(args): workdir = args.workdir analysis_dir = os.path.join(workdir, "analysis") if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projections = model[PROJECTION_KEY] data_model = read_features(workdir) n_iter = estimate_lr_iter(len(data_model.class_labels)) # we set the intercept to the class ratios in the lr test function lr = SGDClassifier(penalty="l2", loss="log", n_iter=n_iter, fit_intercept=False) n_pcs = projections.shape[0] for pc in args.components: flname = os.path.join(analysis_dir, "snp_pc_%s_association_tests.tsv" % pc) with open(flname, "w") as fl: next_output = 1 for i, pair in enumerate(data_model.snp_feature_map.iteritems()): snp_label, feature_idx = pair chrom, pos = snp_label snp_features = data_model.feature_matrix[:, feature_idx] triplet = generate_training_set(snp_features, projections[:, pc - 1]) n_copies, class_labels, imputed_projections = triplet imputed_projections = imputed_projections.reshape(-1, 1) # since we make multiple copies of the original samples, # we need to scale the log loss so that it is correct for # the original sample size try: p_value = likelihood_ratio_test(imputed_projections, class_labels, lr, g_scaling_factor=1.0 / n_copies) # in case of underflow or overflow in a badly-behaving model except ValueError: p_value = 1.0 if i == next_output: print i, "SNP", snp_label, "and PC", pc, "has p-value", p_value next_output *= 2 fl.write("\t".join([chrom, pos, str(p_value)])) fl.write("\n")
def plot_projections(args): if len(args.pairs) % 2 != 0: print "Error: PCs must be provided in pairs of 2" sys.exit(1) workdir = args.workdir figures_dir = os.path.join(workdir, "figures") if not os.path.exists(figures_dir): os.makedirs(figures_dir) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projected = model[PROJECTION_KEY] features = read_features(workdir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) all_labels = set(features.class_labels) labels = np.array(features.class_labels, dtype=np.int32) populations = [] for l in all_labels: pop = labels == l pop_name = project_summary.population_names[l] populations.append((pop, pop_name)) for p1, p2 in pairwise(args.pairs): fig_flname = os.path.join( figures_dir, "pca_projection_%s_%s.png" % (str(p1), str(p2))) plt.clf() plt.grid(True) colors = ["m", "c", "k", "r", "g", "b"] markers = ["o"] * len(colors) + \ ["s"] * len(colors) + \ ["+"] * len(colors) for idx, (pop_idx, pop_name) in enumerate(populations): plt.scatter(projected[pop_idx, p1 - 1], projected[pop_idx, p2 - 1], color=colors[idx % len(colors)], marker=markers[idx % len(markers)], edgecolor="k", alpha=0.7, label=pop_name) plt.xlabel("Principal Component %s" % p1, fontsize=16) plt.ylabel("Principal Component %s" % p2, fontsize=16) if len(all_labels) > 1: plt.legend() plt.savefig(fig_flname, DPI=300)
def snp_linreg_association_tests(args): workdir = args.workdir analysis_dir = os.path.join(workdir, "analysis") if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projections = model[PROJECTION_KEY] data_model = read_features(workdir) n_pcs = projections.shape[0] for pc in args.components: flname = os.path.join(analysis_dir, "snp_pc_%s_linreg_assoc_tests.tsv" % pc) with open(flname, "w") as fl: next_output = 1 for i, pair in enumerate(data_model.snp_feature_map.iteritems()): snp_label, feature_idx = pair chrom, pos = snp_label snp_features = data_model.feature_matrix[:, feature_idx] # since we make multiple copies of the original samples, # we need to scale the log loss so that it is correct for # the original sample size triplet = snp_linreg_pvalues(snp_features, projections[:, pc - 1]) snp_p_value, gt_ttest_pvalues, gt_normality_pvalues, gt_pred_ys = triplet if i == next_output: print i, "SNP", snp_label, "and PC", pc, "has p-value", snp_p_value next_output *= 2 fl.write("\t".join([chrom, pos, str(snp_p_value)])) for j in xrange(3): fl.write("\t") fl.write(str(gt_ttest_pvalues[j])) fl.write("\t") fl.write(str(gt_normality_pvalues[j])) fl.write("\t") fl.write(str(gt_pred_ys[j])) fl.write("\n")
def read_snps(basedir, method):
    """Load serialized SNP rankings for one LR method, keyed by model count.

    Scans <basedir>/models/lr-<method>/<n_models>/* and returns a mapping
    from the integer n_models (taken from the subdirectory name) to the
    list of deserialized ranking objects found inside it. Returns an
    empty dict when the method directory does not exist.
    """
    method_dir = os.path.join(basedir, "models", "lr-" + method)
    if not os.path.exists(method_dir):
        return dict()

    models = defaultdict(list)
    for run_dir in glob.glob(os.path.join(method_dir, "*")):
        for flname in glob.glob(os.path.join(run_dir, "*")):
            snps = deserialize(flname)
            # parent directory name encodes the number of models
            n_models = int(os.path.basename(os.path.dirname(flname)))
            models[n_models].append(snps)
    return models
def pop_association_tests(args): workdir = args.workdir analysis_dir = os.path.join(workdir, "analysis") if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projections = model[PROJECTION_KEY] data_model = read_features(workdir) n_iter = estimate_lr_iter(len(data_model.sample_labels)) # we set the intercept to the class ratios in the lr test function lr = SGDClassifier(penalty="l2", loss="log", n_iter=n_iter * 10., fit_intercept=True) pvalues_fl = os.path.join(analysis_dir, "population_pca_association_tests.tsv") class_labels = np.array(data_model.class_labels) with open(pvalues_fl, "w") as fl: for i in xrange(projections.shape[1]): features = projections[:, i].reshape(-1, 1) p_value = likelihood_ratio_test(features, class_labels, lr, set_intercept=False) lr.fit(features, class_labels) pred_labels = lr.predict(features) acc = 100. * accuracy_score(class_labels, pred_labels) cm = confusion_matrix(class_labels, pred_labels) print(i + 1), p_value, acc print cm print fl.write("%s\t%s\t%s\n" % (i + 1, p_value, acc))
def analyze_weights(args):
    """Plot histograms of scaled PCA component weights.

    For each scale factor w in args.weights (paired positionally with
    component i), plots the distribution of w * pca.components_[i, :]
    and writes pca_feature_weights_pc<i+1>.png under <workdir>/figures/.
    """
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    # loaded as in the sibling commands; the value itself is not used here
    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    pca = model[MODEL_KEY]

    for i, w in enumerate(args.weights):
        plt.clf()
        seaborn.distplot(w * pca.components_[i, :], kde=False)
        plt.xlabel("Feature Weights", fontsize=16)
        plt.ylabel("Count(Features)", fontsize=16)
        plot_fl = os.path.join(figures_dir,
                               "pca_feature_weights_pc%s.png" % (i + 1))
        # BUG FIX: savefig's keyword is lowercase `dpi`; the previous
        # `DPI=300` was an unknown keyword and the setting was ignored
        plt.savefig(plot_fl, dpi=300)
return parser.parse_args() if __name__ == "__main__": args = parseargs() if not os.path.exists(args.workdir): print "Work directory '%s' does not exist." % args.workdir sys.exit(1) stats_dir = os.path.join(args.workdir, "statistics") if not os.path.exists(stats_dir): os.makedirs(stats_dir) project_summary = deserialize( os.path.join(args.workdir, PROJECT_SUMMARY_FLNAME)) features = read_features(args.workdir) if args.mode == "pairwise": if project_summary.feature_encoding != "categories": print "Pairwise Cramer's V only works with the 'categories' feature encoding." sys.exit(1) set_one = None if args.subset_1: with open(args.subset_1) as fl: set_one = set() for ln in fl: cols = ln.strip().split() set_one.add((cols[0], cols[1]))
def run_likelihood_ratio_tests(args):
    """Run genotype-vs-label likelihood-ratio tests for every position.

    Tests each position's genotypes for association with the class labels
    (optionally restricted to samples listed in args.variables_fl, with
    extra null-model covariates), writing one (chrom, pos, p-value) row
    per position to <workdir>/<OUTPUT_DIR>/<OUTPUT_FLNAME>.
    """
    if not os.path.exists(args.workdir):
        print "Work directory '%s' does not exist." % args.workdir
        sys.exit(1)

    stats_dir = os.path.join(args.workdir, OUTPUT_DIR)
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    # project_summary is loaded but not read below; presumably kept for
    # its validation side effect -- TODO confirm
    project_summary = deserialize(
        os.path.join(args.workdir, PROJECT_SUMMARY_FLNAME))

    data_model = read_features(args.workdir)
    genotypes = data_model.feature_matrix
    n_iter = estimate_lr_iter(len(data_model.class_labels))
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    # class labels become the tested explanatory variable (one column)
    testing_variables = np.array(data_model.class_labels).reshape(-1, 1)

    null_variables = None
    if args.variables_fl:
        # optional covariates for the null model, plus a sample subset
        selected_sample_ids, null_variables = parse_variables_file(
            args.variables_fl)
        selected_indices = select_samples(data_model,
                                          selected_sample_ids)
        # select subset and re-order
        genotypes = genotypes[selected_indices, :]
        testing_variables = testing_variables[selected_indices, :]

    # each sample is replicated N_COPIES times by the prepare_* helpers
    N_COPIES = 3
    # carried across loop iterations so prepare_class_labels can reuse
    # its buffer -- do not reset inside the loop
    class_labels = None
    testing_features, null_features = prepare_model_variables(
        N_COPIES, testing_variables, null_variables)

    with open(os.path.join(stats_dir, OUTPUT_FLNAME), "w") as fl:
        # print progress at exponentially spaced iteration counts
        next_output = 1
        for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
            pos_label, feature_idx = pair
            chrom, pos = pos_label
            pos_genotypes = genotypes[:, feature_idx]

            class_labels = prepare_class_labels(N_COPIES,
                                                pos_genotypes,
                                                class_labels)

            # since we make multiple copies of the original samples,
            # we need to scale the log loss so that it is correct for
            # the original sample size
            p_value = likelihood_ratio_test(testing_features,
                                            class_labels,
                                            lr,
                                            features_null=null_features,
                                            g_scaling_factor=1.0 / N_COPIES)

            if i == next_output:
                print i, "Position", pos_label, "has p-value", p_value
                next_output *= 2

            fl.write("\t".join([chrom, pos, str(p_value)]))
            fl.write("\n")