# Shared imports for the examples below. The project-internal helpers
# (read_features, make_labels, write_snps, write_rf_snps, deserialize,
# the *_KEY / *_FLNAME constants, etc.) live in the project's own modules,
# which this page does not show.
import os
import sys
from collections import defaultdict

import joblib
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import k_means
from sklearn.decomposition import NMF, PCA
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


def train(args, sample_populations):
    workdir = args.workdir

    features = read_features(workdir)
    class_labels = make_labels(features.sample_labels, sample_populations)

    print "Training Ensemble 1"
    lr1 = LogisticRegressionEnsemble(args.n_models,
                                     args.method,
                                     args.batch_size,
                                     bagging=args.bagging)
    feature_importances = lr1.feature_importances(features.feature_matrix,
                                                  class_labels)
    snp_importances = features.rank_snps(feature_importances)
    write_snps(workdir, snp_importances, args.method, args.n_models, "1")

    print "Training ensemble 2"
    lr2 = LogisticRegressionEnsemble(args.n_models,
                                     args.method,
                                     args.batch_size,
                                     bagging=args.bagging)
    feature_importances = lr2.feature_importances(features.feature_matrix,
                                                  class_labels)
    snp_importances = features.rank_snps(feature_importances)
    write_snps(workdir, snp_importances, args.method, args.n_models, "2")
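The two ensembles are trained independently on the same data so the stability of the resulting SNP rankings can be checked by comparing the two output files. A minimal invocation sketch; the Namespace fields are assumptions inferred from the attributes the function reads, and the shape of sample_populations is a guess (the real values come from the project's CLI and input parsers):

from argparse import Namespace

# Hypothetical argument values, for illustration only
args = Namespace(workdir="work", n_models=100, method="sgd-l2",
                 batch_size=100, bagging=False)
# Assumed shape: mapping of sample label -> population index
sample_populations = {"sample_1": 0, "sample_2": 1}
train(args, sample_populations)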
Example #2
def cluster_samples(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]
    components = map(lambda idx: idx - 1, args.components)
    selected = projected[:, components]

    features = read_features(workdir)

    _, labels, inertia = k_means(selected, args.n_clusters, n_jobs=-2)

    clusters_flname = os.path.join(analysis_dir,
                                   "clusters_%s.tsv" % args.n_clusters)

    clusters = defaultdict(list)
    for name, cluster in zip(features.sample_labels, labels):
        clusters[cluster].append(name)

    with open(clusters_flname, "w") as fl:
        for cluster, samples in clusters.iteritems():
            fl.write(str(cluster))
            fl.write(",")
            fl.write(",".join(samples))
            fl.write("\n")
Example #3
def sweep_clusters(args):
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]
    components = map(lambda idx: idx - 1, args.components)
    selected = projected[:, components]

    features = read_features(workdir)

    inertia_values = []
    for k in args.n_clusters:
        print "Clustering with %s clusters" % k
        _, _, inertia = k_means(selected, k, n_jobs=-2)
        inertia_values.append(inertia)

    plt.plot(args.n_clusters, inertia_values, "k.-")
    plt.xlabel("Number of Clusters", fontsize=16)
    plt.ylabel("Inertia", fontsize=16)

    fig_flname = os.path.join(figures_dir, "cluster_inertia")
    for dim in args.components:
        fig_flname += "_%s" % dim
    fig_flname += ".png"

    plt.savefig(fig_flname, dpi=300)
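The resulting inertia-vs-k curve is the classic elbow plot: inertia always decreases as k grows, and the bend where the decrease levels off is a common choice for the number of clusters. If you prefer to pick the elbow programmatically instead of by eye, one simple heuristic is the largest second difference of the inertia curve (a sketch, not part of this project):

import numpy as np

def elbow_k(ks, inertias):
    # Second difference of inertia; the sharpest bend suggests the elbow.
    # Assumes at least three k values, evenly spaced.
    curvature = np.diff(inertias, 2)
    return ks[int(np.argmax(curvature)) + 1]

# e.g. elbow_k([2, 3, 4, 5, 6], [120.0, 60.0, 45.0, 40.0, 38.0]) -> 3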
Example #4
def output_coordinates(args):
    workdir = args.workdir

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]
    selected = projected[:, map(lambda idx: idx - 1, args.selected_components)]

    features = read_features(workdir)

    with open(args.output_fl, "w") as fl:
        headers = ["sample", "population_index", "population_name"]
        headers.extend(map(str, args.selected_components))
        fl.write("\t".join(headers))
        fl.write("\n")

        for i in xrange(len(features.sample_labels)):
            sample = features.sample_labels[i]
            pop_idx = features.class_labels[i]
            pop_name = project_summary.population_names[pop_idx]
            line = [sample, str(pop_idx), pop_name]
            line.extend(map(str, selected[i, :]))
            fl.write("\t".join(line))
            fl.write("\n")
Example #5
def output_loading_factors(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    data_model = read_features(workdir)

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    pca = model[MODEL_KEY]
    components = pca.components_
    selected = components[map(lambda idx: idx - 1, args.components), :]

    output_fl = os.path.join(analysis_dir, "pca_loading_factors.tsv")
    with open(output_fl, "w") as fl:
        header = ["chromosome", "position", "dummy variable"]
        header.extend(map(str, args.components))
        fl.write("\t".join(header))
        fl.write("\n")
        for snp_label, feature_idx in data_model.snp_feature_map.iteritems():
            chrom, pos = snp_label

            for j, idx in enumerate(feature_idx):
                # loadings of the selected components on this feature
                loadings = selected[:, idx]

                fl.write("%s\t%s\t%s\t" % (chrom, pos, j))
                fl.write("\t".join(map(str, loadings)))
                fl.write("\n")
Example #6
def min_components_explained_variance(args):
    workdir = args.workdir

    features = read_features(workdir)

    n_components = args.init_n_components
    while True:
        print "Computing PCA with %s components" % n_components
        pca = PCA(n_components=n_components, whiten=True)
        pca.fit(features.feature_matrix)
        # explained_variance_ratio_ is already sorted in descending
        # order, but sort defensively before the cumulative sum
        explained_variance_ratios = pca.explained_variance_ratio_
        sorted_ratios = np.sort(explained_variance_ratios)[::-1]
        cum_ratios = np.cumsum(sorted_ratios)
        total_explained_variance = cum_ratios[-1]
        if total_explained_variance >= args.explained_variance_threshold:
            break
        n_components *= 2

    needed_components = 0
    achieved_ev_ratio = 0.0
    for i, ev_ratio in enumerate(cum_ratios):
        if ev_ratio >= args.explained_variance_threshold:
            needed_components = i + 1
            achieved_ev_ratio = ev_ratio
            break

    print "Explained-variance threshold of %s surpassed at %s with %s components" % \
        (args.explained_variance_threshold,
         achieved_ev_ratio,
         needed_components)
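The loop doubles n_components until the fitted PCA captures enough total variance, then the scan finds the smallest component count whose cumulative ratio crosses the threshold. The same search in isolation on synthetic data (a sketch; the real function runs on the project's feature matrix):

import numpy as np
from sklearn.decomposition import PCA

X = np.random.randn(200, 50)  # synthetic stand-in for the feature matrix
threshold = 0.9
n_components = 4
while True:
    pca = PCA(n_components=n_components, whiten=True)
    pca.fit(X)
    cum_ratios = np.cumsum(pca.explained_variance_ratio_)
    if cum_ratios[-1] >= threshold:
        break
    # never ask for more components than the data can support
    n_components = min(n_components * 2, min(X.shape))

needed = int(np.searchsorted(cum_ratios, threshold)) + 1
print "%d components reach %.3f explained variance" % (needed, cum_ratios[needed - 1])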
Example #7
def pca(args):
    workdir = args["workdir"]

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    n_pcs = args["n_pcs"]
    if n_pcs is None:
        print "Number of pcs must be specified for PCA"
        sys.exit(1)
    
    features = read_features(workdir)
    proj, explained_variance_ratios = features.svd(n_pcs)

    plt.clf()
    plt.plot(explained_variance_ratios, "b.-")
    plt.xlabel("PC", fontsize=16)
    plt.ylabel("Explained Variance Ratio", fontsize=16)
    plt.savefig(os.path.join(figures_dir, "pca_explained_variance_ratios.png"), DPI=200)
    
    # note: PCs are numbered from 0 in these labels and filenames
    for i in xrange(n_pcs - 1):
        for j in xrange(i + 1, n_pcs):
            plt.clf()
            plt.scatter(proj[:, i], proj[:, j])
            plt.xlabel("PC " + str(i), fontsize=16)
            plt.ylabel("PC " + str(j), fontsize=16)
            plt.savefig(os.path.join(figures_dir, "pca_%s_%s.png" % (i, j)), DPI=200)
Example #8
def snp_association_tests(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    n_iter = estimate_lr_iter(len(data_model.class_labels))
    # we set the intercept to the class ratios in the lr test function
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    n_pcs = projections.shape[1]  # projections is samples x PCs
    for pc in args.components:
        flname = os.path.join(analysis_dir,
                              "snp_pc_%s_association_tests.tsv" % pc)
        with open(flname, "w") as fl:
            next_output = 1
            for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
                snp_label, feature_idx = pair
                chrom, pos = snp_label

                snp_features = data_model.feature_matrix[:, feature_idx]
                triplet = generate_training_set(snp_features,
                                                projections[:, pc - 1])
                n_copies, class_labels, imputed_projections = triplet

                imputed_projections = imputed_projections.reshape(-1, 1)

                # since we make multiple copies of the original samples,
                # we need to scale the log loss so that it is correct for
                # the original sample size
                try:
                    p_value = likelihood_ratio_test(imputed_projections,
                                                    class_labels,
                                                    lr,
                                                    g_scaling_factor=1.0 /
                                                    n_copies)
                # in case of underflow or overflow in a badly-behaving model
                except ValueError:
                    p_value = 1.0

                if i == next_output:
                    print i, "SNP", snp_label, "and PC", pc, "has p-value", p_value
                    next_output *= 2

                fl.write("\t".join([chrom, pos, str(p_value)]))
                fl.write("\n")
Example #9
def train_model(args, sample_populations):
    workdir = args["workdir"]

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    n_trees = args["trees"]
    if n_trees is None:
        print "Number of trees must be specified for training"
        sys.exit(1)

    n_resamples = args["resamples"]
    if n_resamples is None:
        print "Number of additional samples must be specified for training"
        sys.exit(1)

    features = read_features(workdir)
    class_labels = make_labels(features.sample_labels, sample_populations)

    rf = ConstrainedBaggingRandomForest(n_trees, n_resamples,
                                        args["batch_size"])
    feature_importances, used_feature_counts, used_feature_sets = \
        rf.feature_importances(features.feature_matrix,
                               class_labels,
                               statistics=args["statistics"],
                               interactions=args["interactions"])

    snp_importances = features.rank_snps(feature_importances)
    write_rf_snps(workdir, snp_importances, n_trees, "model1")

    if args["statistics"]:
        dense = histogram_sparse_to_dense(used_feature_counts)
        flname = os.path.join(figures_dir, "features_used_histogram_rf_%s_trees.png" \
                              % args["trees"])
        plot_feature_histogram(flname, dense)

    if args["interactions"]:
        write_interactions(workdir, n_trees, used_feature_sets)

    rf = ConstrainedBaggingRandomForest(n_trees, n_resamples,
                                        args["batch_size"])

    feature_importances, _, _ = rf.feature_importances(features.feature_matrix,
                                                       class_labels,
                                                       statistics=False)

    snp_importances = features.rank_snps(feature_importances)
    write_rf_snps(workdir, snp_importances, n_trees, "model2")
Example #10
def plot_projections(args):
    if len(args.pairs) % 2 != 0:
        print "Error: PCs must be provided in pairs"
        sys.exit(1)

    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projected = model[PROJECTION_KEY]

    features = read_features(workdir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    all_labels = set(features.class_labels)
    labels = np.array(features.class_labels, dtype=np.int32)
    populations = []
    for l in all_labels:
        pop = labels == l
        pop_name = project_summary.population_names[l]
        populations.append((pop, pop_name))

    for p1, p2 in pairwise(args.pairs):
        fig_flname = os.path.join(figures_dir,
                                  "pca_projection_%s_%s.png" % (p1, p2))
        plt.clf()
        plt.grid(True)
        colors = ["m", "c", "k", "r", "g", "b"]
        markers = ["o"] * len(colors) + \
                  ["s"] * len(colors) + \
                  ["+"] * len(colors)
        for idx, (pop_idx, pop_name) in enumerate(populations):
            plt.scatter(projected[pop_idx, p1 - 1],
                        projected[pop_idx, p2 - 1],
                        color=colors[idx % len(colors)],
                        marker=markers[idx % len(markers)],
                        edgecolor="k",
                        alpha=0.7,
                        label=pop_name)
        plt.xlabel("Principal Component %s" % p1, fontsize=16)
        plt.ylabel("Principal Component %s" % p2, fontsize=16)
        if len(all_labels) > 1:
            plt.legend()
        plt.savefig(fig_flname, dpi=300)
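The even-length check at the top implies that pairwise walks the flat PC list two at a time, e.g. [1, 2, 3, 4] -> (1, 2), (3, 4). The helper is project-internal; a common way to write that behavior (an assumption, not the project's actual code):

def pairwise(iterable):
    # [a, b, c, d] -> (a, b), (c, d); assumes an even number of items
    it = iter(iterable)
    return zip(it, it)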
Example #11
def snp_linreg_association_tests(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    n_pcs = projections.shape[1]  # projections is samples x PCs
    for pc in args.components:
        flname = os.path.join(analysis_dir,
                              "snp_pc_%s_linreg_assoc_tests.tsv" % pc)
        with open(flname, "w") as fl:
            next_output = 1
            for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
                snp_label, feature_idx = pair
                chrom, pos = snp_label

                snp_features = data_model.feature_matrix[:, feature_idx]

                results = snp_linreg_pvalues(snp_features,
                                             projections[:, pc - 1])
                snp_p_value, gt_ttest_pvalues, gt_normality_pvalues, gt_pred_ys = results

                if i == next_output:
                    print i, "SNP", snp_label, "and PC", pc, "has p-value", snp_p_value
                    next_output *= 2

                fl.write("\t".join([chrom, pos, str(snp_p_value)]))
                for j in xrange(3):
                    fl.write("\t")
                    fl.write(str(gt_ttest_pvalues[j]))
                    fl.write("\t")
                    fl.write(str(gt_normality_pvalues[j]))
                    fl.write("\t")
                    fl.write(str(gt_pred_ys[j]))
                fl.write("\n")
Example #12
def pop_association_tests(args):
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    project_summary = deserialize(os.path.join(workdir,
                                               PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    n_iter = estimate_lr_iter(len(data_model.sample_labels))
    # we set the intercept to the class ratios in the lr test function
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter * 10,  # n_iter must be an integer
                       fit_intercept=True)

    pvalues_fl = os.path.join(analysis_dir,
                              "population_pca_association_tests.tsv")
    class_labels = np.array(data_model.class_labels)
    with open(pvalues_fl, "w") as fl:
        for i in xrange(projections.shape[1]):
            features = projections[:, i].reshape(-1, 1)

            p_value = likelihood_ratio_test(features,
                                            class_labels,
                                            lr,
                                            set_intercept=False)

            lr.fit(features, class_labels)
            pred_labels = lr.predict(features)
            acc = 100. * accuracy_score(class_labels, pred_labels)

            cm = confusion_matrix(class_labels, pred_labels)

            print i + 1, p_value, acc
            print cm
            print

            fl.write("%s\t%s\t%s\n" % (i + 1, p_value, acc))
Example #13
def train_model(args):
    workdir = args["workdir"]

    n_trees = args["trees"]
    if n_trees is None:
        print "Number of trees must be specified for training"
        sys.exit(1)

    n_resamples = args["resamples"]
    if n_resamples is None:
        print "Number of additional samples must be specified for training"
        sys.exit(1)

    features = read_features(workdir)

    snp_importances1 = features.snp_importances(n_trees, n_resamples).rank()
    snp_importances2 = features.snp_importances(n_trees, n_resamples).rank()

    write_snps(workdir, snp_importances1, "model1")
    write_snps(workdir, snp_importances2, "model2")
Example #14
def train(args):
    workdir = args.workdir

    models_dir = os.path.join(workdir, "models")
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    features = read_features(workdir)

    if args.model_type == "PCA":
        pca = PCA(n_components=args.n_components, whiten=True)
    elif args.model_type == "NMF":
        pca = NMF(n_components=args.n_components)
    else:
        raise Exception("Unknown model type %s" % args.model_type)

    projections = pca.fit_transform(features.feature_matrix)

    model = {MODEL_KEY: pca, PROJECTION_KEY: projections}

    model_fl = os.path.join(models_dir, "pca.pkl")
    joblib.dump(model, model_fl)
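Bundling the fitted model and its sample projections into one dict means downstream commands recover both with a single load. For example (a sketch reusing this module's MODEL_KEY and PROJECTION_KEY constants; "work" is a hypothetical workdir):

import os
import joblib

model = joblib.load(os.path.join("work", "models", "pca.pkl"))
pca = model[MODEL_KEY]               # the fitted PCA or NMF object
projections = model[PROJECTION_KEY]  # samples x components matrix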
Example #15
if __name__ == "__main__":
    args = parseargs()

    if not os.path.exists(args.workdir):
        print "Work directory '%s' does not exist." % args.workdir
        sys.exit(1)

    stats_dir = os.path.join(args.workdir, "statistics")
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    project_summary = deserialize(
        os.path.join(args.workdir, PROJECT_SUMMARY_FLNAME))

    features = read_features(args.workdir)

    if args.mode == "pairwise":
        if project_summary.feature_encoding != "categories":
            print "Pairwise Cramer's V only works with the 'categories' feature encoding."
            sys.exit(1)

        set_one = None
        if args.subset_1:
            with open(args.subset_1) as fl:
                set_one = set()
                for ln in fl:
                    cols = ln.strip().split()
                    set_one.add((cols[0], cols[1]))

        set_two = None
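The pairwise mode (the snippet is truncated above) computes Cramér's V between SNPs in the 'categories' encoding. The statistic itself is standard: a chi-squared statistic on the contingency table of the two variables, normalized by the sample size and the table's smaller dimension. A self-contained sketch:

import numpy as np
from scipy.stats import chi2_contingency

def cramers_v(labels_a, labels_b):
    # Build the contingency table of the two categorical variables
    cats_a = sorted(set(labels_a))
    cats_b = sorted(set(labels_b))
    table = np.zeros((len(cats_a), len(cats_b)))
    for a, b in zip(labels_a, labels_b):
        table[cats_a.index(a), cats_b.index(b)] += 1

    chi2_stat = chi2_contingency(table)[0]
    n = table.sum()
    return np.sqrt(chi2_stat / (n * (min(table.shape) - 1)))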
Example #16
def run_likelihood_ratio_tests(args):
    if not os.path.exists(args.workdir):
        print "Work directory '%s' does not exist." % args.workdir
        sys.exit(1)

    stats_dir = os.path.join(args.workdir, OUTPUT_DIR)
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    project_summary = deserialize(
        os.path.join(args.workdir, PROJECT_SUMMARY_FLNAME))

    data_model = read_features(args.workdir)
    genotypes = data_model.feature_matrix

    n_iter = estimate_lr_iter(len(data_model.class_labels))

    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    testing_variables = np.array(data_model.class_labels).reshape(-1, 1)
    null_variables = None
    if args.variables_fl:
        selected_sample_ids, null_variables = parse_variables_file(
            args.variables_fl)

        selected_indices = select_samples(data_model, selected_sample_ids)

        # select subset and re-order
        genotypes = genotypes[selected_indices, :]
        testing_variables = testing_variables[selected_indices, :]

    N_COPIES = 3
    class_labels = None
    testing_features, null_features = prepare_model_variables(
        N_COPIES, testing_variables, null_variables)

    with open(os.path.join(stats_dir, OUTPUT_FLNAME), "w") as fl:
        next_output = 1
        for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
            pos_label, feature_idx = pair
            chrom, pos = pos_label

            pos_genotypes = genotypes[:, feature_idx]

            class_labels = prepare_class_labels(N_COPIES, pos_genotypes,
                                                class_labels)

            # since we make multiple copies of the original samples,
            # we need to scale the log loss so that it is correct for
            # the original sample size

            p_value = likelihood_ratio_test(testing_features,
                                            class_labels,
                                            lr,
                                            features_null=null_features,
                                            g_scaling_factor=1.0 / N_COPIES)

            if i == next_output:
                print i, "Position", pos_label, "has p-value", p_value
                next_output *= 2

            fl.write("\t".join([chrom, pos, str(p_value)]))
            fl.write("\n")