def cluster_samples(args):
    """Cluster samples on selected PCA components and write membership lists.

    Runs k-means on the chosen (1-based) principal components and writes
    one line per cluster -- the cluster id followed by its sample names --
    under <workdir>/analysis/.
    """
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    # loaded as in the sibling commands; the value itself is not used here
    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projected = model[PROJECTION_KEY]

    # user-facing component ids are 1-based; columns are 0-based
    column_indices = [c - 1 for c in args.components]
    selected = projected[:, column_indices]

    features = read_features(workdir)

    _, labels, inertia = k_means(selected, args.n_clusters, n_jobs=-2)

    out_flname = os.path.join(analysis_dir,
                              "clusters_%s.tsv" % args.n_clusters)

    # group sample names by their assigned cluster id
    clusters = defaultdict(list)
    for sample_name, cluster_id in zip(features.sample_labels, labels):
        clusters[cluster_id].append(sample_name)

    # NOTE(review): the file carries a .tsv extension but fields are
    # comma-separated -- confirm which format downstream consumers expect
    with open(out_flname, "w") as fl:
        for cluster_id, members in clusters.iteritems():
            fl.write(str(cluster_id))
            fl.write(",")
            fl.write(",".join(members))
            fl.write("\n")
def analyze_rankings(args):
    """Write and plot SNP-ranking overlap curves for the given LR method.

    NOTE(review): a second `analyze_rankings` is defined later in this
    file and will shadow this one at import time -- confirm which
    definition is intended to be live.
    """
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    analysis_dir = os.path.join(workdir, "analysis")
    for out_dir in (figures_dir, analysis_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    all_snps = read_snps(workdir, args.method)

    n_models, common_feature_percentages = similarity_curves(
        args.thresholds, all_snps, project_summary.filtered_positions)

    # tabular output
    analysis_flname = os.path.join(
        analysis_dir, "snp_ranking_overlaps_" + args.method + ".tsv")
    write_similarity_curves(analysis_flname, args.thresholds, n_models,
                            common_feature_percentages)

    # figure output (extension added by the plotting helper)
    flname_base = os.path.join(figures_dir,
                               "snp_ranking_overlaps_" + args.method)
    plot_similarity_curves(flname_base, args.thresholds, n_models,
                           common_feature_percentages)
def analyze_rankings(args):
    """Random-forest variant of the ranking-overlap analysis.

    Takes `args` as a dict (unlike the attribute-style variant defined
    earlier in this file, which this definition shadows -- confirm which
    is intended). Also plots sampled-SNP count curves.
    """
    workdir = args["workdir"]

    figures_dir = os.path.join(workdir, "figures")
    analysis_dir = os.path.join(workdir, "analysis")
    for out_dir in (figures_dir, analysis_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    all_snps = read_rf_snps(workdir)

    thresholds = args["thresholds"]
    n_models, common_feature_percentages = similarity_curves(
        thresholds, all_snps, project_summary.filtered_positions)

    # tabular overlap output
    write_similarity_curves(
        os.path.join(analysis_dir, "snp_ranking_overlaps_rf.tsv"),
        thresholds, n_models, common_feature_percentages)

    # overlap figure
    plot_similarity_curves(
        os.path.join(figures_dir, "snp_ranking_overlaps_rf"),
        thresholds, n_models, common_feature_percentages)

    # sampled-SNP count curves
    n_models, common_feature_counts, snp1_feature_counts, \
        snp2_feature_counts = sampled_snps_curves(all_snps)
    plot_sampled_snps_curves(
        os.path.join(figures_dir, "snp_counts"),
        n_models, common_feature_counts,
        snp1_feature_counts, snp2_feature_counts)
def output_coordinates(args):
    """Write per-sample PCA coordinates for selected components to a TSV.

    Each row is: sample name, population index, population name, then one
    column per selected (1-based) principal component.
    """
    workdir = args.workdir

    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projected = model[PROJECTION_KEY]

    # user-facing component ids are 1-based; columns are 0-based
    selected = projected[:, [c - 1 for c in args.selected_components]]

    features = read_features(workdir)

    with open(args.output_fl, "w") as fl:
        header = ["sample", "population_index", "population_name"]
        header.extend(str(c) for c in args.selected_components)
        fl.write("\t".join(header))
        fl.write("\n")

        for i, sample in enumerate(features.sample_labels):
            pop_idx = features.class_labels[i]
            pop_name = project_summary.population_names[pop_idx]
            row = [sample, str(pop_idx), pop_name]
            row.extend(str(v) for v in selected[i, :])
            fl.write("\t".join(row))
            fl.write("\n")
def sweep_clusters(args): workdir = args.workdir figures_dir = os.path.join(workdir, "figures") if not os.path.exists(figures_dir): os.makedirs(figures_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projected = model[PROJECTION_KEY] components = map(lambda idx: idx - 1, args.components) selected = projected[:, components] features = read_features(workdir) inertia_values = [] for k in args.n_clusters: print "Clustering with %s clusters" % k _, _, inertia = k_means(selected, k, n_jobs=-2) inertia_values.append(inertia) plt.plot(args.n_clusters, inertia_values, "k.-") plt.xlabel("Number of Clusters", fontsize=16) plt.ylabel("Inertia", fontsize=16) fig_flname = os.path.join(figures_dir, "cluster_inertia") for dim in args.components: fig_flname += "_%s" % dim fig_flname += ".png" plt.savefig(fig_flname, DPI=300)
def snp_association_tests(args): workdir = args.workdir analysis_dir = os.path.join(workdir, "analysis") if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projections = model[PROJECTION_KEY] data_model = read_features(workdir) n_iter = estimate_lr_iter(len(data_model.class_labels)) # we set the intercept to the class ratios in the lr test function lr = SGDClassifier(penalty="l2", loss="log", n_iter=n_iter, fit_intercept=False) n_pcs = projections.shape[0] for pc in args.components: flname = os.path.join(analysis_dir, "snp_pc_%s_association_tests.tsv" % pc) with open(flname, "w") as fl: next_output = 1 for i, pair in enumerate(data_model.snp_feature_map.iteritems()): snp_label, feature_idx = pair chrom, pos = snp_label snp_features = data_model.feature_matrix[:, feature_idx] triplet = generate_training_set(snp_features, projections[:, pc - 1]) n_copies, class_labels, imputed_projections = triplet imputed_projections = imputed_projections.reshape(-1, 1) # since we make multiple copies of the original samples, # we need to scale the log loss so that it is correct for # the original sample size try: p_value = likelihood_ratio_test(imputed_projections, class_labels, lr, g_scaling_factor=1.0 / n_copies) # in case of underflow or overflow in a badly-behaving model except ValueError: p_value = 1.0 if i == next_output: print i, "SNP", snp_label, "and PC", pc, "has p-value", p_value next_output *= 2 fl.write("\t".join([chrom, pos, str(p_value)])) fl.write("\n")
def plot_projections(args): if len(args.pairs) % 2 != 0: print "Error: PCs must be provided in pairs of 2" sys.exit(1) workdir = args.workdir figures_dir = os.path.join(workdir, "figures") if not os.path.exists(figures_dir): os.makedirs(figures_dir) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projected = model[PROJECTION_KEY] features = read_features(workdir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) all_labels = set(features.class_labels) labels = np.array(features.class_labels, dtype=np.int32) populations = [] for l in all_labels: pop = labels == l pop_name = project_summary.population_names[l] populations.append((pop, pop_name)) for p1, p2 in pairwise(args.pairs): fig_flname = os.path.join( figures_dir, "pca_projection_%s_%s.png" % (str(p1), str(p2))) plt.clf() plt.grid(True) colors = ["m", "c", "k", "r", "g", "b"] markers = ["o"] * len(colors) + \ ["s"] * len(colors) + \ ["+"] * len(colors) for idx, (pop_idx, pop_name) in enumerate(populations): plt.scatter(projected[pop_idx, p1 - 1], projected[pop_idx, p2 - 1], color=colors[idx % len(colors)], marker=markers[idx % len(markers)], edgecolor="k", alpha=0.7, label=pop_name) plt.xlabel("Principal Component %s" % p1, fontsize=16) plt.ylabel("Principal Component %s" % p2, fontsize=16) if len(all_labels) > 1: plt.legend() plt.savefig(fig_flname, DPI=300)
def snp_linreg_association_tests(args): workdir = args.workdir analysis_dir = os.path.join(workdir, "analysis") if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projections = model[PROJECTION_KEY] data_model = read_features(workdir) n_pcs = projections.shape[0] for pc in args.components: flname = os.path.join(analysis_dir, "snp_pc_%s_linreg_assoc_tests.tsv" % pc) with open(flname, "w") as fl: next_output = 1 for i, pair in enumerate(data_model.snp_feature_map.iteritems()): snp_label, feature_idx = pair chrom, pos = snp_label snp_features = data_model.feature_matrix[:, feature_idx] # since we make multiple copies of the original samples, # we need to scale the log loss so that it is correct for # the original sample size triplet = snp_linreg_pvalues(snp_features, projections[:, pc - 1]) snp_p_value, gt_ttest_pvalues, gt_normality_pvalues, gt_pred_ys = triplet if i == next_output: print i, "SNP", snp_label, "and PC", pc, "has p-value", snp_p_value next_output *= 2 fl.write("\t".join([chrom, pos, str(snp_p_value)])) for j in xrange(3): fl.write("\t") fl.write(str(gt_ttest_pvalues[j])) fl.write("\t") fl.write(str(gt_normality_pvalues[j])) fl.write("\t") fl.write(str(gt_pred_ys[j])) fl.write("\n")
def read_snps(basedir, method):
    """Load serialized SNP rankings for one LR method, keyed by model count.

    Scans <basedir>/models/lr-<method>/<n_models>/* and returns a mapping
    from the integer n_models (taken from the subdirectory name) to the
    list of deserialized ranking objects found inside it. Returns an
    empty dict when the method directory does not exist.
    """
    method_dir = os.path.join(basedir, "models", "lr-" + method)
    if not os.path.exists(method_dir):
        return dict()

    models = defaultdict(list)
    for run_dir in glob.glob(os.path.join(method_dir, "*")):
        for flname in glob.glob(os.path.join(run_dir, "*")):
            snps = deserialize(flname)
            # parent directory name encodes the number of models
            n_models = int(os.path.basename(os.path.dirname(flname)))
            models[n_models].append(snps)
    return models
def pop_association_tests(args): workdir = args.workdir analysis_dir = os.path.join(workdir, "analysis") if not os.path.exists(analysis_dir): os.makedirs(analysis_dir) project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME)) model_fl = os.path.join(workdir, "models", "pca.pkl") model = joblib.load(model_fl) projections = model[PROJECTION_KEY] data_model = read_features(workdir) n_iter = estimate_lr_iter(len(data_model.sample_labels)) # we set the intercept to the class ratios in the lr test function lr = SGDClassifier(penalty="l2", loss="log", n_iter=n_iter * 10., fit_intercept=True) pvalues_fl = os.path.join(analysis_dir, "population_pca_association_tests.tsv") class_labels = np.array(data_model.class_labels) with open(pvalues_fl, "w") as fl: for i in xrange(projections.shape[1]): features = projections[:, i].reshape(-1, 1) p_value = likelihood_ratio_test(features, class_labels, lr, set_intercept=False) lr.fit(features, class_labels) pred_labels = lr.predict(features) acc = 100. * accuracy_score(class_labels, pred_labels) cm = confusion_matrix(class_labels, pred_labels) print(i + 1), p_value, acc print cm print fl.write("%s\t%s\t%s\n" % (i + 1, p_value, acc))
def analyze_weights(args):
    """Plot histograms of scaled PCA component weights.

    For each scale factor w in args.weights (paired positionally with
    component i), plots the distribution of w * pca.components_[i, :]
    and writes pca_feature_weights_pc<i+1>.png under <workdir>/figures/.
    """
    workdir = args.workdir

    figures_dir = os.path.join(workdir, "figures")
    if not os.path.exists(figures_dir):
        os.makedirs(figures_dir)

    # loaded as in the sibling commands; the value itself is not used here
    project_summary = deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model_fl = os.path.join(workdir, "models", "pca.pkl")
    model = joblib.load(model_fl)
    pca = model[MODEL_KEY]

    for i, w in enumerate(args.weights):
        plt.clf()
        seaborn.distplot(w * pca.components_[i, :], kde=False)
        plt.xlabel("Feature Weights", fontsize=16)
        plt.ylabel("Count(Features)", fontsize=16)
        plot_fl = os.path.join(figures_dir,
                               "pca_feature_weights_pc%s.png" % (i + 1))
        # BUG FIX: savefig's keyword is lowercase `dpi`; the previous
        # `DPI=300` was an unknown keyword and the setting was ignored
        plt.savefig(plot_fl, dpi=300)
return parser.parse_args() if __name__ == "__main__": args = parseargs() if not os.path.exists(args.workdir): print "Work directory '%s' does not exist." % args.workdir sys.exit(1) stats_dir = os.path.join(args.workdir, "statistics") if not os.path.exists(stats_dir): os.makedirs(stats_dir) project_summary = deserialize( os.path.join(args.workdir, PROJECT_SUMMARY_FLNAME)) features = read_features(args.workdir) if args.mode == "pairwise": if project_summary.feature_encoding != "categories": print "Pairwise Cramer's V only works with the 'categories' feature encoding." sys.exit(1) set_one = None if args.subset_1: with open(args.subset_1) as fl: set_one = set() for ln in fl: cols = ln.strip().split() set_one.add((cols[0], cols[1]))
def run_likelihood_ratio_tests(args):
    """Run genotype-vs-label likelihood-ratio tests for every position.

    Tests each position's genotypes for association with the class labels
    (optionally restricted to samples listed in args.variables_fl, with
    extra null-model covariates), writing one (chrom, pos, p-value) row
    per position to <workdir>/<OUTPUT_DIR>/<OUTPUT_FLNAME>.
    """
    if not os.path.exists(args.workdir):
        print "Work directory '%s' does not exist." % args.workdir
        sys.exit(1)

    stats_dir = os.path.join(args.workdir, OUTPUT_DIR)
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    # project_summary is loaded but not read below; presumably kept for
    # its validation side effect -- TODO confirm
    project_summary = deserialize(
        os.path.join(args.workdir, PROJECT_SUMMARY_FLNAME))

    data_model = read_features(args.workdir)
    genotypes = data_model.feature_matrix
    n_iter = estimate_lr_iter(len(data_model.class_labels))
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    # class labels become the tested explanatory variable (one column)
    testing_variables = np.array(data_model.class_labels).reshape(-1, 1)

    null_variables = None
    if args.variables_fl:
        # optional covariates for the null model, plus a sample subset
        selected_sample_ids, null_variables = parse_variables_file(
            args.variables_fl)
        selected_indices = select_samples(data_model,
                                          selected_sample_ids)
        # select subset and re-order
        genotypes = genotypes[selected_indices, :]
        testing_variables = testing_variables[selected_indices, :]

    # each sample is replicated N_COPIES times by the prepare_* helpers
    N_COPIES = 3
    # carried across loop iterations so prepare_class_labels can reuse
    # its buffer -- do not reset inside the loop
    class_labels = None
    testing_features, null_features = prepare_model_variables(
        N_COPIES, testing_variables, null_variables)

    with open(os.path.join(stats_dir, OUTPUT_FLNAME), "w") as fl:
        # print progress at exponentially spaced iteration counts
        next_output = 1
        for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
            pos_label, feature_idx = pair
            chrom, pos = pos_label
            pos_genotypes = genotypes[:, feature_idx]

            class_labels = prepare_class_labels(N_COPIES,
                                                pos_genotypes,
                                                class_labels)

            # since we make multiple copies of the original samples,
            # we need to scale the log loss so that it is correct for
            # the original sample size
            p_value = likelihood_ratio_test(testing_features,
                                            class_labels,
                                            lr,
                                            features_null=null_features,
                                            g_scaling_factor=1.0 / N_COPIES)

            if i == next_output:
                print i, "Position", pos_label, "has p-value", p_value
                next_output *= 2

            fl.write("\t".join([chrom, pos, str(p_value)]))
            fl.write("\n")