Esempio n. 1
0
def snp_association_tests(args):
    """Run a likelihood-ratio test of every SNP against selected PCs.

    For each component listed in ``args.components`` (a 1-based column of
    the stored PCA projections) one file,
    ``<workdir>/analysis/snp_pc_<pc>_association_tests.tsv``, is written with
    a tab-separated ``chrom``, ``pos``, ``p_value`` row per SNP and no header
    line.  Progress is printed for SNP indices 1, 2, 4, 8, ...

    Args:
        args: Namespace providing ``workdir`` and ``components``.

    Raises:
        IndexError: If a requested component is outside ``1..n_pcs``, where
            ``n_pcs`` is the number of columns of the projection matrix.
    """
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    # NOTE(review): the project summary is never read below; the load is
    # kept only so this step still runs as before -- confirm it can go.
    deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    n_iter = estimate_lr_iter(len(data_model.class_labels))
    # no intercept is fit here; per the original author's note it is set to
    # the class ratios inside the lr test function
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    # components are the columns of the projection matrix (see the
    # projections[:, pc - 1] lookup below)
    n_pcs = projections.shape[1]
    for pc in args.components:
        # Without this check, 0 or a negative component would silently wrap
        # around to a column counted from the end of the matrix.  Checking
        # before open() also avoids leaving a truncated output file behind.
        if not 1 <= pc <= n_pcs:
            raise IndexError("Component %s is out of range; expected a "
                             "value between 1 and %s." % (pc, n_pcs))

        flname = os.path.join(analysis_dir,
                              "snp_pc_%s_association_tests.tsv" % pc)
        with open(flname, "w") as fl:
            # progress is reported at exponentially spaced SNP indices
            next_output = 1
            for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
                snp_label, feature_idx = pair
                chrom, pos = snp_label

                snp_features = data_model.feature_matrix[:, feature_idx]
                triplet = generate_training_set(snp_features,
                                                projections[:, pc - 1])
                n_copies, class_labels, imputed_projections = triplet

                # the test is given a 2D (n_rows, 1) feature array
                imputed_projections = imputed_projections.reshape(-1, 1)

                # since we make multiple copies of the original samples,
                # we need to scale the log loss so that it is correct for
                # the original sample size
                try:
                    p_value = likelihood_ratio_test(imputed_projections,
                                                    class_labels,
                                                    lr,
                                                    g_scaling_factor=1.0 /
                                                    n_copies)
                # in case of underflow or overflow in a badly-behaving model
                except ValueError:
                    p_value = 1.0

                if i == next_output:
                    # single-argument print(...) emits the same text as the
                    # comma-separated print statement and parses everywhere
                    print(" ".join(str(part) for part in
                                   (i, "SNP", snp_label, "and PC", pc,
                                    "has p-value", p_value)))
                    next_output *= 2

                fl.write("\t".join([chrom, pos, str(p_value)]) + "\n")
Esempio n. 2
0
def run_lrtest_gt_dep(data_model, project_summary, args, stats_dir,
                      class_labels):
    """Likelihood-ratio test per SNP with the genotype as dependent variable.

    For every SNP the one-hot encoded population labels are used as the
    features and the SNP's genotype (argmax over its feature columns) as the
    label.  Results are written to ``<stats_dir>/snp_lrtests_gt.tsv`` with a
    ``chrom``, ``pos``, ``snp_p_value`` header; p-values are formatted with
    ``%.2E``.  SNPs with a single observed genotype get a p-value of 1.0
    without running the test.

    Args:
        data_model: Object providing ``feature_matrix`` and
            ``snp_feature_map``.
        project_summary: Unused; kept for interface compatibility.
        args: Unused; kept for interface compatibility.
        stats_dir: Directory the output file is written to.
        class_labels: Population label per sample.
    """
    # NOTE(review): presumably the number of copies upsample_features makes
    # of each sample -- confirm against that helper.
    N_COPIES = 3

    genotypes = data_model.feature_matrix

    n_iter = estimate_lr_iter(len(class_labels))

    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=False)

    encoder = OneHotEncoder(sparse=False)

    flname = "snp_lrtests_gt.tsv"

    with open(os.path.join(stats_dir, flname), "w") as fl:
        # progress is reported at exponentially spaced SNP indices
        next_output = 1

        headers = ["chrom", "pos", "snp_p_value"]
        fl.write("\t".join(headers) + "\n")

        for i, pair in enumerate(data_model.snp_feature_map.iteritems()):
            pos_label, feature_idx = pair
            chrom, pos = pos_label

            pops, snp_genotypes = upsample_features(class_labels,
                                                    genotypes[:, feature_idx])

            # as we're using the genotypes as the labels,
            # they need to be one dimensional
            snp_genotypes = snp_genotypes.argmax(axis=1)

            # a SNP with a single genotype cannot be tested; report 1.0
            p_value = 1.0
            if len(set(snp_genotypes)) > 1:
                # likewise, the pops need to be 2D and one-hot encoded
                pops = pops.reshape(-1, 1)
                pops = encoder.fit_transform(pops)

                # since we make multiple copies of the original samples,
                # we need to scale the log loss so that it is correct for
                # the original sample size
                p_value = likelihood_ratio_test(pops,
                                                snp_genotypes,
                                                lr,
                                                g_scaling_factor=1.0 /
                                                N_COPIES)

            if i == next_output:
                # single-argument print(...) emits the same text as the
                # comma-separated print statement and parses everywhere
                print(" ".join(str(part) for part in
                               (i, "Position", pos_label,
                                "has p-value", p_value)))
                next_output *= 2

            fl.write("\t".join([chrom, pos, "%.2E" % p_value]) + "\n")
Esempio n. 3
0
def run_likelihood_ratio_tests(features, project_summary, args, stats_dir):
    """Run a two-population likelihood-ratio test for every SNP.

    Writes one tab-separated ``chrom``, ``pos``, ``p_value`` row per SNP
    (no header) to ``<stats_dir>/snp_likelihood_ratio_tests.tsv`` and prints
    progress for SNP indices 1, 2, 4, 8, ...

    Args:
        features: Object providing ``class_labels``, ``feature_matrix`` and
            ``snp_feature_map``.
        project_summary: Provides ``feature_encoding`` for the adjusted
            training set.
        args: Namespace providing ``intercept`` and ``training_set``.
        stats_dir: Directory the output file is written to.

    Raises:
        ValueError: If the class labels do not contain exactly two
            distinct populations.
    """
    if len(set(features.class_labels)) != 2:
        raise ValueError("LR Test currently only supports 2 populations.")

    n_iter = estimate_lr_iter(len(features.class_labels))

    # the intercept mode does not change between SNPs, so decide it once
    intercept_mode = args.intercept
    fit_intercept = intercept_mode == "free-parameter"
    set_intercept_to_class_prob = intercept_mode == "class-probabilities"

    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=fit_intercept)

    out_path = os.path.join(stats_dir, "snp_likelihood_ratio_tests.tsv")
    with open(out_path, "w") as fl:
        # progress is reported at exponentially spaced SNP indices
        next_output = 1
        snp_items = features.snp_feature_map.iteritems()
        for i, (snp_label, feature_idx) in enumerate(snp_items):
            chrom, pos = snp_label

            labels = np.array(features.class_labels)
            snp_features = features.feature_matrix[:, feature_idx]

            if args.training_set == "adjusted":
                training_labels, training_features = generate_training_set(
                    project_summary.feature_encoding, labels, snp_features)
            else:
                training_labels, training_features = labels, snp_features

            # the test receives (training, original) pairs for both the
            # features and the labels
            p_value = likelihood_ratio_test(
                (training_features, snp_features),
                (training_labels, labels),
                lr,
                set_intercept=set_intercept_to_class_prob)

            if i == next_output:
                print(" ".join(str(part) for part in
                               (i, "SNP", snp_label, "has p-value", p_value)))
                next_output *= 2

            fl.write("\t".join([chrom, pos, str(p_value)]) + "\n")
Esempio n. 4
0
def pop_association_tests(args):
    """Test each principal component for association with the populations.

    For every column of the stored PCA projections, a likelihood-ratio test
    is run with that column as the only feature and the class labels as the
    target.  The same classifier is then fit on the column to report its
    training accuracy (in percent) and confusion matrix.  One
    ``component``, ``p_value``, ``accuracy`` row per component (1-based, no
    header) is written to
    ``<workdir>/analysis/population_pca_association_tests.tsv``; the same
    values and the confusion matrix are printed.

    Args:
        args: Namespace providing ``workdir``.
    """
    workdir = args.workdir

    analysis_dir = os.path.join(workdir, "analysis")
    if not os.path.exists(analysis_dir):
        os.makedirs(analysis_dir)

    # NOTE(review): the project summary is never read below; the load is
    # kept only so this step still runs as before -- confirm it can go.
    deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    model = joblib.load(os.path.join(workdir, "models", "pca.pkl"))
    projections = model[PROJECTION_KEY]

    data_model = read_features(workdir)

    # The iteration count must be an integer; multiplying by the float
    # literal ``10.`` used to hand the classifier a float.
    n_iter = int(estimate_lr_iter(len(data_model.sample_labels)) * 10)
    # here the model fits its own intercept (fit_intercept=True) and the
    # test below is called with set_intercept=False
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=True)

    pvalues_fl = os.path.join(analysis_dir,
                              "population_pca_association_tests.tsv")
    class_labels = np.array(data_model.class_labels)
    with open(pvalues_fl, "w") as fl:
        for i in xrange(projections.shape[1]):
            # the classifier needs a 2D (n_samples, 1) feature array
            features = projections[:, i].reshape(-1, 1)

            p_value = likelihood_ratio_test(features,
                                            class_labels,
                                            lr,
                                            set_intercept=False)

            # refit on the same data to report training accuracy
            lr.fit(features, class_labels)
            pred_labels = lr.predict(features)
            acc = 100. * accuracy_score(class_labels, pred_labels)

            cm = confusion_matrix(class_labels, pred_labels)

            # single-argument print(...) emits the same text as the
            # original print statements and parses everywhere
            print(" ".join(str(part) for part in (i + 1, p_value, acc)))
            print(cm)
            print("")

            fl.write("%s\t%s\t%s\n" % (i + 1, p_value, acc))
Esempio n. 5
0
def run_likelihood_ratio_tests(args):
    """Run a likelihood-ratio test per SNP with optional null variables.

    The class labels are the testing variables.  When ``args.variables_fl``
    is given, the samples listed there are selected (and re-ordered) and the
    parsed variables are passed on as the null model's variables.  One
    tab-separated ``chrom``, ``pos``, ``p_value`` row per SNP (no header) is
    written to ``OUTPUT_FLNAME`` inside ``<workdir>/OUTPUT_DIR``.

    Exits the process with status 1 if the work directory does not exist.

    Args:
        args: Namespace providing ``workdir`` and ``variables_fl``.
    """
    # NOTE(review): presumably the number of copies made of each sample by
    # the prepare_* helpers -- confirm against them.
    N_COPIES = 3

    workdir = args.workdir
    if not os.path.exists(workdir):
        print("Work directory '%s' does not exist." % workdir)
        sys.exit(1)

    stats_dir = os.path.join(workdir, OUTPUT_DIR)
    if not os.path.exists(stats_dir):
        os.makedirs(stats_dir)

    # NOTE(review): the project summary is never read below; the load is
    # kept only so this step still runs as before.
    deserialize(os.path.join(workdir, PROJECT_SUMMARY_FLNAME))

    data_model = read_features(workdir)
    genotypes = data_model.feature_matrix

    # NOTE(review): the iteration count is sized from all class labels,
    # even when only a subset of samples is selected below -- confirm.
    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=estimate_lr_iter(len(data_model.class_labels)),
                       fit_intercept=False)

    testing_variables = np.array(data_model.class_labels).reshape(-1, 1)
    null_variables = None
    if args.variables_fl:
        parsed = parse_variables_file(args.variables_fl)
        selected_sample_ids, null_variables = parsed

        # select subset and re-order
        row_idx = select_samples(data_model, selected_sample_ids)
        genotypes = genotypes[row_idx, :]
        testing_variables = testing_variables[row_idx, :]

    testing_features, null_features = prepare_model_variables(
        N_COPIES, testing_variables, null_variables)

    # starts as None and is handed back to prepare_class_labels each time
    class_labels = None
    out_path = os.path.join(stats_dir, OUTPUT_FLNAME)
    with open(out_path, "w") as fl:
        # progress is reported at exponentially spaced SNP indices
        next_output = 1
        snp_items = data_model.snp_feature_map.iteritems()
        for i, (pos_label, feature_idx) in enumerate(snp_items):
            chrom, pos = pos_label

            class_labels = prepare_class_labels(N_COPIES,
                                                genotypes[:, feature_idx],
                                                class_labels)

            # since we make multiple copies of the original samples,
            # we need to scale the log loss so that it is correct for
            # the original sample size
            p_value = likelihood_ratio_test(testing_features,
                                            class_labels,
                                            lr,
                                            features_null=null_features,
                                            g_scaling_factor=1.0 / N_COPIES)

            if i == next_output:
                print(" ".join(str(part) for part in
                               (i, "Position", pos_label,
                                "has p-value", p_value)))
                next_output *= 2

            fl.write("\t".join([chrom, pos, str(p_value)]) + "\n")
Esempio n. 6
0
def run_lrtest_pop_dep(features, project_summary, args, stats_dir,
                       class_labels):
    """Likelihood-ratio test per SNP with the population as dependent variable.

    Writes one tab-separated ``chrom``, ``pos``, ``p_value`` row per SNP
    (no header, p-values formatted with ``%.2E``) to
    ``<stats_dir>/snp_lrtests_pop.tsv`` and prints progress for SNP indices
    1, 2, 4, 8, ...

    Args:
        features: Object providing ``feature_matrix`` and
            ``snp_feature_map``.
        project_summary: Unused; kept for interface compatibility.
        args: Namespace providing ``intercept``, ``adjustment`` and
            ``remove_empty_columns``.
        stats_dir: Directory the output file is written to.
        class_labels: Population label per sample.
    """
    n_iter = estimate_lr_iter(len(class_labels))
    class_labels = np.array(class_labels)

    # the intercept mode does not change between SNPs, so decide it once
    intercept_mode = args.intercept
    set_intercept_to_class_prob = intercept_mode == "class-probabilities"

    lr = SGDClassifier(penalty="l2",
                       loss="log",
                       n_iter=n_iter,
                       fit_intercept=(intercept_mode == "free-parameter"))

    out_path = os.path.join(stats_dir, "snp_lrtests_pop.tsv")
    with open(out_path, "w") as fl:
        # progress is reported at exponentially spaced SNP indices
        next_output = 1
        snp_items = features.snp_feature_map.iteritems()
        for i, (snp_label, feature_idx) in enumerate(snp_items):
            chrom, pos = snp_label

            model_features = features.feature_matrix[:, feature_idx]

            if args.adjustment == "none":
                model_labels = class_labels
            else:
                model_labels, model_features = upsample_features(
                    class_labels, model_features)

            # remove columns that are all zeros since these
            # aren't true degrees of freedom.  prevents
            # under-estimating significance
            if args.remove_empty_columns:
                empty_columns = np.all(model_features == 0., axis=0)
                model_features = model_features[:, ~empty_columns]

            scaling_factor = 1.0
            if args.adjustment == "training-set":
                # pair the adjusted data with the original data.
                # NOTE(review): the original features in this pair are not
                # column-masked like the adjusted ones -- confirm intended.
                model_features = (model_features,
                                  features.feature_matrix[:, feature_idx])
                model_labels = (model_labels, class_labels)
            elif args.adjustment == "scaling-factor":
                # NOTE(review): presumably 1 / (copies made per sample by
                # upsample_features) -- confirm.
                scaling_factor = 1.0 / 3.0

            p_value = likelihood_ratio_test(
                model_features,
                model_labels,
                lr,
                set_intercept=set_intercept_to_class_prob,
                g_scaling_factor=scaling_factor)

            if i == next_output:
                print(" ".join(str(part) for part in
                               (i, "SNP", snp_label, "has p-value", p_value)))
                next_output *= 2

            fl.write("\t".join([chrom, pos, "%.2E" % p_value]) + "\n")