Esempio n. 1
0
def get_variants_in_samples(gemini_db, samples, annotations, min_allele_freq, min_alt_depth, min_depth, max_aaf_all, somatic=False):
    """
    Collect the variants of every sample into a single dataframe.

    For each name in `samples`, variants are fetched from `gemini_db` with
    gem_ops.get_somatic_vars_in_sample (when `somatic` is true) or
    gem_ops.get_vars_in_sample (otherwise). `annotations` and the
    min_allele_freq / min_alt_depth / min_depth / max_aaf_all values are
    forwarded unchanged to that function.

    Progress is reported on stderr: first the list of samples, then one line
    per sample with its name, the length of its result and the elapsed time.

    The per-sample results are concatenated, passed through
    gem_ops.convert_cols and given a fresh 0-based index. The "sample" column
    is then split with split_id into "id", "plate", "tissue" and "replicate"
    columns, inserted at column positions 1 through 4.
    """
    fetch_sample_vars = gem_ops.get_somatic_vars_in_sample if somatic else gem_ops.get_vars_in_sample

    print >> sys.stderr, "Samples to process: ", ", ".join(samples)

    # The filter thresholds are the same for every sample, so build them once.
    thresholds = dict(min_allele_freq=min_allele_freq, min_alt_depth=min_alt_depth,
                      min_depth=min_depth, max_aaf_all=max_aaf_all)

    per_sample = []
    for name in samples:
        began = time.time()
        found = fetch_sample_vars(gemini_db, annotations, name, **thresholds)
        elapsed = time.time() - began
        per_sample.append(found)
        print >> sys.stderr, name, len(found), elapsed

    # Merge everything; a fresh 0..n-1 index lets the metadata columns below
    # line up with the variant rows by position.
    combined = gem_ops.convert_cols(pd.concat(per_sample))
    combined = combined.reset_index(drop=True)

    # Derive id, plate, tissue and replicate from the sample column and place
    # them right after the first column.
    attr_names = ["id", "plate", "tissue", "replicate"]
    attr_rows = list(combined["sample"].apply(split_id))
    sample_attrs = pd.DataFrame(attr_rows, columns=attr_names)
    for position, attr in enumerate(attr_names, 1):
        combined.insert(position, attr, sample_attrs[attr])

    return combined
Esempio n. 2
0
        # Write results to file.
        if sample_pattern == ".*":
            sample_pattern = "all"
        out_filename = "find_vars_results_%s_minaf%.2f_ad%i_d%i.txt" % (sample_pattern, min_allele_freq, min_alt_depth, min_depth)
        out_file = open(out_filename, "w")
        out_file.write( all_vars_df.to_csv(sep="\t", index=False, float_format='%.3f') )
        out_file.close()

        print "Wrote results to file %s" % out_filename

    elif operation == "augment_vars":
        # Augment variants with updated and joint information.

        # Read results into dataframe.
        results_df = gem_ops.convert_cols( pd.read_csv(results_file, sep="\t") )
        results_df = filter_and_augment_variants(results_df, min_allele_freq, min_alt_depth, min_depth, max_num_het, tissue, add_joint_cols)

        # Print augmented results.
        augmented_out_file = open("augmented_" + results_file, "w")
        augmented_out_file.write(results_df.to_csv(sep="\t", index=False))
        augmented_out_file.close()

        print "Wrote augmented variants to file %s" % augmented_out_file

        # Print joint variants.
        if add_joint:
            joint_out_file = open("joint_" + results_file, "w")
            print_joint_variants(results_df, joint_out_file, all_cols=True)
            joint_out_file.close()