from conifertools import CallTable
import pandas as pd

def merge_calls_across_SD(calls, pipeline, max_distance_to_merge=50, min_sd_percent=0.5):
    out_calls = []
    # per-chromosome probe tables (chromosomes 1-23) read from the pipeline's HDF5 file
    chrom_probes = {chrom: pd.DataFrame(pipeline.r.h5file.root.probes._f_getChild("probes_chr%d" % chrom).read()) for chrom in range(1,24)}
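    # Process calls per sample and per chromosome: walk the calls in start order and
    # merge adjacent same-state calls whose gap is short and mostly covered by
    # segmental duplications. merge_calls() is assumed to be defined elsewhere in the module.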
    for sampleID in set(calls.calls["sampleID"]):
        print sampleID
        sample_calls = CallTable(calls.calls[calls.calls["sampleID"] == sampleID])
        for chrom in set(sample_calls.calls.chromosome):
            chr_probes = chrom_probes[chrom]
            calls_to_merge = sample_calls.filter(lambda x: x["chromosome"] == chrom).calls.sort("start")
            if len(calls_to_merge) <= 1:
                # only one call on this chromosome; nothing to merge
                out_calls.append(calls_to_merge.ix[calls_to_merge.index[0]])
            else:
                # for each call, first check it is within the max_distance_to_merge
                # then check if the SD content between them is more than the min_sd_percent
                first_call = calls_to_merge.ix[calls_to_merge.index[0]]
                # start iterating on the second call
                for ix, second_call in calls_to_merge.ix[calls_to_merge.index[1:]].iterrows():
                    delta = second_call["start_exon"] - first_call["stop_exon"]
                    if (second_call["state"] == first_call["state"]) and (delta < max_distance_to_merge):
                        gap_sd_count = chr_probes.ix[xrange(first_call["stop_exon"],second_call["start_exon"])].isSegDup.sum()
                        gap_sd_percent = float(gap_sd_count)/delta
                        if gap_sd_percent >= min_sd_percent:
                            merged = merge_calls(first_call, second_call)
                            first_call = merged.copy()
                        else:
                            out_calls.append(first_call)
                            first_call = second_call
                    else:
                        # different state or too far apart; do not merge
                        out_calls.append(first_call)
                        first_call = second_call
                # emit the final (possibly merged) call for this chromosome
                out_calls.append(first_call)
    out_calls = pd.DataFrame(out_calls)
    return CallTable(out_calls)
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", "-i", action="store", required=True)
    parser.add_argument("--cohort", action="store", required=True)
    parser.add_argument("--esp_infile", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    parser.add_argument("--gamma", type=float, action="store", required=False, default=0.9)
    parser.add_argument("--cophenetic_cutoff", type=float, action="store", required=False, default=0.85)

    args = parser.parse_args()
    assert args.gamma <= 1, "Gamma must be <= 1.00"
    assert args.cophenetic_cutoff <= 1, "Cophenetic cutoffs must be <= 1.00"

    calls = CallTable(args.infile)
    esp_calls = CallTable(args.esp_infile)

    calls.calls["cohort"] = args.cohort
    esp_calls.calls["cohort"] = "ESP"

    calls.appendCalls(esp_calls)

    calls = calls.clusterCallsByCohort(gamma=args.gamma, cohort_field="cohort", cophenetic_cutoff=args.cophenetic_cutoff)

    #clean up calls table
    del calls.calls["cnvrID_ESP"]
    calls = CallTable(calls.calls.rename(columns={'cnvr_frequency_HSCR': 'cnvr_frequency', 'cnvrID_HSCR': 'cnvrID'}))
    # filter for original cohort and save
    calls.filter(lambda x: x["cohort"] == args.cohort).save(args.outfile)
Example #3
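    # Note: this fragment assumes that p (a ConiferPipeline), args, and the SDFilter,
    # PPGFilter, SDCount and PPG_probe_count filter/annotation objects are defined
    # earlier in the script.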
    OtherDupFilter = CallFilterTemplate(p,
                     "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/3copiesin27of34.bed",
                     name="Dup_overlap",
                     filter_type="overlap",
                     func=lambda x: x < 0.5)

    GeneAnnotation = CallFilterTemplate(p,
                     "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/hg19.refGene.bed",
                     name="RefSeq",
                     filter_type="name")

    def signalFilter(x):
        if x["num_probes"] <= 2:
            return np.abs(x["median_svdzrpkm"]) >= 1.5
        elif x["num_probes"] <= 5:
            return np.abs(x["median_svdzrpkm"]) >= 1
        else:
            return np.abs(x["median_svdzrpkm"]) >= 0.5

    calls = CallTable(args.call_file)
    calls = calls.filter(signalFilter)\
                 .filter(lambda x: x["probability"] > 0.99)\
                 .filter(SDFilter)\
                 .filter(PPGFilter)\
                 .filter(OtherDupFilter)\
                 .annotate(SDCount)\
                 .annotate(PPG_probe_count)

    calls.save(args.outfile)
Example #4
from conifertools import ConiferPipeline, CallTable, CallFilterTemplate
import numpy as np
import argparse
import pandas as pd
import os

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--call_file", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    args = parser.parse_args()
    
    calls = CallTable(args.call_file)
    samples = pd.read_csv("/net/eichler/vol20/projects/epi4k/nobackups/araja/epi4k_exome/xhmm/DATA/conifer_xhmm_overlap_epp.bed", sep="\t")
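    # pd.merge defaults to an inner join on the shared columns, so only calls that
    # match a row in the overlap table are kept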
    calls.calls = pd.merge(samples, calls.calls)
    calls.save(args.outfile)
Example #5
from conifertools import CallTable
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", "-i", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    parser.add_argument("--gamma",
                        type=float,
                        action="store",
                        required=False,
                        default=0.9)
    parser.add_argument("--cophenetic_cutoff",
                        type=float,
                        action="store",
                        required=False,
                        default=0.85)
    args = parser.parse_args()

    assert args.gamma <= 1, "Gamma must be <= 1.00"
    assert args.cophenetic_cutoff <= 1, "All cophenetic cutoffs must be <= 1.00"

    calls = CallTable(args.infile)
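    # clusterCalls groups calls into CNV regions (cf. the cnvrID/cnvr_frequency columns
    # used in the other examples); gamma and cophenetic_cutoff control cluster granularity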

    calls = calls.clusterCalls(gamma=args.gamma,
                               cophenetic_cutoff=args.cophenetic_cutoff)

    calls.save(args.outfile)

Example #6
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--conifer_file", action="store", required=True)
    parser.add_argument("--call_file", action="store", required=True)
    parser.add_argument("--out_dir", action="store", required=True)
    parser.add_argument("--min_freq", type=int, action="store", required=False, default=0)
    parser.add_argument("--max_freq", type=int, action="store", required=False, default=30)
    parser.add_argument("--cnvrID", type=int, nargs="*", action="store", required=False, default=None)
    parser.add_argument("--cohort", action="store", required=False, default = "SSC")
    args = parser.parse_args()

    
    INHERITED_CODES = ['fa_to_both', 'fa_to_pro', 'fa_to_sib', 'mo_to_both', 'mo_to_pro', 'mo_to_sib']
    calls = CallTable(args.call_file)
    del calls.calls["cnvrID"]
    if isinstance(args.cnvrID, list):
        calls = calls.filter(lambda x: x["cnvrID_%s" % args.cohort] in args.cnvrID)

    calls = calls.filter(lambda x: (x["cnvr_frequency_%s" % args.cohort] >= args.min_freq) & (x["cnvr_frequency_%s" % args.cohort] < args.max_freq))

    calls.calls["familyID"] = map(lambda x: x.split(".")[:-1], calls.calls["sampleID"])
    offspring_calls = calls.filter(lambda x: x["sampleID"].endswith(("p1","s1","s2","s3")))
    parent_calls =  calls.filter(lambda x: x["sampleID"].endswith(("mo","fa")))
    sibling_calls = calls.filter(lambda x: x["sampleID"].endswith(("s1", "s2", "s3")))

    plotters = {}
    colors = {"fa": "b", "mo": "b", "sib": "g", "pro": "r"}
    for rel, codes in zip(["pro", "sib", "mo", "fa"], [["p1", "p2"], ["s1", "s2", "s3"], ["mo"], ["fa"]]):
        # create a plotter for each family member
Example #7
from conifertools import ConiferPipeline, CallTable
import argparse
import numpy as np


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_call_files", nargs="+")
    parser.add_argument("--outfile", "-o", action="store", required=True)
    args = parser.parse_args()
    calls = CallTable()
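    # concatenate all of the input call tables into a single CallTable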
    for filename in args.input_call_files:
        calls.appendCalls(CallTable(filename))

    calls.save(args.outfile)
Example #8
from conifertools import CallTable
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", "-i", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    parser.add_argument("--gamma", type=float, action="store", required=False, default=0.9)
    parser.add_argument("--cophenetic_cutoff", type=float, action="store", required=False, default=0.85)
    args = parser.parse_args()

    assert args.gamma <= 1, "Gamma must be <= 1.00"
    assert args.cophenetic_cutoff <= 1, "Cophenetic cutoff must be <= 1.00"

    calls = CallTable(args.infile)

    calls = calls.clusterCalls(gamma=args.gamma,
                               cophenetic_cutoff=args.cophenetic_cutoff)

    calls.save(args.outfile)
    
    return {"mo": (tdist.cdf(p_from_mother), np.median(data["mo"].rpkm[exon_start:exon_stop+1])), 
            "fa": (tdist.cdf(p_from_father), np.median(data["fa"].rpkm[exon_start:exon_stop+1]))}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--conifer_file", action="store", required=True)
    parser.add_argument("--call_file", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    parser.add_argument("--threshold", default=0.99)
    parser.add_argument("--sample_size", default=None)
    args = parser.parse_args()
    p = ConiferPipeline(args.conifer_file)

    calls = CallTable(args.call_file)
    calls.calls["familyID"] = map(lambda x: x.split('.')[0], calls.calls["sampleID"])
    new_calls = CallTable()
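    # split calls by family role using the sampleID suffix: offspring (probands and
    # siblings) versus parents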
    offspring_calls = calls.filter(lambda x: x["sampleID"].endswith(("p", "s", "p1"))).calls
    #offspring_calls = calls.filter(lambda x: x["sampleID"][6] in ["p","s"]).calls
    parent_calls = calls.filter(lambda x: x["sampleID"].endswith(("m", "f", "mo", "fa"))).calls
    #parent_calls =  calls.filter(lambda x: x["sampleID"][6] in ["m","f"]).calls
    
    if args.sample_size:
        sample_size = int(args.sample_size)
    else:
        sample_size = len(p.samples)
    print sample_size
    total_calls = len(offspring_calls)
    cnt = 0
    for ix, c in offspring_calls.iterrows():
Example #10
from conifertools import ConiferPipeline, CallTable, CallFilterTemplate
import argparse


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("conifer_file")
    parser.add_argument("call_file")
    parser.add_argument("out_file")

    args = parser.parse_args()

    calls = CallTable(args.call_file)

    p = ConiferPipeline(args.conifer_file)

    GeneAnnotation = CallFilterTemplate(p,
                 "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/hg19.refGene.bed",
                 name="RefSeq",
                 filter_type="name")


    calls = calls.annotate(GeneAnnotation)
    print_cols = ["cnvrID_SSC", "sampleID", "chromosome", "start", "stop", "state", "size_bp", 
                  "cnvr_frequency_SSC", "cohort",
                  "median_svdzrpkm", "num_probes", "RefSeq"]
    calls.save(args.out_file, cols=print_cols)


Example #11
from conifertools import CallTable
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--call_files",
                        nargs="+",
                        action="store",
                        required=True)
    parser.add_argument("--outfile", action="store", required=True)
    parser.add_argument("--cols",
                        nargs="+",
                        action="store",
                        default=[],
                        required=False)
    args = parser.parse_args()

    calls = CallTable(args.call_files)
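    # write the calls sorted by genomic position, restricted to --cols if any were given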

    if len(args.cols) > 0:
        calls.calls[args.cols]\
             .sort(["chromosome", "start"])\
             .to_csv(args.outfile, sep="\t")
    else:
        calls.calls\
             .sort(["chromosome", "start"])\
             .to_csv(args.outfile, sep="\t")