Esempio n. 1
0
def merge_calls_across_SD(calls, pipeline, max_distance_to_merge=50, min_sd_percent=0.5):
    """Merge neighbouring same-state calls separated by SegDup-rich gaps.

    For every sample and chromosome the calls are walked in order of
    ``start``.  A call is folded into the running call (via ``merge_calls``)
    when both share the same ``state``, the gap
    ``second.start_exon - first.stop_exon`` is below
    ``max_distance_to_merge``, and the fraction of gap probes flagged
    ``isSegDup`` is at least ``min_sd_percent``.

    Args:
        calls: object exposing a ``calls`` DataFrame with at least the
            ``sampleID``, ``chromosome``, ``start``, ``start_exon``,
            ``stop_exon`` and ``state`` columns.
        pipeline: currently unused; the probe tables are read from the
            module-level ``p``.  NOTE(review): this parameter was probably
            meant to replace ``p`` -- confirm against callers before
            switching.
        max_distance_to_merge: exclusive upper bound on the gap, expressed in
            the same units as ``start_exon``/``stop_exon``.
        min_sd_percent: minimum SegDup fraction of the gap (0-1) required to
            merge.

    Returns:
        A ``CallTable`` wrapping a DataFrame with one row per resulting call.
    """
    out_calls = []

    # Probe tables for chromosomes 1..23, keyed by chromosome number.
    chrom_probes = {}
    for chrom in range(1, 24):
        probe_node = p.r.h5file.root.probes._f_getChild("probes_chr%d" % chrom)
        chrom_probes[chrom] = pd.DataFrame(probe_node.read())

    for sampleID in set(calls.calls["sampleID"]):
        print(sampleID)
        sample_calls = CallTable(calls.calls[calls.calls["sampleID"] == sampleID])
        for chrom in set(sample_calls.calls.chromosome):
            chr_probes = chrom_probes[chrom]
            calls_to_merge = sample_calls.filter(lambda x: x["chromosome"] == chrom).calls.sort("start")
            if len(calls_to_merge) <= 1:
                # a single call on this chromosome: nothing to merge
                out_calls.append(calls_to_merge.ix[calls_to_merge.index[0]])
                continue

            # first_call is the running (possibly already merged) call that
            # each subsequent call is compared against.
            first_call = calls_to_merge.ix[calls_to_merge.index[0]]
            for _, second_call in calls_to_merge.ix[calls_to_merge.index[1:]].iterrows():
                delta = second_call["start_exon"] - first_call["stop_exon"]
                same_state = second_call["state"] == first_call["state"]
                if same_state and (delta < max_distance_to_merge):
                    if delta > 0:
                        gap_sd_count = chr_probes.ix[xrange(first_call["stop_exon"], second_call["start_exon"])].isSegDup.sum()
                        gap_sd_percent = float(gap_sd_count) / delta
                    else:
                        # Adjacent or overlapping calls have no gap probes to
                        # inspect; use a zero SegDup fraction rather than
                        # dividing by zero.
                        gap_sd_percent = 0.0
                    if gap_sd_percent >= min_sd_percent:
                        merged = merge_calls(first_call, second_call)
                        first_call = merged.copy()
                        continue
                # Not mergeable (different state, too far apart, or gap not
                # SegDup-rich enough): emit the running call and restart.
                out_calls.append(first_call)
                first_call = second_call

            # Emit the running call.  It equals the last row when that row
            # was not merged, and the merged call when it was -- appending
            # the raw last row here would drop a trailing merge.
            out_calls.append(first_call)

    out_calls = pd.DataFrame(out_calls)
    return CallTable(out_calls)
import argparse

if __name__ == "__main__":
    # Cluster a cohort's calls together with a second ("ESP") call set, then
    # save only the original cohort's calls with the cluster annotations.
    parser = argparse.ArgumentParser()
    parser.add_argument("--infile", "-i", action="store", required=True)
    parser.add_argument("--cohort", action="store", required=True)
    parser.add_argument("--esp_infile", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    parser.add_argument("--gamma", type=float, action="store", required=False, default=0.9)
    parser.add_argument("--cophenetic_cutoff", type=float, action="store", required=False, default=0.85)

    args = parser.parse_args()
    # Validate with parser.error rather than assert: assert statements are
    # removed when Python runs with -O, which would silently skip the checks.
    if not args.gamma <= 1:
        parser.error("Gamma must be <= 1.00")
    if not args.cophenetic_cutoff <= 1:
        parser.error("Cophenetic cutoffs must be <= 1.00")

    calls = CallTable(args.infile)
    esp_calls = CallTable(args.esp_infile)

    # Tag every call with its cohort so clustering can tell the sets apart.
    calls.calls["cohort"] = args.cohort
    esp_calls.calls["cohort"] = "ESP"

    calls.appendCalls(esp_calls)

    calls = calls.clusterCallsByCohort(gamma=args.gamma, cohort_field="cohort", cophenetic_cutoff=args.cophenetic_cutoff)

    # clean up calls table
    del calls.calls["cnvrID_ESP"]
    # NOTE(review): the renamed columns are hard-coded for an "HSCR" cohort;
    # presumably they should follow --cohort -- confirm before generalizing.
    calls = CallTable(calls.calls.rename(columns={'cnvr_frequency_HSCR': 'cnvr_frequency', 'cnvrID_HSCR': 'cnvrID'}))
    # filter for original cohort and save
    calls.filter(lambda x: x["cohort"] == args.cohort).save(args.outfile)
Esempio n. 3
0
    # Overlap-type filter built on the 3copiesin27of34 BED track; the
    # predicate accepts values strictly below 0.5 (presumably the overlap
    # fraction -- confirm against CallFilterTemplate).
    OtherDupFilter = CallFilterTemplate(
        p,
        "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/3copiesin27of34.bed",
        name="Dup_overlap",
        filter_type="overlap",
        func=lambda x: x < 0.5,
    )

    # Name-type template built on the hg19 refGene BED track.
    GeneAnnotation = CallFilterTemplate(
        p,
        "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/hg19.refGene.bed",
        name="RefSeq",
        filter_type="name",
    )

    def signalFilter(x):
        """Return True when the call's signal is strong enough for its size.

        The required magnitude of ``median_svdzrpkm`` depends on
        ``num_probes``: 1.5 for up to 2 probes, 1 for up to 5 probes and
        0.5 for anything larger.
        """
        probe_count = x["num_probes"]
        if probe_count <= 2:
            min_signal = 1.5
        elif probe_count <= 5:
            min_signal = 1
        else:
            min_signal = 0.5
        return np.abs(x["median_svdzrpkm"]) >= min_signal

    # Load the calls, apply each filter in turn, then add the annotations.
    calls = CallTable(args.call_file)
    calls = calls.filter(signalFilter)
    calls = calls.filter(lambda x: x["probability"] > 0.99)
    calls = calls.filter(SDFilter)
    calls = calls.filter(PPGFilter)
    calls = calls.filter(OtherDupFilter)
    calls = calls.annotate(SDCount)
    calls = calls.annotate(PPG_probe_count)

    # Persist the filtered, annotated table.
    calls.save(args.outfile)
    # Command-line options: input files, output directory, CNVR frequency
    # window, optional explicit CNVR ids, and the cohort column suffix.
    parser = argparse.ArgumentParser()
    parser.add_argument("--conifer_file", action="store", required=True)
    parser.add_argument("--call_file", action="store", required=True)
    parser.add_argument("--out_dir", action="store", required=True)
    parser.add_argument("--min_freq", type=int, action="store", required=False, default=0)
    parser.add_argument("--max_freq", type=int, action="store", required=False, default=30)
    # nargs="*" yields a list when the flag is given; otherwise stays None.
    parser.add_argument("--cnvrID", type=int, nargs="*", action="store", required=False, default=None)
    parser.add_argument("--cohort", action="store", required=False, default = "SSC")
    args = parser.parse_args()

    
    # Parent-to-child transmission codes (presumably inheritance labels);
    # not referenced in the lines that follow.
    INHERITED_CODES = ['fa_to_both', 'fa_to_pro', 'fa_to_sib', 'mo_to_both', 'mo_to_pro', 'mo_to_sib']
    calls = CallTable(args.call_file)
    # Drop the generic cnvrID column; the cohort-suffixed columns are used below.
    del calls.calls["cnvrID"]# = calls.calls["cnvrID_%s" % args.cohort]
    # Only restrict by CNVR id when --cnvrID was supplied on the command line.
    if isinstance(args.cnvrID, list):
        calls = calls.filter(lambda x: x["cnvrID_%s" % args.cohort] in args.cnvrID)

    # Keep calls whose cohort frequency lies in [min_freq, max_freq).
    calls = calls.filter(lambda x: (x["cnvr_frequency_%s" % args.cohort] >= args.min_freq) & (x["cnvr_frequency_%s" % args.cohort] < args.max_freq))

    # NOTE(review): split(".")[:-1] produces a list of the leading
    # dot-separated parts, not a string -- confirm a list-valued familyID is
    # intended (a scalar would need e.g. [0] or ".".join(...)).
    calls.calls["familyID"] = map(lambda x: x.split(".")[:-1], calls.calls["sampleID"])
    # Partition the calls by the sampleID suffix.
    offspring_calls = calls.filter(lambda x: x["sampleID"].endswith(("p1","s1","s2","s3")))
    parent_calls =  calls.filter(lambda x: x["sampleID"].endswith(("mo","fa")))
    sibling_calls = calls.filter(lambda x: x["sampleID"].endswith(("s1", "s2", "s3")))

    # Containers filled per family relation: plotter objects and colour codes.
    plotters = {}
    colors = {"fa": "b", "mo": "b", "sib": "g", "pro": "r"}
    for rel, codes in zip(["pro", "sib", "mo", "fa"], [["p1", "p2"], ["s1", "s2", "s3"], ["mo"], ["fa"]]):
        # create a plotter for each family member
        plotters[rel] = ConiferPlotter(args.conifer_file)
        # add the calls from that group
        t=ConiferPlotTrack(plotters[rel], data_in = calls.filter(lambda x: x["sampleID"].split(".")[1] in codes), 

if __name__ == "__main__":
    # Load the pipeline and call table, then split the calls into offspring
    # and parent sets by sampleID suffix before the per-call loop below.
    parser = argparse.ArgumentParser()
    parser.add_argument("--conifer_file", action="store", required=True)
    parser.add_argument("--call_file", action="store", required=True)
    parser.add_argument("--outfile", "-o", action="store", required=True)
    # type=float so a value given on the command line has the same type as
    # the float default instead of arriving as a string.
    parser.add_argument("--threshold", type=float, default=0.99)
    parser.add_argument("--sample_size", default=None)
    args = parser.parse_args()
    p = ConiferPipeline(args.conifer_file)

    calls = CallTable(args.call_file)
    # familyID is the part of the sampleID before the first dot.
    calls.calls["familyID"] = [sample.split('.')[0] for sample in calls.calls["sampleID"]]
    new_calls = CallTable()
    offspring_calls = calls.filter(lambda x: x["sampleID"].endswith(("p", "s", "p1"))).calls
    #offspring_calls = calls.filter(lambda x: x["sampleID"][6] in ["p","s"]).calls
    parent_calls = calls.filter(lambda x: x["sampleID"].endswith(("m", "f", "mo", "fa"))).calls
    #parent_calls =  calls.filter(lambda x: x["sampleID"][6] in ["m","f"]).calls

    # Fall back to the number of samples in the pipeline when --sample_size
    # is not given.
    if args.sample_size:
        sample_size = int(args.sample_size)
    else:
        sample_size = len(p.samples)
    print(sample_size)
    total_calls = len(offspring_calls)
    cnt = 0
    for ix, c in offspring_calls.iterrows():
        # first check if parents already have calls:
        cnt += 1
        if cnt % 100 == 0:
        p,
        "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/3copiesin27of34.bed",
        name="Dup_overlap",
        filter_type="overlap",
        func=lambda x: x < 0.5)

    # Name-type template built on the hg19 refGene BED track.
    GeneAnnotation = CallFilterTemplate(p,
                                        "/net/eichler/vol8/home/nkrumm/REFERENCE_INFO/hg19.refGene.bed",
                                        name="RefSeq",
                                        filter_type="name")

    def signalFilter(x):
        """Return True when the call's signal is strong enough for its size.

        The required magnitude of ``median_svdzrpkm`` depends on
        ``num_probes``: 1.5 for up to 2 probes, 1 for up to 5 probes and
        0.5 for anything larger.
        """
        probe_count = x["num_probes"]
        if probe_count <= 2:
            min_signal = 1.5
        elif probe_count <= 5:
            min_signal = 1
        else:
            min_signal = 0.5
        return np.abs(x["median_svdzrpkm"]) >= min_signal

    # Load the calls, apply each filter in turn, then add the annotations.
    calls = CallTable(args.call_file)
    calls = calls.filter(signalFilter)
    calls = calls.filter(lambda x: x["probability"] > 0.99)
    calls = calls.filter(SDFilter)
    calls = calls.filter(PPGFilter)
    calls = calls.filter(OtherDupFilter)
    calls = calls.annotate(SDCount)
    calls = calls.annotate(PPG_probe_count)

    # Persist the filtered, annotated table.
    calls.save(args.outfile)