Esempio n. 1
0
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    if not all(
            utils.get_in(data, ("config", "algorithm", "aligner")) in
        ["bwa", "sentieon-bwa", "minimap2", False, None] for data in items):
        raise ValueError(
            "Require bwa or minimap2 alignment input for lumpy structural variation detection"
        )
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams,
                                          work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]

    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(
                lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name,
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            else:
                effects_vcf = None
            data["sv"].append({
                "variantcaller": "lumpy",
                "vrn_file": effects_vcf or vcf_file,
                "exclude_file": exclude_file
            })
        out.append(data)
    return out
Esempio n. 2
0
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(
        paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams,
                                          work_dir, items)
    lumpy_vcf = sshared.annotate_with_depth(lumpy_vcf, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]

    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(
                lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name,
                                        [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            data["sv"].append({
                "variantcaller": "lumpy",
                "vrn_file": effects_vcf or vcf_file,
                "do_upload": upload_counts[vcf_file] ==
                0,  # only upload a single file per batch
                "exclude_file": exclude_file
            })
            upload_counts[vcf_file] += 1
        out.append(data)
    return out
Esempio n. 3
0
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items)
    lumpy_vcf = sshared.annotate_with_depth(lumpy_vcf, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]

    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               "do_upload": upload_counts[vcf_file] == 0,  # only upload a single file per batch
                               "exclude_file": exclude_file})
            upload_counts[vcf_file] += 1
        out.append(data)
    return out