Exemple #1
0
def merge_perchrom_vcfs(job, perchrom_vcfs, tool_name, univ_options):
    """
    Merge per-chromosome vcf files into a single genome level vcf.

    The chromosomes are processed in the order given by `chrom_sorted`. Header lines (those
    starting with '#') are copied from the first vcf only; every other vcf contributes just its
    records. The merged file is written to the job store and handed to `export_results`.

    :param job: Job object whose `fileStore` is used to fetch the inputs, store the merged vcf and
           log to the master
    :param dict perchrom_vcfs: Dictionary with chromosome name as key and fsID of the corresponding
           vcf as value
    :param str tool_name: Name of the tool that generated the vcfs
    :param dict univ_options: Dict of universal options; passed through to `export_results`
    :returns: fsID for the merged vcf
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    vcf_ext = '.vcf'
    input_files = {chrom + vcf_ext: jsid for chrom, jsid in perchrom_vcfs.items()}
    input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)

    # Map chromosome name -> local vcf path.  The extension is removed by slicing: str.rstrip()
    # treats its argument as a set of characters and would also eat trailing 'v', 'c', 'f' or '.'
    # characters that belong to the chromosome name itself.
    vcf_paths = {}
    for file_name, local_path in input_files.items():
        if file_name.endswith(vcf_ext):
            chrom_name = file_name[:-len(vcf_ext)]
        else:
            chrom_name = file_name
        vcf_paths[chrom_name] = local_path

    merged_vcf = os.path.join(work_dir, 'all_merged.vcf')
    write_header = True
    with open(merged_vcf, 'w') as outvcf:
        for chrom_name in chrom_sorted(list(vcf_paths)):
            with open(vcf_paths[chrom_name], 'r') as infile:
                for line in infile:
                    line = line.strip()
                    if not line:
                        # Blank lines are not valid vcf records; do not propagate them.
                        continue
                    if line.startswith('#'):
                        if write_header:
                            print(line, file=outvcf)
                        continue
                    # Once records have started, no further header lines are accepted.
                    write_header = False
                    print(line, file=outvcf)
            # Only the first vcf contributes the header, even if it contained no records at all.
            # Without this, an empty first chromosome would cause the next file's header to be
            # duplicated into the merged vcf.
            write_header = False

    output_file = job.fileStore.writeGlobalFile(merged_vcf)
    export_results(job,
                   output_file,
                   merged_vcf,
                   univ_options,
                   subfolder='mutations/' + tool_name)

    job.fileStore.logToMaster('Ran merge_perchrom_vcfs for %s successfully' %
                              tool_name)
    return output_file
Exemple #2
0
def merge_perchrom_mutations(job, chrom, mutations, univ_options):
    """
    Merge the mutation calls for a single chromosome.

    Each caller's vcf for `chrom` is first passed through its caller-specific processing function
    and then read with `read_vcf`. A call is accepted if at least 2 snv callers, or at least 1
    indel caller, report it. The accepted calls are written to `<chrom>.vcf` with the supporting
    callers listed in the INFO column.

    :param job: Job object whose `fileStore` is used to store the merged vcf
    :param str chrom: Chromosome to process
    :param dict mutations: dict of dicts of the various mutation caller names as keys, and a dict of
           per chromosome job store ids for vcfs as value. The input dict is not modified.
    :param dict univ_options: Dict of universal options used by almost all tools
    :returns: fsID for vcf containing merged calls for the given chromosome
    :rtype: toil.fileStore.FileID
    """
    work_dir = os.getcwd()
    # NOTE(review): these imports are function-local in the original; presumably to avoid a
    # circular import between the caller modules and this one -- confirm before hoisting.
    from protect.mutation_calling.muse import process_muse_vcf
    from protect.mutation_calling.mutect import process_mutect_vcf
    from protect.mutation_calling.radia import process_radia_vcf
    from protect.mutation_calling.somaticsniper import process_somaticsniper_vcf
    from protect.mutation_calling.strelka import process_strelka_vcf

    # Work on a shallow copy so the dict owned by the caller is left untouched.
    caller_vcfs = dict(mutations)
    caller_vcfs.pop('indels', None)
    # Strelka reports snvs and indels separately; expose each as its own pseudo-caller.
    caller_vcfs['strelka_indels'] = mutations['strelka']['indels']
    caller_vcfs['strelka_snvs'] = mutations['strelka']['snvs']

    vcf_processor = {
        'snvs': {
            'mutect': process_mutect_vcf,
            'muse': process_muse_vcf,
            'radia': process_radia_vcf,
            'somaticsniper': process_somaticsniper_vcf,
            'strelka_snvs': process_strelka_vcf
        },
        'indels': {
            'strelka_indels': process_strelka_vcf
        }
    }
    # Minimum number of callers that must support a call for it to be accepted.
    # For now, let's just say 2 out of n need to call an snv.
    majority = {'snvs': 2, 'indels': 1}

    # accepted_hits[chromosome][position] = (ref, alt, comma-separated supporting callers)
    accepted_hits = defaultdict(dict)

    for mut_type, processors in vcf_processor.items():
        # Run the caller-specific processing on every input vcf for this chromosome
        perchrom_mutations = {
            caller: processor(job, caller_vcfs[caller][chrom], work_dir, univ_options)
            for caller, processor in processors.items()
        }
        # Report the strelka pseudo-caller simply as 'strelka'
        perchrom_mutations['strelka'] = perchrom_mutations.pop('strelka_' + mut_type)
        # Read each file into a set so the per-position membership tests below are O(1)
        vcf_calls = {
            caller: set(read_vcf(vcf_file))
            for caller, vcf_file in perchrom_mutations.items()
        }
        all_positions = set().union(*vcf_calls.values())
        for position in sorted(all_positions):
            supporting = [caller for caller, calls in vcf_calls.items() if position in calls]
            if len(supporting) < majority[mut_type]:
                continue
            hit_chrom, hit_pos = position[0], position[1]
            assert hit_pos not in accepted_hits[hit_chrom], (
                'Multiple accepted calls at %s:%s' % (hit_chrom, hit_pos))
            accepted_hits[hit_chrom][hit_pos] = (position[2], position[3],
                                                 ','.join(supporting))

    merged_vcf = os.path.join(work_dir, chrom + '.vcf')
    with open(merged_vcf, 'w') as outfile:
        print('##fileformat=VCFv4.0', file=outfile)
        # The description must be quoted and the structured header line closed with '>' to be a
        # well-formed vcf meta-information line.
        print(
            '##INFO=<ID=callers,Number=.,Type=String,Description="List of supporting callers.">',
            file=outfile)
        print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=outfile)
        for hit_chrom in chrom_sorted(accepted_hits.keys()):
            chrom_hits = accepted_hits[hit_chrom]
            # NOTE(review): positions are sorted as returned by read_vcf; if those are strings the
            # order is lexicographic rather than numeric -- confirm against read_vcf.
            for hit_pos in sorted(chrom_hits):
                ref, alt, callers = chrom_hits[hit_pos]
                print(hit_chrom,
                      hit_pos,
                      '.',
                      ref,
                      alt,
                      '.',
                      'PASS',
                      'callers=' + callers,
                      sep='\t',
                      file=outfile)
    fsid = job.fileStore.writeGlobalFile(merged_vcf)
    export_results(job,
                   fsid,
                   merged_vcf,
                   univ_options,
                   subfolder='mutations/merged')
    return fsid