Esempio n. 1
0
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath,
                 bam_sorted_fpath, log_path, err_path, cov_fpath,
                 physical_cov_fpath):
    """Produce the reads coverage file and the physical coverage file.

    Every intermediate product (sorted BAM, raw coverage, final coverage)
    is reused when a non-empty file already exists at its expected path.

    Args:
        output_dirpath: directory handed on to get_physical_coverage().
        ref_fpath: reference path, used to obtain the chromosome lengths file.
        ref_name: reference name handed on to get_physical_coverage().
        bam_fpath: input BAM file.
        bam_sorted_fpath: sorted BAM path; created with sambamba if it is
            missing or empty.
        log_path: file that receives sambamba's stdout (append mode).
        err_path: file that receives the tools' stderr (append mode).
        cov_fpath: destination of the processed reads coverage.
        physical_cov_fpath: destination of the physical coverage.

    Returns:
        Tuple (cov_fpath, physical_cov_fpath). When the physical coverage
        had to be computed, the second item is the value returned by
        get_physical_coverage().
    """
    raw_cov_fpath = cov_fpath + '_raw'
    if not is_non_empty_file(cov_fpath):
        logger.info('  Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                sort_cmd = [
                    sambamba_fpath('sambamba'), 'sort', '-t',
                    str(qconfig.max_threads), '-o', bam_sorted_fpath, bam_fpath
                ]
                # Context managers make sure the log handles are closed even
                # when the subprocess call raises.
                with open(log_path, 'a') as log_file, \
                        open(err_path, 'a') as err_file:
                    qutils.call_subprocess(sort_cmd,
                                           stdout=log_file,
                                           stderr=err_file)
            chr_len_fpath = get_chr_len_fpath(ref_fpath)
            genomecov_cmd = [
                bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam',
                bam_sorted_fpath, '-g', chr_len_fpath
            ]
            # bedtools writes the coverage table to its stdout.
            with open(raw_cov_fpath, 'w') as raw_cov_file, \
                    open(err_path, 'a') as err_file:
                qutils.call_subprocess(genomecov_cmd,
                                       stdout=raw_cov_file,
                                       stderr=err_file)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath)
    if not is_non_empty_file(physical_cov_fpath):
        physical_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath,
                                                   ref_name, bam_fpath,
                                                   log_path, err_path,
                                                   physical_cov_fpath)
    return cov_fpath, physical_cov_fpath
Esempio n. 2
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Run Manta for one reference and convert its VCF output to a BED file.

    Expects `<output_dirpath>/<ref>.sam` to exist, where `<ref>` is derived
    from `cur_ref_fpath` by qutils.name_from_fpath(). Intermediate files
    (sorted BAM, BAM index, FASTA index, Manta results, unpacked VCF) are
    reused when they are already present.

    Args:
        cur_ref_fpath: path to the reference FASTA.
        output_dirpath: working directory holding the SAM/BAM files.
        err_path: log file that receives the tools' stderr (append mode).
        bed_fpath: optional destination BED path; defaults to
            `<output_dirpath>/<ref>.bed`.

    Returns:
        Path to the BED file, or None when the SAM file is smaller than
        1 MiB or Manta's configuration step did not create runWorkflow.py.
    """

    def _run(cmd, stdout_fpath=None, stdout_mode='w'):
        # Run cmd with stderr appended to err_path and, optionally, stdout
        # redirected to stdout_fpath; handles are closed when the call ends.
        if stdout_fpath is None:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd, stderr=err_file, logger=logger)
        else:
            with open(stdout_fpath, stdout_mode) as out_file, \
                    open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file,
                                       logger=logger)

    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    # samtools is given this prefix and the code then looks for prefix + '.bam'
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted')
    sorted_bam_fpath = ref_bamsorted_fpath + '.bam'
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')

    sam_size = os.path.getsize(ref_sam_fpath)
    if sam_size < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info('  SAM file is too small for Manta (%d Kb), skipping..' % (sam_size // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info('  Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath

    samtools = samtools_fpath('samtools')
    if not os.path.exists(sorted_bam_fpath):
        _run([samtools, 'view', '-bS', ref_sam_fpath], stdout_fpath=ref_bam_fpath)
        _run([samtools, 'sort', ref_bam_fpath, ref_bamsorted_fpath])
    if not is_non_empty_file(sorted_bam_fpath + '.bai'):
        _run([samtools, 'index', sorted_bam_fpath])
    if not is_non_empty_file(cur_ref_fpath + '.fai'):
        _run([samtools, 'faidx', cur_ref_fpath])

    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # Start from a clean run directory before configuring Manta.
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # Both output streams of the configuration step go to the error log.
        _run([config_manta_fpath, '--normalBam', sorted_bam_fpath,
              '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
             stdout_fpath=err_path, stdout_mode='a')
        workflow_fpath = os.path.join(vcfoutput_dirpath, 'runWorkflow.py')
        if not os.path.exists(workflow_fpath):
            return None
        _run([workflow_fpath, '-m', 'local', '-j', str(qconfig.max_threads)])
    if not is_non_empty_file(unpacked_SV_fpath):
        # Argument list instead of a formatted string: keeps paths that
        # contain whitespace or quotes intact.
        _run(['gunzip', '-c', found_SV_fpath], stdout_fpath=unpacked_SV_fpath)

    from manta import vcfToBedpe
    # Close both files deterministically so the BED file is complete on disk
    # before its path is returned.
    with open(unpacked_SV_fpath) as vcf_file, open(ref_bed_fpath, 'w') as bed_file:
        vcfToBedpe.vcfToBedpe(vcf_file, bed_file)
    return ref_bed_fpath
Esempio n. 3
0
def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath,
                          log_path, err_path, cov_fpath):
    """Compute the physical coverage file from read pairs in a BAM file.

    Pipeline (run only when `cov_fpath` is missing or empty): filter the BAM
    with sambamba, sort it by read name, convert it to BEDPE with bamToBed,
    reduce each BEDPE record to one interval, sort the intervals, run
    `bedtools genomecov` and post-process the result with proceed_cov_file().

    Args:
        output_dirpath: directory for all intermediate files.
        ref_fpath: reference path, used to obtain the chromosome lengths file.
        ref_name: prefix for the names of the intermediate files.
        bam_fpath: input BAM file.
        log_path: file that receives `sambamba sort` stdout (append mode).
        err_path: file that receives the tools' stderr (append mode).
        cov_fpath: destination of the physical coverage file.

    Returns:
        `cov_fpath`, or None when the bamToBed executable does not exist.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info('  Failed calculating physical coverage...')
        return None
    if not is_non_empty_file(cov_fpath):
        logger.info('  Calculating physical coverage...')
        # keep properly mapped, unique, and non-duplicate read pairs only
        bam_filtered_fpath = os.path.join(output_dirpath,
                                          ref_name + '.filtered.bam')
        filter_cmd = [
            sambamba_fpath('sambamba'), 'view', '-t',
            str(qconfig.max_threads), '-h', '-f', 'bam', '-F',
            'proper_pair and not supplementary and not duplicate', bam_fpath
        ]
        # Every redirect target is opened in a `with` block so the handle is
        # closed as soon as the tool finishes (or fails).
        with open(bam_filtered_fpath, 'w') as filtered_file, \
                open(err_path, 'a') as err_file:
            qutils.call_subprocess(filter_cmd,
                                   stdout=filtered_file,
                                   stderr=err_file)

        # sort by read names
        bam_filtered_sorted_fpath = os.path.join(
            output_dirpath, ref_name + '.filtered.sorted.bam')
        sort_cmd = [
            sambamba_fpath('sambamba'), 'sort', '-t',
            str(qconfig.max_threads), '-n', '-o', bam_filtered_sorted_fpath,
            bam_filtered_fpath
        ]
        with open(log_path, 'a') as log_file, \
                open(err_path, 'a') as err_file:
            qutils.call_subprocess(sort_cmd, stdout=log_file, stderr=err_file)

        bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
        bedpe_cmd = [
            bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath,
            '-bedpe'
        ]
        with open(bedpe_fpath, 'w') as bedpe_out, \
                open(err_path, 'a') as err_file:
            qutils.call_subprocess(bedpe_cmd, stdout=bedpe_out, stderr=err_file)

        # Keep fields 1, 2 and 6 of every BEDPE record.
        # NOTE(review): presumably chrom, start of the first mate and end of
        # the second mate, i.e. the whole fragment -- confirm against the
        # bamToBed -bedpe column layout.
        raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
        with open(bedpe_fpath, 'r') as bedpe, \
                open(raw_bed_fpath, 'w') as bed_file:
            for line in bedpe:
                fs = line.split()
                bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))

        sorted_bed_fpath = os.path.join(output_dirpath,
                                        ref_name + '.sorted.bed')
        bed_sort_cmd = [bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath]
        with open(sorted_bed_fpath, 'w') as sorted_bed_file, \
                open(err_path, 'a') as err_file:
            qutils.call_subprocess(bed_sort_cmd,
                                   stdout=sorted_bed_file,
                                   stderr=err_file)

        chr_len_fpath = get_chr_len_fpath(ref_fpath)
        raw_cov_fpath = cov_fpath + '_raw'
        genomecov_cmd = [
            bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i',
            sorted_bed_fpath, '-g', chr_len_fpath
        ]
        with open(raw_cov_fpath, 'w') as raw_cov_file, \
                open(err_path, 'a') as err_file:
            qutils.call_subprocess(genomecov_cmd,
                                   stdout=raw_cov_file,
                                   stderr=err_file)
        proceed_cov_file(raw_cov_fpath, cov_fpath)
    return cov_fpath
Esempio n. 4
0
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels,
                         reads_fpaths, output_dirpath, res_path, log_path,
                         err_path):
    """Align reads to the reference and collect structural variations.

    Steps, each skipped when its output already exists:
      1. Align reads with Bowtie2 (`reads_fpaths[0]` is passed as -1 and
         `reads_fpaths[1]` as -2) into `<output_dirpath>/<ref_name>.sam`.
      2. Produce a sorted SAM file through samtools view/sort/view.
      3. If `meta_ref_fpaths` is given, split the sorted alignments into one
         SAM file per reference, using `ref_labels` to map sequence names to
         reference names.
      4. Scan the sorted SAM for "trivial deletions" (long zero-covered
         fragments) and save them to qconfig.trivial_deletions_fname.
      5. If Manta's configuration script exists, concatenate its result with
         the trivial deletions; otherwise copy the trivial deletions file.

    Args:
        main_ref_fpath: path to the (combined) reference FASTA.
        meta_ref_fpaths: paths of the individual references, or a falsy
            value when no splitting is wanted.
        ref_labels: mapping from sequence name to reference name.
        reads_fpaths: sequence with the forward and reverse reads files.
        output_dirpath: working directory for intermediate files.
        res_path: directory of the resulting BED file.
        log_path: file that receives the Bowtie2 tools' stdout (append mode).
        err_path: file that receives the tools' stderr (append mode).

    Returns:
        Path to `<res_path>/<ref_name>.bed`, or None when Bowtie2 produced
        no SAM file or no BED file exists at the end.
    """

    def _run(cmd, stdout_fpath=None, stdout_mode='w'):
        # Run cmd with stderr appended to err_path and, optionally, stdout
        # redirected to stdout_fpath; handles are closed when the call ends.
        if stdout_fpath is None:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd, stderr=err_file, logger=logger)
        else:
            with open(stdout_fpath, stdout_mode) as out_file, \
                    open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file,
                                       logger=logger)

    ref_name = qutils.name_from_fpath(main_ref_fpath)
    sam_fpath = os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
    bam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted')
    sam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted.sam')
    bed_fpath = os.path.join(res_path, ref_name + '.bed')

    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    logger.info('  ' + 'Pre-processing for searching structural variations...')
    logger.info('  ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info('  Using existing SAM-file: ' + sam_fpath)
    else:
        logger.info('  Running Bowtie2...')
        # use absolute paths because we will change workdir
        abs_reads_fpaths = [os.path.abspath(fpath) for fpath in reads_fpaths]

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            _run([bin_fpath('bowtie2-build'), main_ref_fpath, ref_name],
                 stdout_fpath=log_path, stdout_mode='a')
            # Argument list instead of a string passed through shlex.split():
            # paths containing whitespace or quotes stay intact.
            _run([bin_fpath('bowtie2'), '-x', ref_name,
                  '-1', abs_reads_fpaths[0], '-2', abs_reads_fpaths[1],
                  '-S', sam_fpath, '--no-unal', '-a',
                  '-p', str(qconfig.max_threads)],
                 stdout_fpath=log_path, stdout_mode='a')
            logger.info('  Done.')
        finally:
            # Restore the working directory even when a tool call raises.
            os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error('  Failed running Bowtie2 for the reference. See ' +
                         log_path + ' for information.')
            logger.info('  Failed searching structural variations.')
            return None

    logger.info('  Sorting SAM-file...')
    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        samtools = samtools_fpath('samtools')
        threads = str(qconfig.max_threads)
        _run([samtools, 'view', '-@', threads, '-bS', sam_fpath],
             stdout_fpath=bam_fpath)
        _run([samtools, 'sort', '-@', threads, bam_fpath, bam_sorted_fpath])
        _run([samtools, 'view', '-@', threads, bam_sorted_fpath + '.bam'],
             stdout_fpath=sam_sorted_fpath)

    if meta_ref_fpaths:
        logger.info('  Splitting SAM-file by references...')
    # Collect the header lines and the sequence lengths from the unsorted SAM.
    headers = []
    seq_name_length = {}
    with open(sam_fpath) as sam_file:
        for line in sam_file:
            if not line.startswith('@'):
                break
            if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                seq_name = line.split('\tSN:')[1].split('\t')[0]
                seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                seq_name_length[seq_name] = seq_length
            headers.append(line.strip())

    need_ref_splitting = False
    # reference name -> open per-reference SAM file, or None when an existing
    # split file is reused
    ref_files = {}
    deletions = []
    trivial_deletions_fpath = os.path.join(output_dirpath,
                                           qconfig.trivial_deletions_fname)
    need_trivial_deletions = True
    try:
        if meta_ref_fpaths:
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' %
                                (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    # Register the handle right away so the `finally` below
                    # closes it even if writing the header fails.
                    ref_files[ref] = new_ref_sam_file
                    # First header line, then this reference's @SQ lines, then
                    # the last header line.
                    new_ref_sam_file.write(headers[0] + '\n')
                    for h in headers:
                        if not (h.startswith('@SQ') and 'SN:' in h):
                            continue
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                    new_ref_sam_file.write(headers[-1] + '\n')
                    need_ref_splitting = True

        logger.info(
            '  Looking for trivial deletions (long zero-covered fragments)...')
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info('    Using existing file: ' + trivial_deletions_fpath)

        if need_trivial_deletions or need_ref_splitting:
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    # lines for which Mapping.parse() gives a falsy result
                    # are ignored
                    if not mapping:
                        continue
                    # common case: continue current deletion (potential) on the same reference
                    if cur_deletion and cur_deletion.ref == mapping.ref:
                        if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                            # just passed 0-covered fragment
                            if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                cur_deletion.set_next_bad(mapping)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_good(mapping)
                            # continue region BEFORE 0-covered fragment
                            elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_prev_good(mapping)
                            else:
                                cur_deletion.set_prev_bad(mapping)
                        else:  # previous mapping was in region AFTER 0-covered fragment
                            # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                            if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(
                                    mapping.ref).set_prev_bad(
                                        position=cur_deletion.next_bad_end)
                            # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                            if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_next_good(mapping)
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(
                                    mapping.ref).set_prev_good(mapping)
                            else:
                                cur_deletion.set_next_bad_end(mapping)
                    # special case: just started or just switched to the next reference
                    else:
                        if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                            cur_deletion.set_next_good(
                                position=seq_name_length[cur_deletion.ref])
                            if cur_deletion.is_valid():
                                deletions.append(cur_deletion)
                        cur_deletion = QuastDeletion(
                            mapping.ref).set_prev_good(mapping)

                    if need_ref_splitting:
                        cur_ref = ref_labels[mapping.ref]
                        # keep the record only if its mate maps to the same reference
                        if mapping.ref_next.strip() == '=' or \
                                cur_ref == ref_labels[mapping.ref_next]:
                            if ref_files[cur_ref] is not None:
                                ref_files[cur_ref].write(line)
                # end of file: close the deletion that is still open at the
                # end of the last sequence
                if cur_deletion and cur_deletion.ref in seq_name_length:
                    cur_deletion.set_next_good(
                        position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
    finally:
        # Close the per-reference SAM files on success and on failure alike.
        for ref_handler in ref_files.values():
            if ref_handler is not None:
                ref_handler.close()

    if need_trivial_deletions:
        logger.info('  Trivial deletions: %d found' % len(deletions))
        logger.info('    Saving to: ' + trivial_deletions_fpath)
        with open(trivial_deletions_fpath, 'w') as f:
            for deletion in deletions:
                f.write(str(deletion) + '\n')

    if os.path.exists(config_manta_fpath):
        manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths,
                                              output_dirpath, err_path)
        qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
    elif os.path.exists(trivial_deletions_fpath):
        shutil.copy(trivial_deletions_fpath, bed_fpath)

    if os.path.exists(bed_fpath):
        logger.main_info('  Structural variations saved to ' + bed_fpath)
        return bed_fpath
    else:
        logger.main_info('  Failed searching structural variations.')
        return None
Esempio n. 5
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Run Manta for one reference and convert its VCF output to a BED file.

    Expects `<output_dirpath>/<ref>.sam` to exist, where `<ref>` is derived
    from `cur_ref_fpath` by qutils.name_from_fpath(). Intermediate files
    (sorted BAM, BAM index, FASTA index, Manta results, unpacked VCF) are
    reused when they are already present.

    Args:
        cur_ref_fpath: path to the reference FASTA.
        output_dirpath: working directory holding the SAM/BAM files.
        err_path: log file that receives the tools' stderr (append mode).
        bed_fpath: optional destination BED path; defaults to
            `<output_dirpath>/<ref>.bed`.

    Returns:
        Path to the BED file, or None when the SAM file is smaller than
        1 MiB or Manta's configuration step did not create runWorkflow.py.
    """

    def _run(cmd, stdout_fpath=None, stdout_mode='w'):
        # Run cmd with stderr appended to err_path and, optionally, stdout
        # redirected to stdout_fpath; handles are closed when the call ends.
        if stdout_fpath is None:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd, stderr=err_file, logger=logger)
        else:
            with open(stdout_fpath, stdout_mode) as out_file, \
                    open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd,
                                       stdout=out_file,
                                       stderr=err_file,
                                       logger=logger)

    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    # samtools is given this prefix and the code then looks for prefix + '.bam'
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted')
    sorted_bam_fpath = ref_bamsorted_fpath + '.bam'
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(
        output_dirpath, ref + '.bed')

    # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
    sam_size = os.path.getsize(ref_sam_fpath)
    if sam_size < 1024 * 1024:
        logger.info('  SAM file is too small for Manta (%d Kb), skipping..' %
                    (sam_size // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info('  Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath

    samtools = samtools_fpath('samtools')
    if not os.path.exists(sorted_bam_fpath):
        _run([samtools, 'view', '-bS', ref_sam_fpath],
             stdout_fpath=ref_bam_fpath)
        _run([samtools, 'sort', ref_bam_fpath, ref_bamsorted_fpath])
    if not is_non_empty_file(sorted_bam_fpath + '.bai'):
        _run([samtools, 'index', sorted_bam_fpath])
    if not is_non_empty_file(cur_ref_fpath + '.fai'):
        _run([samtools, 'faidx', cur_ref_fpath])

    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath,
                                  'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # Start from a clean run directory before configuring Manta.
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # Both output streams of the configuration step go to the error log.
        _run([
            config_manta_fpath, '--normalBam', sorted_bam_fpath,
            '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath
        ],
             stdout_fpath=err_path,
             stdout_mode='a')
        workflow_fpath = os.path.join(vcfoutput_dirpath, 'runWorkflow.py')
        if not os.path.exists(workflow_fpath):
            return None
        _run([
            workflow_fpath, '-m', 'local', '-j',
            str(qconfig.max_threads)
        ])
    if not is_non_empty_file(unpacked_SV_fpath):
        # Argument list instead of a formatted string: keeps paths that
        # contain whitespace or quotes intact.
        _run(['gunzip', '-c', found_SV_fpath], stdout_fpath=unpacked_SV_fpath)

    from manta import vcfToBedpe
    # Close both files deterministically so the BED file is complete on disk
    # before its path is returned.
    with open(unpacked_SV_fpath) as vcf_file, \
            open(ref_bed_fpath, 'w') as bed_file:
        vcfToBedpe.vcfToBedpe(vcf_file, bed_file)
    return ref_bed_fpath
Esempio n. 6
0
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path, err_path):
    """Align reads to the reference and collect structural variations.

    Steps, each skipped when its output already exists:
      1. Align reads with Bowtie2 (`reads_fpaths[0]` is passed as -1 and
         `reads_fpaths[1]` as -2) into `<output_dirpath>/<ref_name>.sam`.
      2. Produce a sorted SAM file through samtools view/sort/view.
      3. If `meta_ref_fpaths` is given, split the sorted alignments into one
         SAM file per reference, using `ref_labels` to map sequence names to
         reference names.
      4. Scan the sorted SAM for "trivial deletions" (long zero-covered
         fragments) and save them to qconfig.trivial_deletions_fname.
      5. If Manta's configuration script exists, concatenate its result with
         the trivial deletions; otherwise copy the trivial deletions file.

    Args:
        main_ref_fpath: path to the (combined) reference FASTA.
        meta_ref_fpaths: paths of the individual references, or a falsy
            value when no splitting is wanted.
        ref_labels: mapping from sequence name to reference name.
        reads_fpaths: sequence with the forward and reverse reads files.
        output_dirpath: working directory for intermediate files.
        res_path: directory of the resulting BED file.
        log_path: file that receives the Bowtie2 tools' stdout (append mode).
        err_path: file that receives the tools' stderr (append mode).

    Returns:
        Path to `<res_path>/<ref_name>.bed`, or None when Bowtie2 produced
        no SAM file or no BED file exists at the end.
    """

    def _run(cmd, stdout_fpath=None, stdout_mode='w'):
        # Run cmd with stderr appended to err_path and, optionally, stdout
        # redirected to stdout_fpath; handles are closed when the call ends.
        if stdout_fpath is None:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd, stderr=err_file, logger=logger)
        else:
            with open(stdout_fpath, stdout_mode) as out_file, open(err_path, 'a') as err_file:
                qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file, logger=logger)

    ref_name = qutils.name_from_fpath(main_ref_fpath)
    sam_fpath = os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
    bam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted')
    sam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted.sam')
    bed_fpath = os.path.join(res_path, ref_name + '.bed')

    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    logger.info('  ' + 'Pre-processing for searching structural variations...')
    logger.info('  ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info('  Using existing SAM-file: ' + sam_fpath)
    else:
        logger.info('  Running Bowtie2...')
        # use absolute paths because we will change workdir
        abs_reads_fpaths = [os.path.abspath(fpath) for fpath in reads_fpaths]

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            _run([bin_fpath('bowtie2-build'), main_ref_fpath, ref_name], stdout_fpath=log_path, stdout_mode='a')
            # Argument list instead of a string passed through shlex.split():
            # paths containing whitespace or quotes stay intact.
            _run([bin_fpath('bowtie2'), '-x', ref_name, '-1', abs_reads_fpaths[0], '-2', abs_reads_fpaths[1],
                  '-S', sam_fpath, '--no-unal', '-a', '-p', str(qconfig.max_threads)],
                 stdout_fpath=log_path, stdout_mode='a')
            logger.info('  Done.')
        finally:
            # Restore the working directory even when a tool call raises.
            os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error('  Failed running Bowtie2 for the reference. See ' + log_path + ' for information.')
            logger.info('  Failed searching structural variations.')
            return None

    logger.info('  Sorting SAM-file...')
    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        samtools = samtools_fpath('samtools')
        threads = str(qconfig.max_threads)
        _run([samtools, 'view', '-@', threads, '-bS', sam_fpath], stdout_fpath=bam_fpath)
        _run([samtools, 'sort', '-@', threads, bam_fpath, bam_sorted_fpath])
        _run([samtools, 'view', '-@', threads, bam_sorted_fpath + '.bam'], stdout_fpath=sam_sorted_fpath)

    if meta_ref_fpaths:
        logger.info('  Splitting SAM-file by references...')
    # Collect the header lines and the sequence lengths from the unsorted SAM.
    headers = []
    seq_name_length = {}
    with open(sam_fpath) as sam_file:
        for line in sam_file:
            if not line.startswith('@'):
                break
            if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                seq_name = line.split('\tSN:')[1].split('\t')[0]
                seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                seq_name_length[seq_name] = seq_length
            headers.append(line.strip())

    need_ref_splitting = False
    # reference name -> open per-reference SAM file, or None when an existing
    # split file is reused
    ref_files = {}
    deletions = []
    trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
    need_trivial_deletions = True
    try:
        if meta_ref_fpaths:
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    # Register the handle right away so the `finally` below
                    # closes it even if writing the header fails.
                    ref_files[ref] = new_ref_sam_file
                    # First header line, then this reference's @SQ lines, then
                    # the last header line.
                    new_ref_sam_file.write(headers[0] + '\n')
                    for h in headers:
                        if not (h.startswith('@SQ') and 'SN:' in h):
                            continue
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                    new_ref_sam_file.write(headers[-1] + '\n')
                    need_ref_splitting = True

        logger.info('  Looking for trivial deletions (long zero-covered fragments)...')
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info('    Using existing file: ' + trivial_deletions_fpath)

        if need_trivial_deletions or need_ref_splitting:
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    # lines for which Mapping.parse() gives a falsy result
                    # are ignored
                    if not mapping:
                        continue
                    # common case: continue current deletion (potential) on the same reference
                    if cur_deletion and cur_deletion.ref == mapping.ref:
                        if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                            # just passed 0-covered fragment
                            if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                cur_deletion.set_next_bad(mapping)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            # continue region BEFORE 0-covered fragment
                            elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_prev_good(mapping)
                            else:
                                cur_deletion.set_prev_bad(mapping)
                        else:  # previous mapping was in region AFTER 0-covered fragment
                            # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                            if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                if cur_deletion.is_valid():   # add previous fragment's deletion if needed
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                            # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                            if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_next_good(mapping)
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            else:
                                cur_deletion.set_next_bad_end(mapping)
                    # special case: just started or just switched to the next reference
                    else:
                        if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                            cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                            if cur_deletion.is_valid():
                                deletions.append(cur_deletion)
                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                    if need_ref_splitting:
                        cur_ref = ref_labels[mapping.ref]
                        # keep the record only if its mate maps to the same reference
                        if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                            if ref_files[cur_ref] is not None:
                                ref_files[cur_ref].write(line)
                # end of file: close the deletion that is still open at the
                # end of the last sequence
                if cur_deletion and cur_deletion.ref in seq_name_length:
                    cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
    finally:
        # Close the per-reference SAM files on success and on failure alike.
        for ref_handler in ref_files.values():
            if ref_handler is not None:
                ref_handler.close()

    if need_trivial_deletions:
        logger.info('  Trivial deletions: %d found' % len(deletions))
        logger.info('    Saving to: ' + trivial_deletions_fpath)
        with open(trivial_deletions_fpath, 'w') as f:
            for deletion in deletions:
                f.write(str(deletion) + '\n')

    if os.path.exists(config_manta_fpath):
        manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
        qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
    elif os.path.exists(trivial_deletions_fpath):
        shutil.copy(trivial_deletions_fpath, bed_fpath)

    if os.path.exists(bed_fpath):
        logger.main_info('  Structural variations saved to ' + bed_fpath)
        return bed_fpath
    else:
        logger.main_info('  Failed searching structural variations.')
        return None
Esempio n. 7
0
def _run_logged(cmd, err_path, stdout_fpath=None, stdout_mode='w'):
    """Run `cmd` via qutils.call_subprocess with stderr appended to `err_path`.

    When `stdout_fpath` is given, stdout is redirected to that file, opened
    with `stdout_mode`. The redirect files are closed as soon as the call
    returns, so no file descriptors are leaked.

    Returns whatever qutils.call_subprocess returns.
    """
    if stdout_fpath is None:
        with open(err_path, 'a') as stderr_file:
            return qutils.call_subprocess(cmd,
                                          stderr=stderr_file,
                                          logger=logger)
    with open(stdout_fpath, stdout_mode) as stdout_file:
        with open(err_path, 'a') as stderr_file:
            return qutils.call_subprocess(cmd,
                                          stdout=stdout_file,
                                          stderr=stderr_file,
                                          logger=logger)


def run_processing_reads(main_ref_fpath,
                         meta_ref_fpaths,
                         ref_labels,
                         reads_fpaths,
                         output_dirpath,
                         res_path,
                         log_path,
                         err_path,
                         sam_fpath=None,
                         bam_fpath=None,
                         bed_fpath=None):
    """Produce the BED (structural variations) and coverage files for a reference.

    Existing non-empty result files are reused. Otherwise the function
      1. obtains a SAM file: an existing one, one converted from an existing
         BAM file, or a new one produced by aligning the two reads files
         with BWA;
      2. cleans read names, converts to BAM and sorts with sambamba;
      3. computes coverage files via get_coverage();
      4. unless qconfig.no_sv is set, scans the sorted SAM file for trivial
         deletions, optionally splits it per reference (when
         `meta_ref_fpaths` is given), runs Manta when its config file
         exists, and writes the result to the BED file.

    Args:
        main_ref_fpath: path to the reference FASTA file.
        meta_ref_fpaths: paths of the separate references the SAM file should
            be split into; falsy if no splitting is needed.
        ref_labels: mapping from sequence name to the name of the reference
            it belongs to (used for splitting).
        reads_fpaths: paths of the reads files; exactly two are required
            when BWA has to be run.
        output_dirpath: directory for intermediate files.
        res_path: directory for the resulting .bed/.cov files.
        log_path: file that receives stdout of `bwa index`.
        err_path: file that receives stderr of all external tools.
        sam_fpath: optional path of an existing SAM file.
        bam_fpath: optional path of an existing BAM file.
        bed_fpath: optional path of the BED file to use/produce.

    Returns:
        A tuple (bed_fpath, cov_fpath, physical_cov_fpath). bed_fpath is None
        when SV search is disabled or produced no non-empty file; cov_fpath
        is None when the coverage file is missing or empty. (None, None, None)
        is returned when the reads cannot be aligned.
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)

    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath,
                                              ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath,
                                            sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath,
                                      add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath,
                                      add_suffix(bam_fpath, 'sorted'))

    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')

    # Report which results already exist; return early if nothing is missing.
    if qconfig.no_sv:
        logger.info(
            '  Will not search Structural Variations (--fast or --no-sv is specified)'
        )
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
    if is_non_empty_file(cov_fpath):
        is_correct_file = check_cov_file(cov_fpath)
        if is_correct_file:
            logger.info('  Using existing reads coverage file: ' + cov_fpath)
    if is_non_empty_file(physical_cov_fpath):
        logger.info('  Using existing physical coverage file: ' +
                    physical_cov_fpath)
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and is_non_empty_file(
            cov_fpath) and is_non_empty_file(physical_cov_fpath):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info('  Pre-processing reads...')
    logger.info('  Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info('  Using existing SAM-file: ' + sam_fpath)
    elif is_non_empty_file(bam_fpath):
        logger.info('  Using existing BAM-file: ' + bam_fpath)
        _run_logged([
            sambamba_fpath('sambamba'), 'view', '-t',
            str(qconfig.max_threads), '-h', bam_fpath
        ], err_path, stdout_fpath=sam_fpath)
    else:
        logger.info('  Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_ref_fpath = os.path.abspath(main_ref_fpath)
        abs_reads_fpaths = [
            os.path.abspath(reads_fpath) for reads_fpath in reads_fpaths
        ]

        if len(abs_reads_fpaths) != 2:
            logger.error(
                '  You should specify files with forward and reverse reads.')
            logger.info('  Failed searching structural variations.')
            return None, None, None

        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.info(
                    '  Read names are discordant, skipping reads analysis!')
                return None, None, None

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            index_cmd = [
                bwa_fpath('bwa'), 'index', '-p', ref_name, abs_ref_fpath
            ]
            # if reference size bigger than 2GB
            if os.path.getsize(abs_ref_fpath) > 2 * 1024**3:
                index_cmd += ['-a', 'bwtsw']
            _run_logged(index_cmd,
                        err_path,
                        stdout_fpath=log_path,
                        stdout_mode='a')

            # Pass arguments as a list so paths containing spaces or quotes
            # reach bwa intact.
            mem_cmd = [
                bwa_fpath('bwa'), 'mem', '-t',
                str(qconfig.max_threads), ref_name, abs_reads_fpaths[0],
                abs_reads_fpaths[1]
            ]
            _run_logged(mem_cmd, err_path, stdout_fpath=sam_fpath)
            logger.info('  Done.')
        finally:
            # Restore the working directory even if an external call raised.
            os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error('  Failed running BWA for the reference. See ' +
                         log_path + ' for information.')
            logger.info('  Failed searching structural variations.')
            return None, None, None

    logger.info('  Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath)
            and all_read_names_correct(sam_sorted_fpath)
        ) and is_non_empty_file(bam_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        correct_sam_fpath = os.path.join(output_dirpath, ref_name +
                                         '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        # From here on the BAM files live in the output dir, regardless of
        # any user-supplied bam_fpath.
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        _run_logged([
            sambamba_fpath('sambamba'), 'view', '-t',
            str(qconfig.max_threads), '-h', '-f', 'bam', '-S',
            correct_sam_fpath
        ], err_path, stdout_fpath=bam_fpath)
        _run_logged([
            sambamba_fpath('sambamba'), 'sort', '-t',
            str(qconfig.max_threads), '-o', bam_sorted_fpath, bam_fpath
        ], err_path)
        _run_logged([
            sambamba_fpath('sambamba'), 'view', '-t',
            str(qconfig.max_threads), '-h', bam_sorted_fpath
        ], err_path, stdout_fpath=sam_sorted_fpath)

    if not is_non_empty_file(cov_fpath) or not is_non_empty_file(
            physical_cov_fpath):
        cov_fpath, physical_cov_fpath = get_coverage(
            output_dirpath, main_ref_fpath, ref_name, bam_fpath,
            bam_sorted_fpath, log_path, err_path, cov_fpath,
            physical_cov_fpath)

    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')

        # Collect the SAM header lines and the length of every sequence.
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())

        # Open one SAM file per reference that still has to be split out;
        # references with an existing split file are mapped to None.
        need_ref_splitting = False
        if meta_ref_fpaths:
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(
                        '    Using existing split SAM-file for %s: %s' %
                        (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    # NOTE(review): the first and last header lines are
                    # copied verbatim into every split file (presumably the
                    # @HD and @PG lines) -- confirm for the SAM used here.
                    new_ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers
                              if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[
                                seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True

        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath,
                                               qconfig.trivial_deletions_fname)
        logger.info(
            '  Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info('    Using existing file: ' + trivial_deletions_fpath)

        if need_trivial_deletions or need_ref_splitting:
            try:
                with open(sam_sorted_fpath) as sam_file:
                    cur_deletion = None
                    for line in sam_file:
                        mapping = Mapping.parse(line)
                        if not mapping:
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(
                                            mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_bad(
                                            position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(
                                    position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(
                                mapping.ref).set_prev_good(mapping)

                        if need_ref_splitting:
                            cur_ref = ref_labels[mapping.ref]
                            # keep the read only if its mate maps to the same reference
                            if mapping.ref_next.strip(
                            ) == '=' or cur_ref == ref_labels[
                                    mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                    # close the deletion that was still open at the end of the file
                    if cur_deletion and cur_deletion.ref in seq_name_length:
                        cur_deletion.set_next_good(
                            position=seq_name_length[cur_deletion.ref])
                        if cur_deletion.is_valid():
                            deletions.append(cur_deletion)
            finally:
                # Close the split SAM files even if scanning failed midway.
                if need_ref_splitting:
                    for ref_handler in ref_files.values():
                        if ref_handler is not None:
                            ref_handler.close()
            if need_trivial_deletions:
                logger.info('  Trivial deletions: %d found' % len(deletions))
                logger.info('    Saving to: ' + trivial_deletions_fpath)
                with open(trivial_deletions_fpath, 'w') as f:
                    for deletion in deletions:
                        f.write(str(deletion) + '\n')

        if isfile(config_manta_fpath):
            # Manta is best-effort: a failure must not abort the analysis, but
            # it should be visible and must not swallow KeyboardInterrupt.
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath,
                                                      meta_ref_fpaths,
                                                      output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath],
                                 bed_fpath)
            except Exception:
                logger.info('  Failed running Manta, see ' + err_path +
                            ' for information.')
        # Fall back to the trivial deletions when no BED file was produced.
        if os.path.exists(
                trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(
            '  Coverage distribution along the reference genome is in ' +
            cov_fpath)
    else:
        logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath