Example #1
0
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath,
                 bam_sorted_fpath, log_path, err_path, cov_fpath,
                 physical_cov_fpath, correct_chr_names):
    """Produce the read-coverage and physical-coverage files for a reference.

    Every stage is skipped when its output already exists as a non-empty
    file, so the function can be re-run on a partially filled directory.

    :param output_dirpath: directory passed on to get_physical_coverage
    :param ref_fpath: reference path, used to obtain the chromosome-length file
    :param ref_name: reference name passed on to get_physical_coverage
    :param bam_fpath: input BAM file
    :param bam_sorted_fpath: sorted BAM; created with ``sambamba sort`` when
        it is missing or empty
    :param log_path: file receiving stdout of ``sambamba sort`` (append mode)
    :param err_path: file receiving stderr of the external tools (append mode)
    :param cov_fpath: processed read-coverage file; the raw bedtools output is
        written next to it with a ``_raw`` suffix
    :param physical_cov_fpath: processed physical-coverage file
    :param correct_chr_names: passed through to get_chr_len_fpath and
        proceed_cov_file
    :return: the tuple ``(cov_fpath, physical_cov_fpath)``
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info('  Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                sort_cmd = [
                    sambamba_fpath('sambamba'), 'sort', '-t',
                    str(qconfig.max_threads), '-o', bam_sorted_fpath, bam_fpath
                ]
                # Context managers guarantee the log handles are closed.
                with open(log_path, 'a') as log_f:
                    with open(err_path, 'a') as err_f:
                        qutils.call_subprocess(sort_cmd, stdout=log_f,
                                               stderr=err_f, logger=logger)
            genomecov_cmd = [
                bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam',
                bam_sorted_fpath, '-g', chr_len_fpath
            ]
            with open(raw_cov_fpath, 'w') as raw_cov_f:
                with open(err_path, 'a') as err_f:
                    qutils.call_subprocess(genomecov_cmd, stdout=raw_cov_f,
                                           stderr=err_f, logger=logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    if not is_non_empty_file(physical_cov_fpath):
        raw_physical_cov_fpath = get_physical_coverage(
            output_dirpath, ref_fpath, ref_name, bam_fpath, log_path,
            err_path, physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage returns None when it cannot run (bamToBed is
        # missing); there is no raw file to post-process in that case.
        if raw_physical_cov_fpath is not None:
            proceed_cov_file(raw_physical_cov_fpath, physical_cov_fpath,
                             correct_chr_names)
    return cov_fpath, physical_cov_fpath
Example #2
0
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath,
                   blast_res_fpath, blast_check_fpath, blast_threads):
    """Run ``blastn`` for one assembly and record a checksum file for it.

    Compressed inputs (recognised by file extension) are first unpacked into
    *corrected_dirpath*; the unpacked copy is then used as the BLAST query.

    :param contigs_fpath: assembly file, possibly compressed
    :param label: assembly label used in log messages and output file names
    :param corrected_dirpath: directory receiving the unpacked copy
    :param err_fpath: file receiving blastn stderr (append mode)
    :param blast_res_fpath: base path handed to get_blast_output_fpath for the
        results file
    :param blast_check_fpath: base path handed to get_blast_output_fpath for
        the checksum file
    :param blast_threads: value passed to blastn ``-num_threads``
    """
    logger.info('  ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ('.gz', '.gzip', '.bz2', '.bzip2', '.zip')
    if contigs_fpath.endswith(compress_ext):
        logger.info('  ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(
            corrected_dirpath,
            os.path.basename(contigs_fpath) + '.unpacked')
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                for line in f_in:
                    f_out.write(line)
        blast_query_fpath = unpacked_fpath
    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    # Build the argument list directly: formatting a string and re-splitting
    # it with shlex breaks on paths that contain spaces or quotes.
    cmd = [
        get_blast_fpath('blastn'), '-query', blast_query_fpath,
        '-db', db_fpath, '-outfmt', '7', '-num_threads', str(blast_threads)
    ]
    with open(res_fpath, 'w') as res_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=res_f, stderr=err_f,
                                   logger=logger)
    logger.info('  ' + 'BLAST results for %s are saved to %s...' %
                (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s md5 checksum: %s\n' %
                         (contigs_fpath, md5(contigs_fpath)))
Example #3
0
def merge_bed(repeats_fpath, uncovered_fpath, insert_size, output_dirpath,
              err_path):
    """Combine repeat and uncovered regions into one sorted, merged BED file.

    Repeats shorter than *insert_size* are dropped; every line of the
    uncovered-regions file is kept. Either input may be absent.

    :param repeats_fpath: tab-separated BED file of repeats
    :param uncovered_fpath: BED file of uncovered regions
    :param insert_size: minimal repeat length (end - start) to keep
    :param output_dirpath: directory receiving ``skipped_regions.bed`` and
        its sorted/merged derivatives
    :param err_path: file receiving stderr of ``sort`` and bedtools (append)
    :return: path to the merged BED file
    """
    combined_bed_fpath = join(output_dirpath, 'skipped_regions.bed')
    with open(combined_bed_fpath, 'w') as out:
        if exists(repeats_fpath):
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    fields = line.split('\t')
                    repeat_len = int(fields[2]) - int(fields[1])
                    if repeat_len >= insert_size:
                        out.write(line)
        if exists(uncovered_fpath):
            with open(uncovered_fpath) as in_f:
                for line in in_f:
                    out.write(line)

    # One stderr handle serves both tools and is closed deterministically.
    with open(err_path, 'a') as err_f:
        sorted_bed_fpath = add_suffix(combined_bed_fpath, 'sorted')
        with open(sorted_bed_fpath, 'w') as sorted_f:
            qutils.call_subprocess(
                ['sort', '-k1,1', '-k2,2n', combined_bed_fpath],
                stdout=sorted_f, stderr=err_f, logger=logger)
        merged_bed_fpath = add_suffix(combined_bed_fpath, 'merged')
        with open(merged_bed_fpath, 'w') as merged_f:
            qutils.call_subprocess(
                [bedtools_fpath('bedtools'), 'merge', '-i', sorted_bed_fpath],
                stdout=merged_f, stderr=err_f, logger=logger)
    return merged_bed_fpath
Example #4
0
def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath, chr_len_fpath):
    """Compute the raw physical-coverage file from a BAM file.

    Pipeline: filter read pairs with sambamba, sort them by read name,
    convert to BEDPE with bamToBed, collapse each pair into one interval,
    sort the intervals and run ``bedtools genomecov``. Intermediate files are
    written to *output_dirpath* with names derived from *ref_name*.

    :param log_path: file receiving stdout of ``sambamba sort`` (append mode)
    :param err_path: file receiving stderr of every tool (append mode)
    :param cov_fpath: final coverage path; the raw result is ``cov_fpath + '_raw'``
    :param chr_len_fpath: genome file passed to ``bedtools genomecov -g``
    :return: path to the raw coverage file, or None if bamToBed is not found.
        ``ref_fpath`` is accepted but not used here.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info('  Failed calculating physical coverage...')
        return None
    raw_cov_fpath = cov_fpath + '_raw'
    if is_non_empty_file(raw_cov_fpath):
        # Result of a previous run is reused.
        return raw_cov_fpath

    logger.info('  Calculating physical coverage...')
    bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
    bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
    bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
    raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
    sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')

    # A single stderr handle is shared by all steps; every handle is closed
    # by its context manager instead of being left to the garbage collector.
    with open(err_path, 'a') as err_f:
        ## keep properly mapped, unique, and non-duplicate read pairs only
        with open(bam_filtered_fpath, 'w') as out_f:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                    '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                                   stdout=out_f, stderr=err_f, logger=logger)
        ## sort by read names
        with open(log_path, 'a') as log_f:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_filtered_sorted_fpath,
                                    '-n', bam_filtered_fpath], stdout=log_f, stderr=err_f, logger=logger)
        with open(bedpe_fpath, 'w') as out_f:
            qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                                   stdout=out_f, stderr=err_f, logger=logger)
        # Collapse each BEDPE record to (column 0, column 1, column 5).
        with open(bedpe_fpath, 'r') as bedpe:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe:
                    fs = line.split()
                    bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))
        with open(sorted_bed_fpath, 'w') as out_f:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                                   stdout=out_f, stderr=err_f, logger=logger)
        with open(raw_cov_fpath, 'w') as out_f:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath, '-g', chr_len_fpath],
                                   stdout=out_f, stderr=err_f, logger=logger)
    return raw_cov_fpath
Example #5
0
def draw_mummer_plot(logger, nucmer_fpath, delta_fpath, index, log_out_f, log_err_f):
    """Render a MUMmer dot plot as a self-contained HTML file.

    Runs ``mummerplot`` to produce a gnuplot script, runs gnuplot on it and
    post-processes the resulting HTML with _embed_css_and_scripts. The final
    file is stored two directory levels above *nucmer_fpath*. A notice is
    logged when the plot could not be produced.

    :param logger: logger used for the info/notice messages
    :param nucmer_fpath: prefix of the nucmer output, also used as plot prefix
    :param delta_fpath: delta file passed to mummerplot
    :param index: assembly index, used only for log indentation
    :param log_out_f: open file receiving mummerplot stdout
    :param log_err_f: open file receiving stderr of both tools
    """
    output_dirpath = dirname(dirname(nucmer_fpath))
    mummer_plot_fpath = join(output_dirpath, basename(nucmer_fpath) + '_mummerplot.html')
    return_code = qutils.call_subprocess(
        [bin_fpath('mummerplot'), '--html', '--layout', '-p', nucmer_fpath, delta_fpath],
        stdout=log_out_f,
        stderr=log_err_f,
        indent='  ' + qutils.index_to_str(index))
    if return_code == 0:
        plot_script_fpath = nucmer_fpath + '.gp'
        temp_plot_fpath = nucmer_fpath + '.html'
        if isfile(plot_script_fpath) and isfile(gnuplot_exec_fpath()):
            # gnuplot stdout is discarded; the handle is closed right after.
            with open('/dev/null', 'w') as devnull:
                qutils.call_subprocess(
                    [gnuplot_exec_fpath(), plot_script_fpath],
                    stdout=devnull, stderr=log_err_f,
                    indent='  ' + qutils.index_to_str(index))
            if isfile(temp_plot_fpath):
                with open(temp_plot_fpath) as template_file:
                    html = template_file.read()
                html = _embed_css_and_scripts(html)
                with open(mummer_plot_fpath, 'w') as f_html:
                    f_html.write(html)
                logger.info('  ' + qutils.index_to_str(index) + 'MUMmer plot saved to ' + mummer_plot_fpath)

    if not isfile(mummer_plot_fpath):
        logger.notice(qutils.index_to_str(index) + ' MUMmer plot cannot be created.\n')
Example #6
0
def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath, chr_len_fpath):
    """Build the raw physical-coverage track for *bam_fpath*.

    Steps: sambamba filter -> sambamba name sort -> bamToBed (BEDPE) ->
    one interval per pair -> bedtools sort -> bedtools genomecov. All
    intermediate files live in *output_dirpath* and are named after
    *ref_name*.

    :param log_path: receives stdout of ``sambamba sort`` (append mode)
    :param err_path: receives stderr of all external tools (append mode)
    :param cov_fpath: final coverage path; ``'_raw'`` is appended for the
        file produced here
    :param chr_len_fpath: genome file for ``bedtools genomecov -g``
    :return: raw coverage file path, or None when bamToBed is unavailable.
        ``ref_fpath`` is part of the signature but not read.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info('  Failed calculating physical coverage...')
        return None
    raw_cov_fpath = cov_fpath + '_raw'
    if not is_non_empty_file(raw_cov_fpath):
        logger.info('  Calculating physical coverage...')
        # The stderr log is opened once and closed when the pipeline is done;
        # each stdout target is closed as soon as its tool has finished.
        with open(err_path, 'a') as err_f:
            ## keep properly mapped, unique, and non-duplicate read pairs only
            bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
            with open(bam_filtered_fpath, 'w') as filtered_f:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                        '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                                       stdout=filtered_f, stderr=err_f, logger=logger)
            ## sort by read names
            bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
            with open(log_path, 'a') as log_f:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_filtered_sorted_fpath,
                                        '-n', bam_filtered_fpath], stdout=log_f, stderr=err_f, logger=logger)
            bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
            with open(bedpe_fpath, 'w') as bedpe_out:
                qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                                       stdout=bedpe_out, stderr=err_f, logger=logger)
            raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
            # Keep columns 0, 1 and 5 of every BEDPE record as one interval.
            with open(bedpe_fpath, 'r') as bedpe:
                with open(raw_bed_fpath, 'w') as bed_file:
                    for line in bedpe:
                        fs = line.split()
                        bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))
            sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')
            with open(sorted_bed_fpath, 'w') as sorted_f:
                qutils.call_subprocess([bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                                       stdout=sorted_f, stderr=err_f, logger=logger)
            with open(raw_cov_fpath, 'w') as raw_cov_f:
                qutils.call_subprocess([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath, '-g', chr_len_fpath],
                                       stdout=raw_cov_f, stderr=err_f, logger=logger)
    return raw_cov_fpath
Example #7
0
def draw_mummer_plot(logger, nucmer_fpath, delta_fpath, index, log_out_f, log_err_f):
    """Create ``<basename>_mummerplot.html`` from nucmer results.

    ``mummerplot`` generates a gnuplot script next to *nucmer_fpath*; gnuplot
    turns it into HTML, which is passed through _embed_css_and_scripts and
    saved two directories above *nucmer_fpath*. If no plot file exists at the
    end, a notice is logged.

    :param logger: logger for progress and failure messages
    :param nucmer_fpath: nucmer output prefix (also the mummerplot prefix)
    :param delta_fpath: delta file given to mummerplot
    :param index: assembly index used to indent log messages
    :param log_out_f: open file for mummerplot stdout
    :param log_err_f: open file for stderr of mummerplot and gnuplot
    """
    output_dirpath = dirname(dirname(nucmer_fpath))
    mummer_plot_fpath = join(output_dirpath, basename(nucmer_fpath) + '_mummerplot.html')
    indent = '  ' + qutils.index_to_str(index)
    return_code = qutils.call_subprocess(
        [bin_fpath('mummerplot'), '--html', '--layout', '-p', nucmer_fpath, delta_fpath],
        stdout=log_out_f,
        stderr=log_err_f,
        indent=indent)
    if return_code == 0:
        plot_script_fpath = nucmer_fpath + '.gp'
        temp_plot_fpath = nucmer_fpath + '.html'
        if isfile(plot_script_fpath) and isfile(gnuplot_exec_fpath()):
            # Close the null-device handle as soon as gnuplot has finished.
            with open('/dev/null', 'w') as null_f:
                qutils.call_subprocess(
                    [gnuplot_exec_fpath(), plot_script_fpath],
                    stdout=null_f, stderr=log_err_f,
                    indent='  ' + qutils.index_to_str(index))
            if isfile(temp_plot_fpath):
                with open(temp_plot_fpath) as template_file:
                    html = _embed_css_and_scripts(template_file.read())
                with open(mummer_plot_fpath, 'w') as f_html:
                    f_html.write(html)
                logger.info('  ' + qutils.index_to_str(index) + 'MUMmer plot saved to ' + mummer_plot_fpath)

    if not isfile(mummer_plot_fpath):
        logger.notice(qutils.index_to_str(index) + ' MUMmer plot cannot be created.\n')
Example #8
0
def bwa_index(ref_fpath, err_path, logger):
    """Build a BWA index for *ref_fpath* unless ``<ref_fpath>.bwt`` exists.

    :param ref_fpath: reference FASTA; also used as the index prefix
    :param err_path: file receiving both stdout and stderr of bwa (append)
    :param logger: logger handed to qutils.call_subprocess
    """
    cmd = [bwa_fpath('bwa'), 'index', '-p', ref_fpath]
    if getsize(ref_fpath) > 2 * 1024**3:  # if reference size bigger than 2GB
        # Options go before the input file: option parsers that do not
        # permute arguments stop at the first positional one.
        cmd += ['-a', 'bwtsw']
    cmd.append(ref_fpath)
    if not is_non_empty_file(ref_fpath + '.bwt'):
        # One shared handle for both streams, closed when bwa has finished.
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=err_f, stderr=err_f,
                                   logger=logger)
Example #9
0
def sort_bam(bam_fpath, sorted_bam_fpath, err_path, logger, threads=None, sort_rule=None):
    """Sort a BAM file with ``sambamba sort``.

    :param bam_fpath: input BAM
    :param sorted_bam_fpath: output BAM; its directory is used as sambamba's
        temporary directory
    :param err_path: file receiving sambamba stderr (append mode)
    :param logger: logger handed to qutils.call_subprocess
    :param threads: thread count; falls back to ``qconfig.max_threads`` when
        not given (or zero)
    :param sort_rule: optional extra command-line flag appended to the
        command (callers in this file pass ``'-n'``)
    """
    if not threads:
        threads = qconfig.max_threads
    # A quarter of get_total_memory(), clamped to the 2..100 range, in GB.
    mem = '%dGB' % min(100, max(2, get_total_memory() // 4))
    cmd = [sambamba_fpath('sambamba'), 'sort', '-t', str(threads), '--tmpdir', dirname(sorted_bam_fpath), '-m', mem,
           '-o', sorted_bam_fpath, bam_fpath]
    if sort_rule:
        cmd += [sort_rule]
    with open(err_path, 'a') as err_f:
        qutils.call_subprocess(cmd, stderr=err_f, logger=logger)
def check_repeats_instances(coords_fpath, repeats_fpath, use_long_reads=False):
    """Keep only repeats that are confirmed by additional alignments.

    *coords_fpath* is read as a tab-separated alignment table in which the
    query name (column 0) is matched against ``chrom:start-end`` identifiers
    built from the lines of *repeats_fpath*. A repeat is kept when its extra
    alignments cover at least 90% of its length. Kept repeats are written to
    a ``filtered`` file, which is then sorted into a ``sorted`` file.

    :param coords_fpath: alignment table (tab-separated, at least 11 columns)
    :param repeats_fpath: BED-like file of candidate repeats
    :param use_long_reads: when true and several distinct alignments exist,
        write the grouped alignment intervals (shifted by the repeat start)
        instead of the original repeat line
    :return: ``(sorted_repeats_fpath, repeats_regions)`` where the second item
        maps chromosome name -> list of ``(start, end)`` tuples
    """
    query_instances = defaultdict(list)
    with open(coords_fpath) as f:
        for line in f:
            fs = line.split('\t')
            contig, align_start, align_end, strand, ref_name, ref_start = \
                fs[0], fs[2], fs[3], fs[4], fs[5], fs[7]
            align_start, align_end, ref_start = map(int, (align_start, align_end, ref_start))
            # Start coordinates are shifted by one (presumably 0-based input
            # converted to 1-based; verify against the producer of the file).
            align_start += 1
            ref_start += 1
            matched_bases, bases_in_mapping = map(int, (fs[9], fs[10]))
            # Only sufficiently long matches count as repeat instances.
            if matched_bases > qconfig.optimal_assembly_insert_size:
                query_instances[contig].append((align_start, align_end))
    repeats_regions = defaultdict(list)
    filtered_repeats_fpath = add_suffix(repeats_fpath, 'filtered')
    with open(filtered_repeats_fpath, 'w') as out_f:
        with open(repeats_fpath) as f:
            for line in f:
                fs = line.split()
                query_id = '%s:%s-%s' % (fs[0], fs[1], fs[2])
                if query_id in query_instances and len(query_instances[query_id]) > 1:
                    # The first recorded instance is skipped; the remaining
                    # ones are de-duplicated and sorted by position.
                    mapped_repeats = sorted(list(set(query_instances[query_id][1:])))
                    # Merge overlapping intervals to count covered bases once.
                    merged_intervals = []
                    i_start, i_end = mapped_repeats[0]
                    merged_interval = (i_start, i_end)
                    for s, e in mapped_repeats[1:]:
                        if s <= merged_interval[1]:
                            merged_interval = (merged_interval[0], max(merged_interval[1], e))
                        else:
                            merged_intervals.append(merged_interval)
                            merged_interval = (s, e)
                    merged_intervals.append(merged_interval)
                    aligned_bases = sum([end - start + 1 for start, end in merged_intervals])
                    if aligned_bases >= (int(fs[2]) - int(fs[1])) * 0.9:
                        if use_long_reads and len(mapped_repeats) > 1:
                            solid_repeats = []
                            full_repeat_pos = int(fs[1])
                            # Walk intervals from the largest end coordinate
                            # down, growing the current group while one
                            # interval contains the other within the
                            # REPEAT_CONF_INTERVAL tolerance.
                            mapped_repeats.sort(key=lambda x: (x[1], x[1] - x[0]), reverse=True)
                            cur_repeat_start, cur_repeat_end = mapped_repeats[0]
                            for repeat_start, repeat_end in mapped_repeats[1:]:
                                if (cur_repeat_start >= repeat_start - REPEAT_CONF_INTERVAL and cur_repeat_end <= repeat_end + REPEAT_CONF_INTERVAL) or \
                                        (repeat_start >= cur_repeat_start - REPEAT_CONF_INTERVAL and repeat_end <= cur_repeat_end + REPEAT_CONF_INTERVAL):
                                    cur_repeat_start, cur_repeat_end = min(repeat_start, cur_repeat_start), max(repeat_end, cur_repeat_end)
                                else:
                                    solid_repeats.append((cur_repeat_start, cur_repeat_end))
                                    cur_repeat_start, cur_repeat_end = repeat_start, repeat_end
                            solid_repeats.append((cur_repeat_start, cur_repeat_end))
                            # Interval coordinates are offset by the repeat start.
                            for repeat in solid_repeats:
                                out_f.write('\t'.join((fs[0], str(repeat[0] + full_repeat_pos), str(repeat[1] + full_repeat_pos))) + '\n')
                                repeats_regions[fs[0]].append((repeat[0] + full_repeat_pos, repeat[1] + full_repeat_pos))
                        else:
                            out_f.write(line)
                            repeats_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    sorted_repeats_fpath = add_suffix(repeats_fpath, 'sorted')
    # The output handle is closed once ``sort`` has finished.
    with open(sorted_repeats_fpath, 'w') as sorted_f:
        qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', filtered_repeats_fpath],
                               stdout=sorted_f, logger=logger)
    return sorted_repeats_fpath, repeats_regions
Example #11
0
def sambamba_view(in_fpath, out_fpath, max_threads, err_fpath, logger, filter_rule=None):
    """Convert and/or filter an alignment file with ``sambamba view``.

    The header is always kept (``-h``). ``-S`` is added for ``.sam`` input
    and ``-f bam`` for ``.bam`` output, both decided by file extension.

    :param in_fpath: input SAM/BAM file
    :param out_fpath: output file, overwritten with sambamba's stdout
    :param max_threads: value for ``-t``
    :param err_fpath: file receiving sambamba stderr (append mode)
    :param logger: logger handed to qutils.call_subprocess
    :param filter_rule: optional expression passed with ``-F``
    """
    cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(max_threads), '-h']
    if in_fpath.endswith('.sam'):
        cmd += ['-S']
    if out_fpath.endswith('.bam'):
        cmd += ['-f', 'bam']
    if filter_rule:
        cmd += ['-F', filter_rule]
    cmd.append(in_fpath)
    # Both handles are closed when the tool returns.
    with open(out_fpath, 'w') as out_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=out_f, stderr=err_f, logger=logger)
Example #12
0
def run(contigs_fpath, gff_fpath, log_fpath, threads, kingdom):
    """Run barrnap on *contigs_fpath* and store its output in *gff_fpath*.

    Nothing is done when *gff_fpath* already exists and is non-empty.

    :param contigs_fpath: input sequences
    :param gff_fpath: output file receiving barrnap stdout
    :param log_fpath: file receiving barrnap stderr (append mode)
    :param threads: value for ``--threads``
    :param kingdom: value for ``-k``
    """
    barrnap_fpath = join(qconfig.LIBS_LOCATION, 'barrnap', 'bin', 'barrnap')
    if is_non_empty_file(gff_fpath):
        return
    cmd = [
        barrnap_fpath, '--quiet', '-k', kingdom, '--threads',
        str(threads), contigs_fpath
    ]
    # Context managers close both handles once barrnap has finished.
    with open(gff_fpath, 'w') as gff_f:
        with open(log_fpath, 'a') as log_f:
            call_subprocess(cmd, stdout=gff_f, stderr=log_f)
Example #13
0
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    """Merge per-chunk alignments into one BAM and export it as SAM.

    For every non-empty temporary SAM file the matching ``.bam`` file is
    sorted (unless a non-empty sorted file already exists); the sorted files
    are merged into *bam_fpath*, which is then written out as *sam_fpath*.

    :param tmp_sam_fpaths: temporary SAM files; empty or missing ones are skipped
    :param sam_fpath: resulting SAM file
    :param bam_fpath: resulting merged BAM file
    :param max_threads: thread count for sambamba
    :param err_fpath: file receiving sambamba stderr (append mode)
    :return: *sam_fpath*
    """
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if not is_non_empty_file(tmp_sam_fpath):
            continue
        tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
        tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
        if not is_non_empty_file(tmp_bam_sorted_fpath):
            sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
        tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    # The stderr handle is closed as soon as the merge has finished.
    with open(err_fpath, 'a') as err_f:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), bam_fpath] + tmp_bam_fpaths,
                               stderr=err_f, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath
Example #14
0
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath,
                                 err_path, reads_fpaths):
    """Map chromosome names from a SAM header to names in the reference.

    The SAM header is extracted with ``sambamba view -H`` and its ``@SQ``
    records are compared pairwise, in iteration order, with the chromosome
    lengths read from the reference FASTA.

    :param output_dirpath: directory receiving the ``<sam name>.header`` file
    :param ref_fpath: reference FASTA
    :param sam_fpath: SAM file whose header is inspected
    :param err_path: file receiving sambamba stderr (overwritten)
    :param reads_fpaths: if truthy, a mismatch is only a warning (reads can be
        realigned); otherwise it is logged as an error
    :return: dict ``SAM name -> reference name``, or None when the number,
        lengths or names of the chromosomes do not match
    """
    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = os.path.join(output_dirpath,
                                    os.path.basename(sam_fpath) + '.header')
    header_cmd = [sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath]
    # Handles are closed before the header file is read back.
    with open(sam_header_fpath, 'w') as header_f:
        with open(err_path, 'w') as err_f:
            qutils.call_subprocess(header_cmd, stdout=header_f, stderr=err_f,
                                   logger=logger)
    # Raw strings: '\S' and '\d' are invalid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for line in sam_in:
            if line.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, line)[0]
                chr_len = re.findall(chr_len_pattern, line)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(),
                                    sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]
            # The SAM name only has to match a prefix of the reference name.
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = ref_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(
                inconsistency + ' in reference and SAM file do not match. ' +
                'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(
                inconsistency + ' in reference and SAM file do not match. ' +
                'Use SAM file obtained by aligning reads to the reference genome.'
            )
        return None
    return correct_chr_names
Example #15
0
def align_kmers(output_dir, ref_fpath, kmers_fpath, log_err_fpath, max_threads):
    """Align k-mers to the reference with minimap2 and group hits by chromosome.

    The alignment is written to ``kmers.coords`` in *output_dir* and parsed
    as tab-separated text; lines with fewer than 10 fields are ignored.

    :param output_dir: directory receiving ``kmers.coords``
    :param ref_fpath: reference passed to minimap2
    :param kmers_fpath: k-mer sequences; their names must be integers because
        the first column is converted with ``int``
    :param log_err_fpath: file receiving minimap2 stderr (append mode)
    :param max_threads: value for ``-t``
    :return: ``(kmers_by_chrom, kmers_pos_by_chrom)``, two dicts mapping the
        third column (chromosome) to lists of k-mer ids and of positions
    """
    out_fpath = join(output_dir, 'kmers.coords')
    cmdline = [minimap_fpath(), '-ax', 'sr', '-s202', '--frag=no', '-t', str(max_threads), ref_fpath, kmers_fpath]
    # Handles are closed before the output file is reopened for reading.
    with open(out_fpath, 'w') as out_f:
        with open(log_err_fpath, 'a') as err_f:
            qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f, indent='  ')
    kmers_pos_by_chrom = defaultdict(list)
    kmers_by_chrom = defaultdict(list)
    with open(out_fpath) as f:
        for line in f:
            fs = line.split('\t')
            if len(fs) < 10:
                continue
            contig, chrom, pos = fs[0], fs[2], fs[3]
            kmers_pos_by_chrom[chrom].append(int(pos))
            kmers_by_chrom[chrom].append(int(contig))
    return kmers_by_chrom, kmers_pos_by_chrom
Example #16
0
def connect_with_matepairs(bam_fpath, output_dirpath, err_fpath):
    """Collect the genomic intervals spanned by properly paired reads.

    The BAM file is filtered with sambamba, sorted by read name, converted to
    BED via bam_to_bed, and the resulting intervals are grouped by the first
    BED column.

    :param bam_fpath: input BAM file
    :param output_dirpath: directory passed to bam_to_bed
    :param err_fpath: file receiving stderr of the external tools (append)
    :return: dict mapping the first BED column to a list of ``(start, end)``
        integer tuples
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    filter_cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                  '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath]
    # Handles are closed before the filtered file is consumed by the sort.
    with open(bam_filtered_fpath, 'w') as filtered_f:
        with open(err_fpath, 'a') as err_f:
            qutils.call_subprocess(filter_cmd, stdout=filtered_f, stderr=err_f, logger=logger)
    ## sort by read names
    bam_filtered_sorted_fpath = add_suffix(bam_filtered_fpath, 'sorted')
    sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, 'matepairs', bam_filtered_sorted_fpath, err_fpath, logger, bedpe=True, only_intervals=True)
    matepair_regions = defaultdict(list)
    with open(bed_fpath) as bed:
        for line in bed:
            fs = line.split()
            matepair_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    return matepair_regions
Example #17
0
def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath,
       features_containers, cov_fpath, output_dir, logger):
    """Create the Circos configuration and, if Circos is installed, the plot.

    The configuration is always generated. When the ``circos`` executable is
    not found, a warning explains how to produce the plot manually.

    :param output_dir: directory for the configuration, ``circos.log``,
        ``circos.err`` and the image; created if missing
    :param logger: logger used for warnings
    :return: ``(circos_png_fpath, circos_legend_fpath)`` on success,
        ``(None, None)`` when Circos is missing or did not produce a
        non-empty image. The remaining parameters are forwarded unchanged to
        create_conf.
    """
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths,
                                                  contig_report_fpath_pattern,
                                                  output_dir, gc_fpath,
                                                  features_containers,
                                                  cov_fpath, logger)
    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        logger.warning(
            'Circos is not installed!\n'
            'If you want to create Circos plots, install Circos as described at http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
            'and run the following command:\n\tcircos -conf ' + conf_fpath +
            '\n'
            'The plot legend is saved to ' + circos_legend_fpath + '\n')
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    # Log handles are closed as soon as Circos has finished.
    with open(log_fpath, 'w') as log_f:
        with open(err_fpath, 'w') as err_f:
            return_code = qutils.call_subprocess(cmdline, stdout=log_f,
                                                 stderr=err_f)
    if return_code == 0 and is_non_empty_file(circos_png_fpath):
        return circos_png_fpath, circos_legend_fpath

    logger.warning('  Circos diagram was not created. See ' + log_fpath +
                   ' and ' + err_fpath + ' for details')
    return None, None
Example #18
0
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and log its output.

    stdout and stderr of the tool go to ``gage_<label>.stdout`` and
    ``gage_<label>.stderr`` in *gage_results_dirpath*.

    :param i: assembly index, used for log indentation
    :param contigs_fpath: assembly to evaluate
    :param gage_results_dirpath: directory receiving the two log files
    :param gage_tool_path: shell script executed with ``sh``
    :param reference: reference path passed to the script
    :param tmp_dir: temporary directory passed to the script
    :return: exit code of the script
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + corr_assembly_label + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # Context managers close the logs even if call_subprocess raises.
    with open(log_out_fpath, 'w') as log_out_f:
        with open(log_err_fpath, 'w') as log_err_f:
            return_code = qutils.call_subprocess(
                ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
                 contigs_fpath, tmp_dir, str(qconfig.min_contig)],
                stdout=log_out_f,
                stderr=log_err_f,
                indent='  ' + qutils.index_to_str(i),
                only_if_debug=False)
            if return_code != 0:
                logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
            else:
                logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    return return_code
Example #19
0
def compile_gage(only_clean=False):
    """Compile (or clean) the Java classes required by GAGE.

    :param only_clean: when true, only remove the compiled ``.class`` files
        and return True without compiling
    :return: True on success; False if ``javac`` is not found, any
        compilation returns non-zero, or a required class file is missing
    """
    if only_clean:
        for required_name in required_java_fnames:
            fpath = os.path.join(gage_dirpath, required_name + '.class')
            if os.path.isfile(fpath):
                os.remove(fpath)
        return True

    javac_path = get_path_to_program('javac')
    if javac_path is None:
        logger.error('Java compiler not found (javac)! '
                     'Please install it or compile GAGE java classes manually (' + gage_dirpath + '/*.java)!')
        return False

    cur_dir = os.getcwd()
    os.chdir(gage_dirpath)
    try:
        # making
        logger.main_info('Compiling JAVA classes (details are in ' + os.path.join(gage_dirpath, 'make.log') + ' and make.err)')
        return_codes = []
        # The logs are opened once for all classes: reopening them with 'w'
        # for every class would discard the output of earlier compilations.
        with open(os.path.join(gage_dirpath, 'make.log'), 'w') as log_f:
            with open(os.path.join(gage_dirpath, 'make.err'), 'w') as err_f:
                for java_fname in required_java_fnames:
                    return_codes.append(qutils.call_subprocess(
                        ['javac', os.path.join(gage_dirpath, java_fname + '.java')],
                        stdout=log_f,
                        stderr=err_f))
    finally:
        # Restore the working directory even if compilation raised.
        os.chdir(cur_dir)

    if any(return_code != 0 for return_code in return_codes) or not all_required_java_classes_exist(gage_dirpath):
        logger.error('Error occurred during compilation of java classes (' + gage_dirpath + '/*.java)! '
                     'Try to compile it manually. ' + ('You can restart Quast with the --debug flag '
                     'to see the command line.' if not qconfig.debug else ''))
        return False
    return True
Example #20
0
def run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):  # run minimap2 for AGB
    """Align contigs to the reference with the minimap2 settings used for AGB.

    :param out_fpath: file receiving minimap2 stdout (overwritten)
    :param ref_fpath: reference sequences
    :param contigs_fpath: contigs to align
    :param log_err_fpath: file receiving minimap2 stderr (append mode)
    :param index: assembly index, used for log indentation
    :param max_threads: value for ``-t``
    :return: exit code of minimap2
    """
    mask_level = '1' if qconfig.min_IDY < 95 else '0.9'
    cmdline = [minimap_fpath(), '-cx', 'asm20', '--mask-level', mask_level, '-N', '100',
               '--score-N', '0', '-E', '1,0', '-f', '200', '--cs', '-t', str(max_threads), ref_fpath, contigs_fpath]
    # Both handles are closed when minimap2 returns.
    with open(out_fpath, 'w') as out_f:
        with open(log_err_fpath, 'a') as err_f:
            return_code = qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f,
                                                 indent='  ' + qutils.index_to_str(index))
    return return_code
Example #21
0
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath,
          num_threads):
    """Predict genes with GeneMark-ES and parse the resulting GTF files.

    :param tool_dirpath: directory containing ``gmes_petap.pl``
    :param fasta_fpath: input sequences
    :param err_fpath: file receiving both stdout and stderr (overwritten)
    :param index: assembly index, used for log indentation
    :param tmp_dirpath: prefix of the working directory; the FASTA name is
        appended to it by plain string concatenation
    :param num_threads: value for ``--cores``
    :return: list of parsed genes, or None if the tool returned non-zero
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    # The log handle is closed by the context manager on every path.
    with open(err_fpath, 'w') as err_file:
        tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
        if not os.path.isdir(tmp_dirpath):
            os.mkdir(tmp_dirpath)
        return_code = qutils.call_subprocess([
            'perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores',
            str(num_threads), '--sequence', fasta_fpath, '--out', tmp_dirpath
        ] + (['--fungus'] if qconfig.is_fungus else []),
                                             stdout=err_file,
                                             stderr=err_file,
                                             indent='    ' +
                                             qutils.index_to_str(index))
    if return_code != 0:
        return
    genes = []
    # NOTE(review): file names are collected from the whole tree but joined
    # with the top-level directory below, so a GTF file located in a
    # sub-directory resolves to the wrong path — confirm this is intended.
    fnames = [
        fname for (path, dirs, files) in os.walk(tmp_dirpath)
        for fname in files
    ]
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
Example #22
0
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index,
                max_threads):
    """Align contigs to the reference with minimap2.

    In AGV mode the call is delegated to run_minimap_agv. Otherwise the
    preset and tuning options are derived from ``qconfig``.

    :param out_fpath: file receiving minimap2 stdout (overwritten)
    :param ref_fpath: reference sequences
    :param contigs_fpath: contigs to align
    :param log_err_fpath: file receiving minimap2 stderr (append mode)
    :param index: assembly index, used for log indentation
    :param max_threads: value for ``-t``
    :return: exit code of minimap2
    """
    if qconfig.is_agv_mode:
        return run_minimap_agv(out_fpath, ref_fpath, contigs_fpath,
                               log_err_fpath, index, max_threads)

    preset = 'asm5' if qconfig.min_IDY >= 95 and not qconfig.is_combined_ref else 'asm10'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = [
        '-B5', '-O4,16', '--no-long-join', '-r',
        str(qconfig.MAX_INDEL_LENGTH), '-N', num_alignments, '-s',
        str(qconfig.min_alignment), '-z', '200'
    ]
    # The fine-tuning options are left out for large genomes.
    cmdline = [minimap_fpath(), '-c', '-x', preset] + (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500', '--score-N', '2', '--cs', '-t', str(max_threads), ref_fpath, contigs_fpath]
    # Both handles are closed when minimap2 returns.
    with open(out_fpath, 'w') as out_f:
        with open(log_err_fpath, 'a') as err_f:
            return_code = qutils.call_subprocess(cmdline,
                                                 stdout=out_f,
                                                 stderr=err_f,
                                                 indent='  ' +
                                                 qutils.index_to_str(index))

    return return_code
Example #23
0
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Predict genes with GeneMarkS self-training followed by gmhmmp.

    ``gmsn.pl`` is run in a fresh temporary directory to produce a heuristic
    model, which is then used by gmhmm_p on the whole FASTA file.

    :param tool_dirpath: directory containing ``gmsn.pl`` and ``gmhmmp``
    :param fasta_fpath: input sequences
    :param err_fpath: log file; overwritten by the first step, appended to by
        the second
    :param index: assembly index, used for log indentation
    :param tmp_dirpath: parent directory for the temporary working directory
    :param num_threads: accepted but not used here
    :return: list of parsed genes (empty if gmhmm_p failed), or None if
        ``gmsn.pl`` returned non-zero
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)

    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    # Close the log before it is reopened in append mode further down.
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath,
             fasta_fpath],
            stdout=err_file,
            stderr=err_file,
            indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return

    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath,
                   out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    # Intermediate files are kept only in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    return genes
Example #24
0
def run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):  # run minimap2 for AGB
    """Align `contigs_fpath` against `ref_fpath` with minimap2 for AGB.

    The aligner's stdout is written to `out_fpath` (overwritten) and its
    stderr is appended to `log_err_fpath`.

    Returns the value returned by `qutils.call_subprocess`.
    """
    mask_level = '1' if qconfig.min_IDY < 95 else '0.9'
    cmdline = [minimap_fpath(), '-cx', 'asm20', '--mask-level', mask_level, '-N', '100',
               '--score-N', '0', '-E', '1,0', '-f', '200', '--cs', '-t', str(max_threads), ref_fpath, contigs_fpath]
    # Context managers close both handles once the aligner has finished.
    with open(out_fpath, 'w') as out_file:
        with open(log_err_fpath, 'a') as err_file:
            return qutils.call_subprocess(cmdline, stdout=out_file, stderr=err_file,
                                          indent='  ' + qutils.index_to_str(index))
Example #25
0
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath,
                    num_threads):
    """Run `gmsn.pl` followed by `gmhmmp` on `fasta_fpath` and collect genes.

    A new temporary directory is created under `tmp_dirpath` for the tool
    output. `err_fpath` is truncated and then collects stdout and stderr of
    both steps. `num_threads` is part of the signature but is not used here.

    Returns a list of parsed genes (empty if the `gmhmmp` step fails), or
    None if `gmsn.pl` returns a non-zero code (the temporary directory is
    not removed in that case).
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)

    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    # Open the log in a `with` block so the handle is always closed.
    with open(err_fpath, 'w') as err_file:
        fasta_name = qutils.name_from_fpath(fasta_fpath)
        return_code = qutils.call_subprocess(
            [
                'perl', tool_exec_fpath, '--name', fasta_name, '--clean',
                '--out', tmp_dirpath, fasta_fpath
            ],
            stdout=err_file,
            stderr=err_file,
            indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return

    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    # Append, so the output of the first step stays in the log.
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath,
                     err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    return genes
Example #26
0
def compile_glimmer(logger, only_clean=False):
    """Build the bundled GlimmerHMM executable if it is not present yet.

    With `only_clean=True` the existing executable (if any) is deleted and
    True is returned. Otherwise `make -C <glimmer>/src` is run when the
    executable is missing, with output going to `make.log` / `make.err` in
    the source directory.

    Returns the path to the executable, or None when the build fails.
    """
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')

    if only_clean:
        if os.path.isfile(tool_exec_fpath):
            os.remove(tool_exec_fpath)
        return True

    if not os.path.isfile(tool_exec_fpath):
        logger.main_info("Compiling GlimmerHMM...")
        # Close both build logs as soon as make has finished.
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as log_file:
            with open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as err_file:
                return_code = qutils.call_subprocess(
                    ['make', '-C', tool_src_dirpath],
                    stdout=log_file,
                    stderr=err_file,
                    indent='    ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error(
                "Failed to compile GlimmerHMM (" + tool_src_dirpath +
                ")!\nTry to compile it manually or do not use --gene-finding "
                "option with --eukaryote.\nUse --debug option to see the command lines."
            )
            return None
    return tool_exec_fpath
Example #27
0
def compile_gage(only_clean=False):
    """Compile the GAGE Java classes listed in `required_java_fnames`.

    With `only_clean=True` the existing `.class` files are removed and True
    is returned. Otherwise every `<name>.java` in `gage_dirpath` is compiled
    with `javac`; compiler output is collected in `make.log` / `make.err`
    inside `gage_dirpath`.

    Returns None both when compilation fails (after logging an error) and
    when it succeeds.
    NOTE(review): success is not distinguishable from failure by the return
    value - confirm what callers expect before changing this.
    """
    if only_clean:
        for required_name in required_java_fnames:
            fpath = os.path.join(gage_dirpath, required_name + '.class')
            if os.path.isfile(fpath):
                os.remove(fpath)
        return True

    javac_path = get_path_to_program('javac')
    if javac_path is None:
        logger.error('Java compiler not found (javac)! '
                     'Please install it or compile GAGE java classes manually (' + gage_dirpath + '/*.java)!')
        return

    cur_dir = os.getcwd()
    os.chdir(gage_dirpath)
    try:
        # making
        logger.main_info('Compiling JAVA classes (details are in ' + os.path.join(gage_dirpath, 'make.log') + ' and make.err)')
        # The logs are opened once and shared by all javac runs. Re-opening them
        # with 'w' for every source file kept only the last compiler's output.
        with open(os.path.join(gage_dirpath, 'make.log'), 'w') as log_file:
            with open(os.path.join(gage_dirpath, 'make.err'), 'w') as err_file:
                return_codes = [qutils.call_subprocess(
                    ['javac', os.path.join(gage_dirpath, java_fname + '.java')],
                    stdout=log_file,
                    stderr=err_file) for java_fname in required_java_fnames]
    finally:
        # Restore the working directory even if a compiler call raises.
        os.chdir(cur_dir)

    if any(return_code != 0 for return_code in return_codes) or not all_required_java_classes_exist(gage_dirpath):
        logger.error('Error occurred during compilation of java classes (' + gage_dirpath + '/*.java)! '
                     'Try to compile it manually. ' + ('You can restart Quast with the --debug flag '
                     'to see the command line.' if not qconfig.debug else ''))
        return
Example #28
0
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and log the outcome.

    Stdout and stderr of the script are written to
    `gage_<label>.stdout` / `gage_<label>.stderr` in `gage_results_dirpath`
    (both overwritten).

    Returns the value returned by `qutils.call_subprocess`.
    """
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_label + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # `with` closes both logs even when the subprocess call raises.
    with open(log_out_fpath, 'w') as log_out_f:
        with open(log_err_fpath, 'w') as log_err_f:
            return_code = qutils.call_subprocess(
                ['sh', gage_tool_path, abspath(ca_utils.misc.contig_aligner_dirpath), reference,
                 contigs_fpath, tmp_dir, str(qconfig.min_contig)],
                stdout=log_out_f,
                stderr=log_err_f,
                indent='  ' + qutils.index_to_str(i),
                only_if_debug=False)
            if return_code != 0:
                logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
            else:
                logger.info('  ' + qutils.index_to_str(i) + 'Done.')

    return return_code
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=False):
    """Mask repeats in the reference and return the remaining unique regions.

    Steps, all working inside `tmp_dir`:
      1. run the repeat masking binary (`binary_fpath`) on a symlink to the
         reference, unless a non-empty `<ref>.rpt` file already exists;
      2. keep repeats whose length is at least `insert_size`;
      3. extract those repeats with bedtools and align them back to the
         reference with minimap2, unless the coords file already exists;
      4. pass the results to `check_repeats_instances` and
         `remove_repeat_regions`.

    Returns `(unique_covered_regions, repeats_regions)`, or `(None, None)`
    when the masking tool fails or produces no repeats file.
    """
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info('  ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info('  ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        # One shared handle for stdout and stderr: two separate handles opened
        # with 'w' on the same path both write from offset 0 and clobber each other.
        with open(log_fpath, 'w') as log_file:
            return_code = qutils.call_subprocess([binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                                                 stdout=log_file, stderr=log_file, indent='    ')
    if return_code != 0 or not exists(repeats_fpath):
        return None, None

    long_repeats_fpath = os.path.join(tmp_dir, ref_name + '.long.rpt')
    with open(long_repeats_fpath, 'w') as out:
        with open(repeats_fpath) as in_f:
            for line in in_f:
                fields = line.split('\t')
                repeat_len = int(fields[2]) - int(fields[1])
                if repeat_len >= insert_size:
                    # The first character of each line is dropped
                    # (presumably a leading marker in the tool's output - TODO confirm).
                    out.write(line[1:])

    repeats_fasta_fpath = os.path.join(tmp_dir, ref_name + '.fasta')
    coords_fpath = os.path.join(tmp_dir, ref_name + '.rpt.coords.txt')
    if not is_non_empty_file(coords_fpath):
        fasta_index_fpath = ref_fpath + '.fai'
        if exists(fasta_index_fpath):
            os.remove(fasta_index_fpath)
        # NOTE(review): 'w' truncates the log written by the masking step above;
        # kept as in the original - confirm whether 'a' was intended.
        with open(log_fpath, 'w') as err_file:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath, '-bed',
                                    long_repeats_fpath, '-fo', repeats_fasta_fpath],
                                   stderr=err_file, indent='    ')
        cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1', '--no-long-join', '-r', '100',
                   '-t', str(qconfig.max_threads), '-z', '200', ref_fpath, repeats_fasta_fpath]
        with open(coords_fpath, 'w') as coords_file:
            with open(log_fpath, 'a') as err_file:
                qutils.call_subprocess(cmdline, stdout=coords_file, stderr=err_file)
    filtered_repeats_fpath, repeats_regions = check_repeats_instances(coords_fpath, long_repeats_fpath, use_long_reads)
    unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
    return unique_covered_regions, repeats_regions
Example #30
0
def sort_bam(bam_fpath,
             sorted_bam_fpath,
             err_path,
             logger,
             threads=None,
             sort_rule=None):
    """Sort `bam_fpath` into `sorted_bam_fpath` with `sambamba sort`.

    `threads` falls back to `qconfig.max_threads` when not given. The memory
    limit passed to sambamba is the value of `get_free_memory()` clamped to
    the range 2..100, formatted as `<n>GB`. `sort_rule`, when given, is
    appended to the command line as one extra argument. Tool stderr is
    appended to `err_path`.
    """
    if not threads:
        threads = qconfig.max_threads
    mem = '%dGB' % min(100, max(2, get_free_memory()))
    cmd = [
        sambamba_fpath('sambamba'), 'sort', '-t',
        str(threads), '--tmpdir',
        dirname(sorted_bam_fpath), '-m', mem, '-o', sorted_bam_fpath, bam_fpath
    ]
    if sort_rule:
        cmd += [sort_rule]
    # Close the error log handle when the tool exits.
    with open(err_path, 'a') as err_file:
        qutils.call_subprocess(cmd, stderr=err_file, logger=logger)
Example #31
0
def calculate_genome_cov(in_fpath,
                         out_fpath,
                         chr_len_fpath,
                         err_fpath,
                         logger,
                         print_all_positions=True):
    """Run `bedtools genomecov` on `in_fpath` and store its stdout in `out_fpath`.

    The input is passed with `-ibam` when its name ends with `.bam` and with
    `-i` otherwise; `chr_len_fpath` is passed as the `-g` argument. `-bga` is
    added when `print_all_positions` is true. Tool stderr is appended to
    `err_fpath`.
    """
    cmd = [
        bedtools_fpath('bedtools'), 'genomecov',
        '-ibam' if in_fpath.endswith('.bam') else '-i', in_fpath, '-g',
        chr_len_fpath
    ]
    if print_all_positions:
        cmd += ['-bga']
    # Context managers close both handles after the tool has finished.
    with open(out_fpath, 'w') as out_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(cmd,
                                   stdout=out_file,
                                   stderr=err_file,
                                   logger=logger)
Example #32
0
 def run(contig_path, tmp_path):
     """Run `tool_exec` on `contig_path`, writing its result to `tmp_path`.

     Stdout and stderr of the tool are appended to `err_path`.
     Returns the value returned by `qutils.call_subprocess`.
     """
     with open(err_path, 'a') as log_handle:
         cmdline = [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path]
         log_indent = '  ' + qutils.index_to_str(index) + '  '
         return qutils.call_subprocess(cmdline,
                                       stdout=log_handle,
                                       stderr=log_handle,
                                       indent=log_indent)
 def run(contig_path, tmp_path):
     """Run `tool_exec_fpath` on `contig_path`, writing its result to `tmp_path`.

     Stdout and stderr of the tool are appended to `err_path`.
     Returns the value returned by `qutils.call_subprocess`.
     """
     with open(err_path, 'a') as log_handle:
         cmdline = [tool_exec_fpath, contig_path, '-d', trained_dir, '-g', '-o', tmp_path]
         log_indent = '  ' + qutils.index_to_str(index) + '  '
         return qutils.call_subprocess(cmdline,
                                       stdout=log_handle,
                                       stderr=log_handle,
                                       indent=log_indent)
Example #34
0
def parallel_blast(contigs_fpath, label, blast_res_fpath, err_fpath,
                   blast_check_fpath, blast_threads):
    """Run `blastn` for one assembly against `db_fpath` and record a check file.

    BLAST output (`-outfmt 7`) is written to `<blast_res_fpath>_<label>` and
    tool stderr is appended to `err_fpath`. Afterwards
    `<blast_check_fpath>_<label>` is written with the assembly path and its
    size in bytes.
    """
    # The argument list is built directly instead of formatting a string and
    # re-splitting it, so paths containing spaces stay single arguments.
    cmd = [
        get_blast_fpath('blastn'), '-query', str(contigs_fpath), '-db',
        str(db_fpath), '-outfmt', '7', '-num_threads', str(blast_threads)
    ]
    res_fpath = blast_res_fpath + '_' + label
    check_fpath = blast_check_fpath + '_' + label
    logger.info('  ' + 'processing ' + label)
    with open(res_fpath, 'w') as res_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(cmd,
                                   stdout=res_file,
                                   stderr=err_file,
                                   logger=logger)
    logger.info('  ' + 'BLAST results for %s are saved to %s...' %
                (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s size: %d\n' %
                         (contigs_fpath, os.path.getsize(contigs_fpath)))
Example #35
0
def sambamba_view(in_fpath,
                  out_fpath,
                  max_threads,
                  err_fpath,
                  logger,
                  filter_rule=None):
    """Run `sambamba view -h` on `in_fpath`, writing its stdout to `out_fpath`.

    `-S` is added when the input name ends with `.sam`, `-f bam` when the
    output name ends with `.bam`, and `-F <filter_rule>` when a filter is
    given. Tool stderr is appended to `err_fpath`.
    """
    cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(max_threads), '-h']
    if in_fpath.endswith('.sam'):
        cmd += ['-S']
    if out_fpath.endswith('.bam'):
        cmd += ['-f', 'bam']
    if filter_rule:
        cmd += ['-F', filter_rule]
    cmd.append(in_fpath)
    # Context managers close both handles after the tool has finished.
    with open(out_fpath, 'w') as out_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(cmd,
                                   stdout=out_file,
                                   stderr=err_file,
                                   logger=logger)
Example #36
0
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path, cov_fpath, physical_cov_fpath, correct_chr_names):
    """Produce the read coverage and physical coverage files if they are missing.

    Read coverage: unless `cov_fpath` is already non-empty, the BAM file is
    sorted with sambamba (if `bam_sorted_fpath` is missing), `bedtools
    genomecov -bga` writes `<cov_fpath>_raw` (if missing), and
    `proceed_cov_file` turns the raw file into `cov_fpath`.
    Physical coverage: unless `physical_cov_fpath` is non-empty, it is built
    from the result of `get_physical_coverage` in the same way.

    Returns `(cov_fpath, physical_cov_fpath)`.
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info('  Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                with open(log_path, 'a') as log_file:
                    with open(err_path, 'a') as err_file:
                        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_sorted_fpath,
                                                bam_fpath], stdout=log_file, stderr=err_file, logger=logger)
            with open(raw_cov_fpath, 'w') as raw_cov_file:
                with open(err_path, 'a') as err_file:
                    qutils.call_subprocess([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam', bam_sorted_fpath, '-g', chr_len_fpath],
                                           stdout=raw_cov_file, stderr=err_file, logger=logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    if not is_non_empty_file(physical_cov_fpath):
        raw_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path,
                                              physical_cov_fpath, chr_len_fpath)
        proceed_cov_file(raw_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath
Example #37
0
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    """Merge several SAM files into one duplicate-free BAM and SAM pair.

    Every non-empty file in `tmp_sam_fpaths` is converted to BAM and sorted
    (skipped when the sorted BAM already exists). The sorted files are merged
    with `sambamba merge`, duplicates are removed with `sambamba markdup -r`
    into `bam_fpath`, and `bam_fpath` is finally converted to `sam_fpath`.
    Tool stderr is appended to `err_fpath`.

    Returns the path of the intermediate merged BAM file.
    """
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                sambamba_view(tmp_sam_fpath, tmp_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    # One append-mode handle serves both consecutive sambamba calls and is
    # closed before sambamba_view opens the log on its own.
    with open(err_fpath, 'a') as err_file:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), merged_bam_fpath] + tmp_bam_fpaths,
                               stderr=err_file, logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads), '--tmpdir',
                                output_dir, merged_bam_fpath, bam_fpath],
                               stderr=err_file, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath
Example #38
0
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, max_threads):
    """Run nucmer for `contigs_fpath` against `ref_fpath`.

    `qconfig.min_cluster` is used for both `-c` and `-l`; `prefix` is passed
    as `-p`. Stdout and stderr are appended to `log_out_fpath` and
    `log_err_fpath`. A copy of the current environment is passed to the
    subprocess.

    Returns the value returned by `qutils.call_subprocess`.
    """
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster),
                      '-l', str(qconfig.min_cluster), '--maxmatch',
                      '-p', prefix, '-t', str(max_threads)]
    env = os.environ.copy()
    nucmer_cmdline += [ref_fpath, contigs_fpath]
    # Context managers close both log handles once nucmer has finished.
    with open(log_out_fpath, 'a') as log_out_file:
        with open(log_err_fpath, 'a') as log_err_file:
            return_code = qutils.call_subprocess(nucmer_cmdline, stdout=log_out_file, stderr=log_err_file,
                                                 indent='  ' + qutils.index_to_str(index), env=env)

    return return_code
Example #39
0
def compile_gnuplot(logger, only_clean=False):
    """Build the bundled gnuplot if its executable is not present yet.

    With `only_clean=True` the existing executable (if any) is deleted and
    True is returned. Otherwise, unless a previous failed build is recorded,
    `./configure` and then `make` are run inside the gnuplot directory. Each
    step overwrites `make.log` / `make.err`, so after a successful configure
    the logs hold the output of `make` only.

    Returns the path to the executable, or None when the build is skipped
    or fails (a failure flag is written in the latter case).
    """
    tool_dirpath = join(qconfig.LIBS_LOCATION, 'gnuplot')
    tool_exec_fpath = gnuplot_exec_fpath()

    if only_clean:
        if isfile(tool_exec_fpath):
            os.remove(tool_exec_fpath)
        return True

    if not isfile(tool_exec_fpath):
        failed_compilation_flag = join(tool_dirpath, 'make.failed')
        if check_prev_compilation_failed('gnuplot',
                                         failed_compilation_flag,
                                         just_notice=True,
                                         logger=logger):
            return None
        logger.main_info("Compiling gnuplot...")
        log_fpath = join(tool_dirpath, 'make.log')
        err_fpath = join(tool_dirpath, 'make.err')
        prev_dir = os.getcwd()
        os.chdir(tool_dirpath)
        try:
            with open(log_fpath, 'w') as log_file:
                with open(err_fpath, 'w') as err_file:
                    return_code = qutils.call_subprocess(
                        [
                            './configure', '--with-qt=no',
                            '--disable-wxwidgets', '--prefix=' + tool_dirpath
                        ],
                        stdout=log_file,
                        stderr=err_file,
                        indent='    ')
            if return_code == 0:
                with open(log_fpath, 'w') as log_file:
                    with open(err_fpath, 'w') as err_file:
                        return_code = qutils.call_subprocess(
                            ['make'],
                            stdout=log_file,
                            stderr=err_file,
                            indent='    ')
        finally:
            # Restore the working directory even if a build step raises.
            os.chdir(prev_dir)
        if return_code != 0 or not isfile(tool_exec_fpath):
            write_failed_compilation_flag('gnuplot',
                                          tool_dirpath,
                                          failed_compilation_flag,
                                          just_notice=True,
                                          logger=logger)
            return None
    return tool_exec_fpath
Example #40
0
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every entry of `read_fpaths` to `ref_fpath` and convert to BAM.

    A string entry is aligned with minimap2 when `reads_type` is 'pacbio' or
    'nanopore', otherwise with `bwa mem` (with `-p` for 'pe'). A non-string
    entry is unpacked as a pair of files and aligned with `bwa mem`.
    Existing non-empty SAM/BAM files are reused. For 'pe' reads duplicates
    are removed with `sambamba markdup -r` and the insert size is computed.

    Side effects: each SAM path is appended to `out_sam_fpaths`. If any insert
    size below `qconfig.optimal_assembly_max_IS` was found, the largest one is
    stored in `qconfig.optimal_assembly_insert_size` and written to
    `<output_dir>/../<ref_name>.is.txt`.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                if reads_type == 'pacbio':
                    preset = ' -ax map-pb '
                else:
                    preset = ' -ax map-ont '
                cmdline = minimap_fpath() + ' -t ' + str(max_threads) + preset + ref_fpath + ' ' + reads
            else:
                cmdline = bwa_cmd + (' -p ' if reads_type == 'pe' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            with open(output_fpath, 'w') as sam_file:
                with open(err_fpath, 'a') as err_file:
                    qutils.call_subprocess(shlex.split(cmdline), stdout=sam_file, stderr=err_file, logger=logger)
        # The original repeated this same check in a nested `if`; one test is enough.
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                with open(err_fpath, 'a') as err_file:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads), '--tmpdir',
                                            output_dir, bam_fpath, bam_dedup_fpath],
                                           stderr=err_file, logger=logger)
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
Example #41
0
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger, is_reference=False):
    """Map sequence names from a SAM header to the names in `fasta_fpath`.

    The header is extracted with `sambamba view -H -S` into
    `<dirname(output_dirpath)>/<basename(sam_fpath)>.header` when `sam_fpath`
    exists; otherwise a previously written header file is used. The `@SQ`
    records are then compared pairwise, in iteration order, with the FASTA
    sequence lengths.

    Returns a dict `{sam_name: fasta_name}`, or None when neither the SAM
    file nor the header file exists, or when the number of sequences, their
    lengths or their names do not match. A mismatch is logged as a warning
    if `reads_fpaths` is non-empty and as an error otherwise.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        with open(sam_header_fpath, 'w') as header_file:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                       stdout=header_file, stderr=err_file, logger=logger)
    # Raw strings: '\S' and '\d' are invalid escape sequences in plain literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            if correct_name(sam_chr) == fasta_chr[:len(sam_chr)] and sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]:
                correct_chr_names[sam_chr] = fasta_chr
            elif sam_chr_lengths[sam_chr] != fasta_chr_lengths[fasta_chr]:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                           'QUAST will try to realign reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                         'Use SAM file obtained by aligning reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
Example #42
0
def run_bwa(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every entry of `read_fpaths` to `ref_fpath` with `bwa mem`.

    A string entry is aligned as one file (with `-p` unless `reads_type` is
    'single'); any other entry is unpacked as a pair of files. An existing
    non-empty SAM file is reused. For freshly aligned 'paired_end' reads the
    insert size is computed.

    Side effects: each SAM path is appended to `out_sam_fpaths`, and the
    largest insert size below `qconfig.ideal_assembly_max_IS` (if any) is
    stored in `qconfig.ideal_assembly_insert_size`.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            cmd = bwa_cmd + (' -p ' if reads_type != 'single' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmd = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        if not is_non_empty_file(output_fpath):
            # Context managers close both handles once bwa has finished.
            with open(output_fpath, 'w') as sam_file:
                with open(err_fpath, 'a') as err_file:
                    qutils.call_subprocess(shlex.split(cmd), stdout=sam_file, stderr=err_file, logger=logger)
            if reads_type == 'paired_end':
                insert_size = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
                if insert_size < qconfig.ideal_assembly_max_IS:
                    insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.ideal_assembly_insert_size = max(insert_sizes)
Example #43
0
def calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath):
    """Compute the LAP score for `fpath` unless a result file already exists.

    Does nothing when `reads_fpaths` or `sam_fpath` is empty. Otherwise
    `calc_prob.py` writes `<filename>.prob` and `sum_prob.py` turns it into
    `<filename>.lap.out`; tool stderr is appended to `err_fpath`.
    `index is None` only changes the log messages (they then mention the
    reference instead of using `index_str`).
    """
    if not reads_fpaths or not sam_fpath:
        return

    lap_out_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.lap.out')
    if not is_non_empty_file(lap_out_fpath):
        if index is not None:
            logger.info('  ' + index_str + 'Running LAP...')
        else:
            logger.info('  Running LAP for reference...')
        prob_out_fpath = get_safe_fpath(output_dirpath, filename + '.prob')
        # Context managers close every handle as soon as its tool has finished.
        with open(prob_out_fpath, 'w') as prob_file:
            with open(err_fpath, 'a') as err_file:
                qutils.call_subprocess([lap_fpath('calc_prob.py'), '-a', fpath, '-i', ','.join(reads_fpaths), '-q', '-s', sam_fpath],
                                       stdout=prob_file, stderr=err_file)
        with open(lap_out_fpath, 'w') as lap_file:
            with open(err_fpath, 'a') as err_file:
                qutils.call_subprocess([lap_fpath('sum_prob.py'), '-i', prob_out_fpath],
                                       stdout=lap_file, stderr=err_file)
    else:
        if index is not None:
            logger.info('  ' + index_str + 'Using existing file with LAP score...')
        else:
            logger.info('  Using existing file with LAP score for reference...')
Example #44
0
def gmhmm_p(tool_exec, fasta_fpath, heu_fpath, out_fpath, err_file, index):
    """Run GeneMark.hmm on `fasta_fpath` with the heuristic model `heu_fpath`.

    Reference command lines:
        prompt> gmhmmp -m heu_11_45.mod sequence
        prompt> gm -m heu_11_45.mat sequence

    Stdout and stderr of the tool both go to the open handle `err_file`.
    Returns True only if the tool returns 0 and `out_fpath` is a file.
    """
    cmdline = [tool_exec, '-d', '-a', '-p', '0', '-m', heu_fpath, '-o', out_fpath, fasta_fpath]
    log_indent = '    ' + qutils.index_to_str(index)
    exit_status = qutils.call_subprocess(cmdline,
                                         stdout=err_file,
                                         stderr=err_file,
                                         indent=log_indent)
    if exit_status != 0:
        return False
    return os.path.isfile(out_fpath)
Example #45
0
def parallel_blast(contigs_fpath, label, corrected_dirpath, err_fpath, blast_res_fpath, blast_check_fpath, blast_threads):
    """Run `blastn` for one assembly against `db_fpath` and record a check file.

    When `contigs_fpath` has a known compression extension it is first
    unpacked to `<corrected_dirpath>/<basename>.unpacked`, and that file is
    used as the query. BLAST output (`-outfmt 7`) goes to the path given by
    `get_blast_output_fpath(blast_res_fpath, label)`; tool stderr is appended
    to `err_fpath`. Afterwards a check file is written with the path of the
    original assembly and its size in bytes.
    """
    logger.info('  ' + 'processing ' + label)
    blast_query_fpath = contigs_fpath
    compress_ext = ['.gz', '.gzip', '.bz2', '.bzip2', '.zip']
    if any(contigs_fpath.endswith(ext) for ext in compress_ext):
        logger.info('  ' + 'unpacking ' + label)
        unpacked_fpath = os.path.join(corrected_dirpath, os.path.basename(contigs_fpath) + '.unpacked')
        with _get_fasta_file_handler(contigs_fpath) as f_in:
            with open(unpacked_fpath, 'w') as f_out:
                for l in f_in:
                    f_out.write(l)
        blast_query_fpath = unpacked_fpath
    res_fpath = get_blast_output_fpath(blast_res_fpath, label)
    check_fpath = get_blast_output_fpath(blast_check_fpath, label)
    # The argument list is built directly instead of formatting a string and
    # re-splitting it, so paths containing spaces stay single arguments.
    cmd = [get_blast_fpath('blastn'), '-query', str(blast_query_fpath), '-db', str(db_fpath),
           '-outfmt', '7', '-num_threads', str(blast_threads)]
    with open(res_fpath, 'w') as res_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(cmd, stdout=res_file, stderr=err_file, logger=logger)
    logger.info('  ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s size: %d\n' % (contigs_fpath, os.path.getsize(contigs_fpath)))
Example #46
0
def gmhmm_p(tool_exec, fasta_fpath, heu_fpath, out_fpath, err_file, index):
    """Run GeneMark.hmm on `fasta_fpath` with the heuristic model `heu_fpath`.

    Reference command lines:
        prompt> gmhmmp -m heu_11_45.mod sequence
        prompt> gm -m heu_11_45.mat sequence

    Stdout and stderr of the tool both go to the open handle `err_file`.
    Returns True only if the tool returns 0 and `out_fpath` is a file.
    """
    cmdline = [tool_exec, '-d', '-a', '-p', '0', '-m', heu_fpath, '-o', out_fpath, fasta_fpath]
    log_indent = '    ' + qutils.index_to_str(index)
    exit_status = qutils.call_subprocess(cmdline,
                                         stdout=err_file,
                                         stderr=err_file,
                                         indent=log_indent)
    if exit_status != 0:
        return False
    return os.path.isfile(out_fpath)
Example #47
0
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, emem_threads=1):
    """Run nucmer for `contigs_fpath` against `ref_fpath`.

    `qconfig.min_cluster` is used for both `-c` and `-l`; `prefix` is passed
    as `-p`. `-t <emem_threads>` is added only when `is_emem_aligner()` is
    true. Stdout and stderr are appended to `log_out_fpath` and
    `log_err_fpath`.

    Returns the value returned by `qutils.call_subprocess`.
    """
    # additional GAGE params of Nucmer: '-l', '30', '-banded'
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster),
                      '-l', str(qconfig.min_cluster), '--maxmatch',
                      '-p', prefix]
    if is_emem_aligner():
        nucmer_cmdline += ['-t', str(emem_threads)]
    nucmer_cmdline += [ref_fpath, contigs_fpath]
    # Context managers close both log handles once nucmer has finished.
    with open(log_out_fpath, 'a') as log_out_file:
        with open(log_err_fpath, 'a') as log_err_file:
            return_code = qutils.call_subprocess(nucmer_cmdline, stdout=log_out_file, stderr=log_err_file,
                                                 indent='  ' + qutils.index_to_str(index))

    return return_code
Example #48
0
def bam_to_bed(output_dirpath, name, bam_fpath, err_path, logger, bedpe=False):
    """Convert `bam_fpath` to a coordinate-sorted BED file in `output_dirpath`.

    With `bedpe=True`, `bamToBed -bedpe` writes `<name>.bedpe` and
    `<name>.bed` is derived from it by keeping columns 1, 2 and 6 of every
    line. Otherwise `bamToBed` writes `<name>.bed` directly. The BED file is
    then sorted with `sort -k1,1 -k2,2n` into `<name>.sorted.bed`. Existing
    non-empty files are reused; tool stderr is appended to `err_path`.

    Returns the path of the sorted BED file.
    """
    raw_bed_fpath = join(output_dirpath, name + '.bed')
    if bedpe:
        bedpe_fpath = join(output_dirpath, name + '.bedpe')
        bedpe_regenerated = False
        # The original tested the same path twice in this condition.
        if not is_non_empty_file(bedpe_fpath):
            with open(bedpe_fpath, 'w') as bedpe_out:
                with open(err_path, 'a') as err_file:
                    qutils.call_subprocess(
                        [bedtools_fpath('bamToBed'), '-i', bam_fpath, '-bedpe'],
                        stdout=bedpe_out,
                        stderr=err_file,
                        logger=logger)
            bedpe_regenerated = True
        # Rebuild the plain BED file after a fresh conversion, and also when it
        # is missing although the BEDPE file exists (sort would otherwise be
        # given a non-existent input).
        if bedpe_regenerated or not is_non_empty_file(raw_bed_fpath):
            # Named `bedpe_in` so the handle does not shadow the `bedpe` flag.
            with open(bedpe_fpath, 'r') as bedpe_in:
                with open(raw_bed_fpath, 'w') as bed_file:
                    for line in bedpe_in:
                        fs = line.split()
                        start, end = fs[1], fs[5]
                        bed_file.write('\t'.join([fs[0], start, end + '\n']))
    else:
        if not is_non_empty_file(raw_bed_fpath):
            with open(raw_bed_fpath, 'w') as bed_file:
                with open(err_path, 'a') as err_file:
                    qutils.call_subprocess(
                        [bedtools_fpath('bamToBed'), '-i', bam_fpath],
                        stdout=bed_file,
                        stderr=err_file,
                        logger=logger)

    sorted_bed_fpath = join(output_dirpath, name + '.sorted.bed')
    if not is_non_empty_file(sorted_bed_fpath):
        with open(sorted_bed_fpath, 'w') as sorted_file:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', raw_bed_fpath],
                                       stdout=sorted_file,
                                       stderr=err_file,
                                       logger=logger)
    return sorted_bed_fpath
Example #49
0
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath, err_path, reads_fpaths):
    """Map sequence names from the SAM header to the names in the reference.

    The header is extracted with `sambamba view -H -S` into
    `<output_dirpath>/<basename(sam_fpath)>.header` (`err_path` is
    overwritten with the tool's stderr). The `@SQ` records are then compared
    pairwise, in iteration order, with the reference sequence lengths.

    Returns a dict `{sam_name: ref_name}`, or None when the number of
    sequences, their lengths or their names do not match. A mismatch is
    logged as a warning if `reads_fpaths` is non-empty and as an error
    otherwise.
    """
    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = os.path.join(output_dirpath, os.path.basename(sam_fpath) + '.header')
    with open(sam_header_fpath, 'w') as header_file:
        with open(err_path, 'w') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                   stdout=header_file, stderr=err_file, logger=logger)
    # Raw strings: '\S' and '\d' are invalid escape sequences in plain literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(), sam_chr_lengths.keys()):
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]:
                correct_chr_names[sam_chr] = ref_chr
            elif sam_chr_lengths[sam_chr] != ref_chr_lengths[ref_chr]:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in reference and SAM file do not match. ' +
                           'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(inconsistency + ' in reference and SAM file do not match. ' +
                         'Use SAM file obtained by aligning reads to the reference genome.')
        return None
    return correct_chr_names
Example #50
0
def run_nucmer(prefix, ref_fpath, contigs_fpath, log_out_fpath, log_err_fpath, index, emem_threads=1):
    """Run nucmer for `contigs_fpath` against `ref_fpath`.

    `qconfig.min_cluster` is used for both `-c` and `-l`; `prefix` is passed
    as `-p`. When `is_emem_aligner()` is true, `-t <emem_threads>` is added,
    plus `--emem <path>` if `get_installed_emem()` returns a path. Stdout and
    stderr are appended to `log_out_fpath` and `log_err_fpath`.

    Returns the value returned by `qutils.call_subprocess`.
    """
    # additional GAGE params of Nucmer: '-l', '30', '-banded'
    nucmer_cmdline = [bin_fpath('nucmer'), '-c', str(qconfig.min_cluster),
                      '-l', str(qconfig.min_cluster), '--maxmatch',
                      '-p', prefix]
    if is_emem_aligner():
        nucmer_cmdline += ['-t', str(emem_threads)]
        installed_emem_fpath = get_installed_emem()
        if installed_emem_fpath:
            nucmer_cmdline += ['--emem', installed_emem_fpath]

    nucmer_cmdline += [ref_fpath, contigs_fpath]
    # Context managers close both log handles once nucmer has finished.
    with open(log_out_fpath, 'a') as log_out_file:
        with open(log_err_fpath, 'a') as log_err_file:
            return_code = qutils.call_subprocess(nucmer_cmdline, stdout=log_out_file, stderr=log_err_file,
                                                 indent='  ' + qutils.index_to_str(index))

    return return_code
Example #51
0
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run ``gmes_petap.pl --ES`` (GeneMark-ES) on ``fasta_fpath`` and parse its GTF output.

    The tool writes into ``tmp_dirpath + qutils.name_from_fpath(fasta_fpath)``
    (created if missing); both its stdout and stderr go to ``err_fpath``,
    which is truncated first.

    Returns None when the subprocess return code is non-zero; otherwise a
    list extended with ``parse_gtf_out(...)`` for every file name under the
    output directory that ends with ``'gtf'``.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    # Plain string concatenation (no path separator is inserted here).
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    # The shared stdout/stderr handle is now closed on every exit path,
    # including the early return below.
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads), '--sequence', fasta_fpath,
             '--out', tmp_dirpath],
            stdout=err_file,
            stderr=err_file,
            indent='    ' + qutils.index_to_str(index))
    if return_code != 0:
        return
    genes = []
    # NOTE(review): os.walk collects names from nested directories too, but each
    # name is joined with the top-level tmp_dirpath below — confirm that GTF files
    # are only ever expected at the top level.
    fnames = [fname for (path, dirs, files) in os.walk(tmp_dirpath) for fname in files]
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
Example #52
0
def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath, features_containers, cov_fpath, output_dir, logger):
    """Create a Circos configuration in ``output_dir`` and try to render the plot.

    ``output_dir`` is created if it does not exist. The configuration and the
    legend file come from ``create_conf``. If no ``circos`` executable is found
    a warning with manual instructions is logged.

    Returns ``(circos_png_fpath, circos_legend_fpath)`` when Circos exits with
    code 0 and the PNG is a non-empty file; ``(None, None)`` otherwise.
    """
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers, cov_fpath, logger)
    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        logger.warning('Circos is not installed!\n'
                       'If you want to create Circos plots, install Circos as described at http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
                       'and run the following command:\n circos -conf ' + conf_fpath + '.\n '
                       'The plot annotation is saved to ' + circos_legend_fpath)
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    # Both log files are truncated on open and closed as soon as Circos finishes.
    with open(log_fpath, 'w') as log_file:
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess(cmdline, stdout=log_file, stderr=err_file)
    if return_code == 0 and is_non_empty_file(circos_png_fpath):
        return circos_png_fpath, circos_legend_fpath
    else:
        logger.warning('  Circos diagram was not created. See ' + log_fpath + ' and ' + err_fpath + ' for details')
        return None, None
Example #53
0
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Align contigs to the reference with minimap and return the subprocess result.

    In AGB mode (``qconfig.is_agb_mode``) the call is delegated to
    ``run_minimap_agb`` with the same arguments. Otherwise the ``-x`` preset is
    chosen from ``qconfig.min_IDY`` (``asm20`` below 90, ``asm10`` below 95,
    ``asm5`` otherwise) and the extra tuning options are added only when
    ``qconfig.large_genome`` is false.

    stdout overwrites ``out_fpath``; stderr is appended to ``log_err_fpath``.
    ``index`` is only used to build the indent for ``qutils.call_subprocess``.
    """
    if qconfig.is_agb_mode:
        return run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads)

    if qconfig.min_IDY < 90:
        preset = 'asm20'
    elif qconfig.min_IDY < 95:
        preset = 'asm10'
    else:
        preset = 'asm5'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = ['-B5', '-O4,16', '--no-long-join', '-r', str(qconfig.MAX_INDEL_LENGTH),
                          '-N', num_alignments, '-s', str(qconfig.min_alignment), '-z', '200']
    cmdline = [minimap_fpath(), '-c', '-x', preset] + (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500', '--score-N', '2', '--cs', '-t', str(max_threads), ref_fpath, contigs_fpath]
    # Close the output and error handles deterministically instead of leaking them.
    with open(out_fpath, 'w') as out_file:
        with open(log_err_fpath, 'a') as log_err_file:
            return_code = qutils.call_subprocess(cmdline, stdout=out_file, stderr=log_err_file,
                                                 indent='  ' + qutils.index_to_str(index))

    return return_code
Example #54
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Run GRIDSS for one reference and convert the resulting VCF to a BED file.

    Steps, each skipped when its output already exists:
      1. Return immediately if ``bed_fpath`` is already a non-empty file.
      2. If the sorted BAM is missing, convert SAM to BAM (keeping only mapped
         reads via the 'not unmapped' filter) and sort it.
      3. Index the sorted BAM with sambamba if no non-empty ``.bai`` exists.
      4. Create the FASTA index and, if the VCF is missing, recreate the
         ``<ref_name>_gridss`` directory, build the BWA index and run
         ``gridss.CallVariants`` with ``bwa_dirpath`` appended to ``PATH``.
      5. If the VCF is non-empty, run ``VcfBreakendToBedpe`` and pass its raw
         output through ``reformat_bedpe`` to produce ``bed_fpath``.

    When ``bam_fpath`` is not given, SAM/BAM paths are derived from the
    reference name inside ``output_dirpath``; ``bed_fpath`` defaults to
    ``<ref_name>.bed`` there.

    Returns ``bed_fpath`` in every case, whether or not the file was produced.
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        # The error log is opened per call and closed right after it.
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                                   stderr=err_file, logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start from a clean working directory for this run.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true', '-Dsamjdk.use_async_io_read_samtools=true',
                                    '-Dsamjdk.use_async_io_write_samtools=true', '-Dsamjdk.use_async_io_write_tribble=true',
                                    '-cp', get_gridss_fpath(), 'gridss.CallVariants', 'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                    'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'), 'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                                    'WORKER_THREADS=' + str(max_threads), 'WORKING_DIR=' + vcf_output_dirpath],
                                   stderr=err_file, logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                    'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath, 'R=' + cur_ref_fpath,
                                    'INCLUDE_HEADER=TRUE'], stderr=err_file, logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
Example #55
0
def compile_glimmer(logger, only_clean=False):
    """Build (or remove) the bundled GlimmerHMM executable.

    With ``only_clean=True`` the existing ``glimmerhmm`` executable is deleted
    if present and True is returned. Otherwise ``make -C <glimmer>/src`` is run
    when the executable is missing, with output written to ``make.log`` and
    ``make.err`` inside the source directory.

    Returns the path to the executable on success, or None (after logging an
    error) when ``make`` fails or the executable still does not exist.
    """
    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')

    if only_clean:
        if os.path.isfile(tool_exec_fpath):
            os.remove(tool_exec_fpath)
        return True

    if not os.path.isfile(tool_exec_fpath):
        logger.main_info("Compiling GlimmerHMM...")
        # Build logs are closed as soon as make returns rather than left open.
        with open(os.path.join(tool_src_dirpath, 'make.log'), 'w') as make_log:
            with open(os.path.join(tool_src_dirpath, 'make.err'), 'w') as make_err:
                return_code = qutils.call_subprocess(
                    ['make', '-C', tool_src_dirpath],
                    stdout=make_log,
                    stderr=make_err,
                    indent='    ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error("Failed to compile GlimmerHMM (" + tool_src_dirpath +
                         ")!\nTry to compile it manually or do not use --gene-finding "
                         "option with --eukaryote.\nUse --debug option to see the command lines.")
            return None
    return tool_exec_fpath
Example #56
0
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Align an assembly to the reference with BWA and compute its coverage.

    Intermediate files live in ``output_dir`` and are named after
    ``basename(assembly_fpath)``:
      * ``.sam`` / ``.bam``  - produced by ``bwa mem`` and ``sambamba view``
        when the BAM is missing or empty;
      * ``_mapped`` / ``_sorted`` BAMs (via ``add_suffix``) - produced by
        filtering with 'not unmapped' and sorting when the sorted BAM is
        missing or empty.

    Afterwards ``get_coverage`` is invoked with ``create_cov_files=False`` and
    the uncovered-regions path.

    Returns the uncovered-regions file path (``add_suffix(cov_fpath,
    'uncovered')``); this function itself does not check that the file exists.
    """
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        # Every output/error handle below is scoped to its subprocess call so
        # nothing stays open after the call returns.
        with open(sam_fpath, 'w') as sam_file:
            with open(err_fpath, 'a') as err_file:
                qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads), ref_fpath, assembly_fpath],
                                       stdout=sam_file, stderr=err_file, logger=logger)
        with open(bam_fpath, 'w') as bam_file:
            with open(err_fpath, 'a') as err_file:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                        '-S', sam_fpath], stdout=bam_file, stderr=err_file, logger=logger)
    if not is_non_empty_file(bam_sorted_fpath):
        with open(bam_mapped_fpath, 'w') as bam_mapped_file:
            with open(err_fpath, 'a') as err_file:
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                        '-F', 'not unmapped', bam_fpath],
                                       stdout=bam_mapped_file, stderr=err_file, logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath, assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
Example #57
0
def bam_to_bed(output_dirpath, name, bam_fpath, err_path, logger, bedpe=False, only_intervals=False):
    """Convert a BAM file to a coordinate-sorted BED file and return its path.

    With ``bedpe=False`` the output of ``bamToBed -i`` is written directly to
    ``<name>.bed``. With ``bedpe=True`` ``bamToBed -bedpe`` is written to
    ``<name>.bedpe`` first and every line is reduced to three tab-separated
    columns: column 0 plus either columns 2 and 4 (``only_intervals=True``) or
    columns 1 and 5 (0-based, whitespace split).

    The raw BED is then sorted with ``sort -k1,1 -k2,2n`` into
    ``<name>.sorted.bed``, whose path is returned. stderr of every subprocess
    is appended to ``err_path``.
    """
    raw_bed_fpath = join(output_dirpath, name + '.bed')
    if bedpe:
        bedpe_fpath = join(output_dirpath, name + '.bedpe')
        with open(bedpe_fpath, 'w') as bedpe_out:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath, '-bedpe'],
                                       stdout=bedpe_out, stderr=err_file, logger=logger)
        # A distinct name for the input handle keeps the boolean `bedpe`
        # parameter from being shadowed by a file object.
        with open(bedpe_fpath, 'r') as bedpe_in:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe_in:
                    fs = line.split()
                    if only_intervals:
                        start, end = fs[2], fs[4]
                    else:
                        start, end = fs[1], fs[5]
                    bed_file.write('\t'.join([fs[0], start, end + '\n']))
    else:
        with open(raw_bed_fpath, 'w') as bed_file:
            with open(err_path, 'a') as err_file:
                qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath],
                                       stdout=bed_file, stderr=err_file, logger=logger)

    sorted_bed_fpath = join(output_dirpath, name + '.sorted.bed')
    with open(sorted_bed_fpath, 'w') as sorted_bed_file:
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', raw_bed_fpath],
                                   stdout=sorted_bed_file, stderr=err_file, logger=logger)
    return sorted_bed_fpath
Example #58
0
def _run_krona_import_text(krona_dirpath, txt_fpath, html_fpath, log_fpath, err_fpath):
    """Run Krona's ImportText.pl on txt_fpath to produce html_fpath; logs are appended and closed."""
    with open(log_fpath, 'a') as log_file:
        with open(err_fpath, 'a') as err_file:
            return qutils.call_subprocess(
                ['perl', '-I', krona_dirpath + '/lib', krona_dirpath + '/scripts/ImportText.pl', txt_fpath, '-o', html_fpath],
                stdout=log_file, stderr=err_file)


def create_krona_charts(taxons_for_krona, meta_log, results_dirpath, json_texts):
    """Generate Krona taxonomy charts from JSON reports.

    ``json_texts[0]`` supplies the list of assemblies (``'assembliesNames'``).
    Each following JSON text is a per-reference report: if it has a
    ``'referenceName'`` and a ``TOTAL_ALIGNED_LEN`` metric, one line per
    assembly (``<length>[\\t<taxonomy>]``) is appended to that assembly's
    ``*_taxonomy.txt`` file and the summed length goes to the ``overall`` file.
    ``None`` lengths count as 0. ``taxons_for_krona`` maps reference names to
    the taxonomy string; references missing from it are written without one.

    ImportText.pl is then run per assembly and, when more than one chart was
    produced, once more for the summary chart. The text inputs are removed
    unless ``qconfig.debug`` is set. Finally the relative chart paths are
    handed to ``save_krona_paths``.

    Returns None; returns early (after a warning) if neither ``json`` nor
    ``simplejson`` can be imported.
    """
    meta_log.info('  Drawing interactive Krona plots...')
    krona_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'kronatools')
    krona_res_dirpath = os.path.join(results_dirpath, qconfig.krona_dirname)
    try:
        import json
    except ImportError:
        try:
            import simplejson as json
        except ImportError:
            meta_log.warning('Can\'t draw Krona charts - please install python-simplejson')
            return
    if not os.path.isdir(krona_res_dirpath):
        os.mkdir(krona_res_dirpath)
    json_data = json.loads(json_texts[0])
    assemblies = json_data['assembliesNames']
    krona_txt_ext = '_taxonomy.txt'
    krona_common_fpath = os.path.join(krona_res_dirpath, 'overall' + krona_txt_ext)

    # Start every per-assembly file empty: the loop below only appends.
    for name in assemblies:
        with open(os.path.join(krona_res_dirpath, name + krona_txt_ext), 'w'):
            pass

    # The summary file is closed even if a report turns out to be malformed.
    with open(krona_common_fpath, 'w') as krona_common_file:
        for json_text in json_texts[1:]:
            json_data = json.loads(json_text)
            ref_name = json_data['referenceName']
            if not ref_name:
                continue
            # Take the values of the first TOTAL_ALIGNED_LEN metric found in the report.
            lengths = []
            report = json_data['report']
            for section in report:
                if lengths:
                    break
                for metric in section[1]:
                    if metric['metricName'] == reporting.Fields.TOTAL_ALIGNED_LEN:
                        lengths = metric['values']
                        break
            if not lengths:
                continue
            if None in lengths:
                lengths = [l if l is not None else 0 for l in lengths]
            if ref_name in taxons_for_krona:
                taxon_suffix = '\t' + taxons_for_krona[ref_name]
            else:
                taxon_suffix = ''
            cur_assemblies = json_data['assembliesNames']
            for index, name in enumerate(cur_assemblies):
                krona_fpath = os.path.join(krona_res_dirpath, name + krona_txt_ext)
                with open(krona_fpath, 'a') as f_krona:
                    f_krona.write(str(lengths[index]) + taxon_suffix + '\n')
            krona_common_file.write(str(sum(lengths)) + taxon_suffix + '\n')

    krona_fpaths = []
    krona_log_fpath = os.path.join(krona_res_dirpath, 'krona.log')
    krona_err_fpath = os.path.join(krona_res_dirpath, 'krona.err')
    # Truncate both logs once; every Krona run appends to them afterwards.
    for log_fpath in (krona_log_fpath, krona_err_fpath):
        with open(log_fpath, 'w'):
            pass
    for name in assemblies:
        krona_fpath = os.path.join(krona_res_dirpath, name + '_taxonomy_chart.html')
        krona_txt_fpath = os.path.join(krona_res_dirpath, name + krona_txt_ext)
        return_code = _run_krona_import_text(krona_dirpath, krona_txt_fpath, krona_fpath,
                                             krona_log_fpath, krona_err_fpath)
        if return_code != 0:
            meta_log.warning('Error occurred while Krona was processing assembly ' + name +
                             '. See Krona error log for details: %s' % krona_err_fpath)
        else:
            krona_fpaths.append(os.path.join(qconfig.krona_dirname, name + '_taxonomy_chart.html'))
            meta_log.main_info('  Krona chart for ' + name + ' is saved to ' + krona_fpath)
        if not qconfig.debug:
            os.remove(krona_txt_fpath)
    if len(krona_fpaths) > 1:
        name = 'summary'
        krona_fpath = os.path.join(krona_res_dirpath, name + '_taxonomy_chart.html')
        return_code = _run_krona_import_text(krona_dirpath, krona_common_fpath, krona_fpath,
                                             krona_log_fpath, krona_err_fpath)
        if return_code != 0:
            meta_log.warning('Error occurred while Krona was building summary chart. '
                             'See Krona error log for details: %s' % krona_err_fpath)
        else:
            meta_log.main_info('  Summary Krona chart is saved to ' + krona_fpath)
            krona_fpaths.append(os.path.join(qconfig.krona_dirname, name + '_taxonomy_chart.html'))  # extra fpath!
    if not qconfig.debug:
        os.remove(krona_common_fpath)
    save_krona_paths(results_dirpath, krona_fpaths, assemblies)
Example #59
0
def bwa_index(ref_fpath, err_path, logger):
    """Build a BWA index for ``ref_fpath`` unless ``ref_fpath + '.bwt'`` is already non-empty.

    The index prefix is the reference path itself (``-p ref_fpath``). For
    references larger than 2 GB the ``bwtsw`` algorithm is requested
    explicitly. Both stdout and stderr of ``bwa index`` are appended to
    ``err_path``.
    """
    cmd = [bwa_fpath('bwa'), 'index', '-p', ref_fpath]
    if getsize(ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
        # Options must precede the positional FASTA operand: getopt
        # implementations that do not permute arguments stop parsing at the
        # first operand, so a trailing '-a bwtsw' would not be read as an option.
        cmd += ['-a', 'bwtsw']
    cmd.append(ref_fpath)
    if not is_non_empty_file(ref_fpath + '.bwt'):
        # One append-mode handle serves both streams and is closed afterwards.
        with open(err_path, 'a') as err_file:
            qutils.call_subprocess(cmd, stdout=err_file, stderr=err_file, logger=logger)
Example #60
0
def calculate_genome_cov(in_fpath, out_fpath, chr_len_fpath, err_fpath, logger, print_all_positions=True):
    """Run ``bedtools genomecov`` on ``in_fpath`` and write its stdout to ``out_fpath``.

    The input is passed with ``-ibam`` when its name ends with ``.bam`` and with
    ``-i`` otherwise; ``chr_len_fpath`` is given as the ``-g`` genome file.
    ``-bga`` is added when ``print_all_positions`` is true. stderr is appended
    to ``err_fpath``.
    """
    cmd = [bedtools_fpath('bedtools'), 'genomecov', '-ibam' if in_fpath.endswith('.bam') else '-i', in_fpath, '-g', chr_len_fpath]
    if print_all_positions:
        cmd += ['-bga']
    # Scope both handles to the call so they are closed once bedtools finishes.
    with open(out_fpath, 'w') as out_file:
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file, logger=logger)