Exemple #1
0
def create_housekeeping_file(chr_lengths, max_points, root_dir, output_dir, logger):
    """Generate a housekeeping.conf for Circos and return the directive that includes it.

    The housekeeping.conf found relative to the ``circos`` executable is copied
    to ``output_dir``; every line mentioning ``max_points_per_track`` or
    ``max_ideograms`` is replaced by an assignment of the required value. If no
    template can be found, a warning is logged and an include of
    ``etc/housekeeping.conf`` is returned so the user can edit that file by hand.

    :param chr_lengths: collection with one entry per chromosome (only its size is used)
    :param max_points: value written for ``max_points_per_track``
    :param root_dir: directory the returned include path is made relative to
    :param output_dir: directory that receives the generated housekeeping.conf
    :param logger: object exposing ``warning(message)``
    :return: a newline-terminated ``<<include ...>>`` line
    """
    max_ideograms = len(chr_lengths)
    # Resolve the executable once; the result drives both the template lookup
    # and the wording of the warning below.
    circos_bin_fpath = get_path_to_program('circos')
    template_fpath = None
    if circos_bin_fpath:
        circos_dirpath = dirname(realpath(circos_bin_fpath))
        # Two layouts are probed: <bin>/../libexec/etc first, then <bin>/../etc.
        template_fpath = join(circos_dirpath, '..', 'libexec', 'etc', 'housekeeping.conf')
        if not is_non_empty_file(template_fpath):
            template_fpath = join(circos_dirpath, '..', 'etc', 'housekeeping.conf')

    if not is_non_empty_file(template_fpath):
        if not circos_bin_fpath:
            msg = 'Circos is not found.'
        else:
            msg = 'File etc/housekeeping.conf is not found.'
        logger.warning(msg + ' You will have to manually edit etc/housekeeping.conf: '
                       'set max_points_per_track to ' + str(max_points) + ' and max_ideograms to ' + str(max_ideograms))
        return '<<include %s>>\n' % join('etc', 'housekeeping.conf')

    housekeeping_fpath = join(output_dir, 'housekeeping.conf')
    with open(template_fpath) as f, open(housekeeping_fpath, 'w') as out_f:
        for line in f:
            # A substring match is used, so any line that mentions the setting
            # (including a commented one) is replaced wholesale.
            if 'max_points_per_track' in line:
                out_f.write('max_points_per_track = %d\n' % max_points)
            elif 'max_ideograms' in line:
                out_f.write('max_ideograms = %d\n' % max_ideograms)
            else:
                out_f.write(line)
    return '<<include %s>>\n' % relpath(housekeeping_fpath, root_dir)
Exemple #2
0
def prepare_regular_quast_args(quast_py_args, combined_output_dirpath):
    """Adjust ``quast_py_args`` in place and sync related ``qconfig`` settings.

    Options that must not be forwarded are stripped, ``--no-check-meta`` and a
    filtered ``--contig-thresholds`` value are appended, and the BED/coverage
    files are passed on when they exist and are non-empty.

    Side effects on ``qconfig``: ``contig_thresholds`` becomes a comma-joined
    string (or ``'None'``), and ``bed``, ``cov_fpath`` and ``phys_cov_fpath``
    receive default paths under the reads-stats directory when unset.

    :param quast_py_args: list of command-line arguments, modified in place
    :param combined_output_dirpath: directory that contains the reads-stats subdirectory
    """
    for opt_with_value in ('--contig-thresholds', '--sv-bed'):
        remove_from_quast_py_args(quast_py_args, opt_with_value, arg=True)
    for flag in ('-s', '--scaffolds', '--combined-ref'):
        remove_from_quast_py_args(quast_py_args, flag)

    quast_py_args.append('--no-check-meta')
    # Only thresholds that are not below the minimum contig length are kept.
    kept_thresholds = [str(value) for value in qconfig.contig_thresholds if value >= qconfig.min_contig]
    qconfig.contig_thresholds = ','.join(kept_thresholds) or 'None'
    quast_py_args.extend(['--contig-thresholds', qconfig.contig_thresholds])

    stats_dirpath = os.path.join(combined_output_dirpath, qconfig.reads_stats_dirname)
    ref_name = qutils.name_from_fpath(qconfig.combined_ref_name)
    # (qconfig attribute, default file suffix, command-line option)
    tracked_files = (
        ('bed', '.bed', '--sv-bed'),
        ('cov_fpath', '.cov', '--cov'),
        ('phys_cov_fpath', '.physical.cov', '--phys-cov'),
    )
    # First fill in every default, then decide which files are forwarded.
    for attr, suffix, _ in tracked_files:
        default_fpath = os.path.join(stats_dirpath, ref_name + suffix)
        setattr(qconfig, attr, getattr(qconfig, attr) or default_fpath)
    for attr, _, option in tracked_files:
        fpath = getattr(qconfig, attr)
        if fpath and is_non_empty_file(fpath):
            quast_py_args.extend([option, fpath])
Exemple #3
0
def bam_to_bed(output_dirpath, name, bam_fpath, err_path, logger, bedpe=False):
    """Convert a BAM file to a sorted BED file, reusing outputs that already exist.

    Intermediate files are ``<name>.bed`` (and ``<name>.bedpe`` in paired
    mode); the result is ``<name>.sorted.bed``. Each step is skipped when its
    output file is already non-empty.

    :param output_dirpath: directory for intermediate and final files
    :param name: base name used for all produced files
    :param bam_fpath: input BAM file
    :param err_path: file that receives (appended) stderr of the external tools
    :param logger: logger forwarded to ``qutils.call_subprocess``
    :param bedpe: when true, run bamToBed with ``-bedpe`` and derive the BED
        records from columns 1, 2 and 6 of the BEDPE output
    :return: path of the sorted BED file
    """
    raw_bed_fpath = join(output_dirpath, name + '.bed')
    if bedpe:
        bedpe_fpath = join(output_dirpath, name + '.bedpe')
        # NOTE(review): the original condition tested bedpe_fpath twice; the
        # second test may have been meant for raw_bed_fpath - confirm. The
        # behavior (keyed on the .bedpe file only) is kept unchanged here.
        if not is_non_empty_file(bedpe_fpath):
            cmd = [bedtools_fpath('bamToBed'), '-i', bam_fpath, '-bedpe']
            # Context managers make sure the redirect targets are closed.
            with open(bedpe_fpath, 'w') as bedpe_out, open(err_path, 'a') as err_f:
                qutils.call_subprocess(cmd, stdout=bedpe_out, stderr=err_f, logger=logger)
            with open(bedpe_fpath, 'r') as bedpe_in, open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe_in:
                    fs = line.split()
                    start, end = fs[1], fs[5]
                    bed_file.write('\t'.join([fs[0], start, end + '\n']))
    else:
        if not is_non_empty_file(raw_bed_fpath):
            cmd = [bedtools_fpath('bamToBed'), '-i', bam_fpath]
            with open(raw_bed_fpath, 'w') as bed_out, open(err_path, 'a') as err_f:
                qutils.call_subprocess(cmd, stdout=bed_out, stderr=err_f, logger=logger)

    sorted_bed_fpath = join(output_dirpath, name + '.sorted.bed')
    if not is_non_empty_file(sorted_bed_fpath):
        with open(sorted_bed_fpath, 'w') as sorted_out, open(err_path, 'a') as err_f:
            qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', raw_bed_fpath],
                                   stdout=sorted_out, stderr=err_f, logger=logger)
    return sorted_bed_fpath
Exemple #4
0
def prepare_regular_quast_args(quast_py_args, combined_output_dirpath):
    """Adjust ``quast_py_args`` in place and sync related ``qconfig`` settings.

    Options that must not be forwarded are stripped, ``--no-check-meta`` and a
    filtered ``--contig-thresholds`` value are appended, and the BED/coverage
    files are passed on when they exist and are non-empty.

    Side effects on ``qconfig``: ``contig_thresholds`` becomes a comma-joined
    string (or ``'None'``), and ``bed``, ``cov_fpath`` and ``phys_cov_fpath``
    receive default paths under the reads-stats directory when unset.

    :param quast_py_args: list of command-line arguments, modified in place
    :param combined_output_dirpath: directory that contains the reads-stats subdirectory
    """
    for opt_with_value in ('--contig-thresholds', '--sv-bed'):
        remove_from_quast_py_args(quast_py_args, opt_with_value, arg=True)
    for flag in ('-s', '--scaffolds', '--combined-ref'):
        remove_from_quast_py_args(quast_py_args, flag)

    quast_py_args.append('--no-check-meta')
    # Only thresholds that are not below the minimum contig length are kept.
    kept_thresholds = [str(value) for value in qconfig.contig_thresholds if value >= qconfig.min_contig]
    qconfig.contig_thresholds = ','.join(kept_thresholds) or 'None'
    quast_py_args.extend(['--contig-thresholds', qconfig.contig_thresholds])

    stats_dirpath = os.path.join(combined_output_dirpath, qconfig.reads_stats_dirname)
    ref_name = qutils.name_from_fpath(qconfig.combined_ref_name)
    # (qconfig attribute, default file suffix, command-line option)
    tracked_files = (
        ('bed', '.bed', '--sv-bed'),
        ('cov_fpath', '.cov', '--cov'),
        ('phys_cov_fpath', '.physical.cov', '--phys-cov'),
    )
    # First fill in every default, then decide which files are forwarded.
    for attr, suffix, _ in tracked_files:
        default_fpath = os.path.join(stats_dirpath, ref_name + suffix)
        setattr(qconfig, attr, getattr(qconfig, attr) or default_fpath)
    for attr, _, option in tracked_files:
        fpath = getattr(qconfig, attr)
        if fpath and is_non_empty_file(fpath):
            quast_py_args.extend([option, fpath])
Exemple #5
0
def create_housekeeping_file(chr_lengths, max_points, root_dir, output_dir, logger):
    """Generate a housekeeping.conf for Circos and return the directive that includes it.

    The housekeeping.conf found relative to the ``circos`` executable is copied
    to ``output_dir``; every line mentioning ``max_points_per_track`` or
    ``max_ideograms`` is replaced by an assignment of the required value. If no
    template can be found, a warning is logged and an include of
    ``etc/housekeeping.conf`` is returned so the user can edit that file by hand.

    :param chr_lengths: collection with one entry per chromosome (only its size is used)
    :param max_points: value written for ``max_points_per_track``
    :param root_dir: directory the returned include path is made relative to
    :param output_dir: directory that receives the generated housekeeping.conf
    :param logger: object exposing ``warning(message)``
    :return: a newline-terminated ``<<include ...>>`` line
    """
    max_ideograms = len(chr_lengths)
    # Resolve the executable once; the result drives both the template lookup
    # and the wording of the warning below.
    circos_bin_fpath = get_path_to_program('circos')
    template_fpath = None
    if circos_bin_fpath:
        circos_dirpath = dirname(realpath(circos_bin_fpath))
        # Two layouts are probed: <bin>/../libexec/etc first, then <bin>/../etc.
        template_fpath = join(circos_dirpath, '..', 'libexec', 'etc', 'housekeeping.conf')
        if not is_non_empty_file(template_fpath):
            template_fpath = join(circos_dirpath, '..', 'etc', 'housekeeping.conf')

    if not is_non_empty_file(template_fpath):
        if not circos_bin_fpath:
            msg = 'Circos is not found.'
        else:
            msg = 'File etc/housekeeping.conf is not found.'
        logger.warning(msg + ' You will have to manually edit etc/housekeeping.conf: '
                       'set max_points_per_track to ' + str(max_points) + ' and max_ideograms to ' + str(max_ideograms))
        return '<<include %s>>\n' % join('etc', 'housekeeping.conf')

    housekeeping_fpath = join(output_dir, 'housekeeping.conf')
    with open(template_fpath) as f, open(housekeeping_fpath, 'w') as out_f:
        for line in f:
            # A substring match is used, so any line that mentions the setting
            # (including a commented one) is replaced wholesale.
            if 'max_points_per_track' in line:
                out_f.write('max_points_per_track = %d\n' % max_points)
            elif 'max_ideograms' in line:
                out_f.write('max_ideograms = %d\n' % max_ideograms)
            else:
                out_f.write(line)
    return '<<include %s>>\n' % relpath(housekeeping_fpath, root_dir)
Exemple #6
0
def count_kmers(tmp_dirpath, fpath, log_fpath, err_fpath, can_reuse=True):
    """Run k-mer counting for ``fpath`` via ``run_kmc`` and return the output prefix.

    The output prefix is ``<tmp_dirpath>/<basename(fpath)>.kmc``. When
    ``can_reuse`` is true and both ``.kmc_pre`` and ``.kmc_suf`` files for that
    prefix are already non-empty, nothing is run.

    :param tmp_dirpath: directory for the output and the tool's working files
    :param fpath: input file whose k-mers are counted
    :param log_fpath: log file path forwarded to ``run_kmc``
    :param err_fpath: error file path forwarded to ``run_kmc``
    :param can_reuse: allow reuse of an existing result
    :return: the output prefix (without the ``.kmc_pre``/``.kmc_suf`` suffixes)
    """
    kmc_out_fpath = join(tmp_dirpath, basename(fpath) + '.kmc')
    if can_reuse:
        result_exists = all(is_non_empty_file(kmc_out_fpath + suffix)
                            for suffix in ('.kmc_pre', '.kmc_suf'))
        if result_exists:
            return kmc_out_fpath

    # A quarter of the total memory is requested, but never less than 2.
    max_mem = max(2, get_total_memory() // 4)
    kmc_options = ['-m' + str(max_mem), '-k' + str(KMERS_LEN), '-fm', '-cx1', '-ci1']
    kmc_positional = [fpath, kmc_out_fpath, tmp_dirpath]
    run_kmc(kmc_bin_fpath, kmc_options + kmc_positional, log_fpath, err_fpath)
    return kmc_out_fpath
Exemple #7
0
def count_kmers(tmp_dirpath, fpath, log_fpath, err_fpath, can_reuse=True):
    """Run k-mer counting for ``fpath`` via ``run_kmc`` and return the output prefix.

    The output prefix is ``<tmp_dirpath>/<basename(fpath)>.kmc``. When
    ``can_reuse`` is true and both ``.kmc_pre`` and ``.kmc_suf`` files for that
    prefix are already non-empty, nothing is run.

    :param tmp_dirpath: directory for the output and the tool's working files
    :param fpath: input file whose k-mers are counted
    :param log_fpath: log file path forwarded to ``run_kmc``
    :param err_fpath: error file path forwarded to ``run_kmc``
    :param can_reuse: allow reuse of an existing result
    :return: the output prefix (without the ``.kmc_pre``/``.kmc_suf`` suffixes)
    """
    kmc_out_fpath = join(tmp_dirpath, basename(fpath) + '.kmc')
    if can_reuse:
        result_exists = all(is_non_empty_file(kmc_out_fpath + suffix)
                            for suffix in ('.kmc_pre', '.kmc_suf'))
        if result_exists:
            return kmc_out_fpath

    # The free memory reported by get_free_memory() is requested, with a floor of 2.
    max_mem = max(2, get_free_memory())
    kmc_options = ['-m' + str(max_mem), '-k' + str(KMERS_LEN), '-fm', '-cx1', '-ci1']
    kmc_positional = [fpath, kmc_out_fpath, tmp_dirpath]
    run_kmc(kmc_options + kmc_positional, log_fpath, err_fpath, use_kmc_tools=False)
    return kmc_out_fpath
Exemple #8
0
def align_reference(ref_fpath, output_dir, using_reads='all', calculate_coverage=False):
    """Align reads to the reference and optionally compute its uncovered regions.

    Also updates ``qconfig.optimal_assembly_insert_size`` when it is unset or
    ``'auto'`` and an insert size can be calculated or read from
    ``<ref_name>.is.txt``.

    :param ref_fpath: reference file
    :param output_dir: directory for logs, results and the ``temp_output`` subdirectory
    :param using_reads: read subset; any value other than ``'all'`` is appended
        as a suffix to the coverage file names
    :param calculate_coverage: when true, filter and sort the BAM and run
        ``get_coverage`` if the uncovered-regions file is still missing
    :return: ``(sam_fpath, bam_fpath, uncovered_fpath)``, or
        ``(None, None, None)`` when outputs are missing and no alignment is available
    """
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')

    # Outputs that do not exist yet; passed on to align_single_file.
    required_files = []
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and using_reads in ('all', 'pe'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    if not qconfig.optimal_assembly_insert_size or qconfig.optimal_assembly_insert_size == 'auto':
        if using_reads == 'pe' and sam_fpath:
            insert_size, _ = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info('  Failed calculating insert size.')
            else:
                qconfig.optimal_assembly_insert_size = insert_size
        elif using_reads == 'all' and is_non_empty_file(insert_size_fpath):
            # Best effort: an unreadable or malformed file is silently ignored.
            try:
                with open(insert_size_fpath) as in_f:
                    insert_size = int(in_f.readline())
            except (IOError, OSError, ValueError):
                insert_size = None
            if insert_size:
                qconfig.optimal_assembly_insert_size = insert_size

    if not required_files:
        return sam_fpath, bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info('  Failed detecting uncovered regions.')
        # Same arity as the success paths so callers can always unpack three values.
        return None, None, None

    if calculate_coverage:
        bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
        bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

        if is_non_empty_file(bam_sorted_fpath):
            logger.info('  Using existing sorted BAM-file: ' + bam_sorted_fpath)
        else:
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        if not is_non_empty_file(uncovered_fpath):
            get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                         correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return sam_fpath, bam_fpath, uncovered_fpath
Exemple #9
0
def align_reference(ref_fpath, output_dir, using_reads='all'):
    """Align reads to the reference and compute its uncovered regions.

    Side effects: stores the resulting SAM/BAM paths in
    ``qconfig.reference_sam`` / ``qconfig.reference_bam`` and, for
    ``using_reads == 'paired_end'``, may set
    ``qconfig.ideal_assembly_insert_size`` when it is unset or ``'auto'``.

    :param ref_fpath: reference file
    :param output_dir: directory for logs, results and the ``temp_output`` subdirectory
    :param using_reads: read subset; any value other than ``'all'`` is appended
        as a suffix to the coverage file names
    :return: ``(bam_fpath, uncovered_fpath)``, or ``(None, None)`` when outputs
        are missing and no alignment is available
    """
    ref_name = qutils.name_from_fpath(ref_fpath)
    base_cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    if using_reads == 'all':
        cov_fpath = base_cov_fpath
        uncovered_fpath = add_suffix(base_cov_fpath, 'uncovered')
    else:
        cov_fpath = add_suffix(base_cov_fpath, using_reads)
        uncovered_fpath = add_suffix(add_suffix(base_cov_fpath, 'uncovered'), using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')

    # Outputs that still have to be produced; passed on to align_single_file.
    required_files = []
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    needs_insert_size = using_reads in ('all', 'paired_end')
    if not is_non_empty_file(insert_size_fpath) and needs_insert_size:
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(
        ref_fpath, output_dir, temp_output_dir, log_path, err_fpath, qconfig.max_threads,
        sam_fpath=qconfig.reference_sam, bam_fpath=qconfig.reference_bam,
        required_files=required_files, is_reference=True, alignment_only=True,
        using_reads=using_reads)
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath

    current_is = qconfig.ideal_assembly_insert_size
    insert_size_is_open = not current_is or current_is == 'auto'
    if insert_size_is_open and using_reads == 'paired_end' and sam_fpath:
        insert_size = calculate_insert_size(sam_fpath, output_dir, ref_name)
        if insert_size:
            qconfig.ideal_assembly_insert_size = insert_size
        else:
            logger.info('  Failed calculating insert size.')

    # Everything already existed before the alignment step: nothing more to do.
    if not required_files:
        return bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info('  Failed detecting uncovered regions.')
        return None, None

    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if not is_non_empty_file(bam_sorted_fpath):
        sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped')
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    else:
        logger.info('  Using existing sorted BAM-file: ' + bam_sorted_fpath)

    if not is_non_empty_file(uncovered_fpath):
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                     log_path, err_fpath, correct_chr_names, cov_fpath,
                     uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return bam_fpath, uncovered_fpath
Exemple #10
0
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    """Merge per-chunk alignments into one BAM file and convert it to SAM.

    For every non-empty temporary SAM file the matching sorted BAM is used
    (created with ``sort_bam`` when missing); the sorted BAMs are merged with
    ``sambamba merge`` into ``bam_fpath``, which ``sambamba_view`` then writes
    out as ``sam_fpath``.

    :param tmp_sam_fpaths: temporary SAM files; empty or missing ones are skipped
    :param sam_fpath: destination SAM file
    :param bam_fpath: destination of the merged BAM file
    :param max_threads: thread count passed to sambamba
    :param err_fpath: file that receives (appended) stderr of the merge
    :return: ``sam_fpath``
    """
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if not is_non_empty_file(tmp_sam_fpath):
            continue
        # NOTE(review): str.replace rewrites every '.sam' occurrence in the
        # path, not only the extension - kept so the names match the files
        # produced elsewhere; confirm before tightening.
        tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
        tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
        if not is_non_empty_file(tmp_bam_sorted_fpath):
            sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
        tmp_bam_fpaths.append(tmp_bam_sorted_fpath)

    merge_cmd = [sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), bam_fpath] + tmp_bam_fpaths
    # The context manager closes the stderr target once the merge has finished.
    with open(err_fpath, 'a') as err_f:
        qutils.call_subprocess(merge_cmd, stderr=err_f, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath
Exemple #11
0
def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath,
       features_containers, cov_fpath, output_dir, logger):
    """Create the Circos configuration and, if Circos is available, render the plot.

    The configuration is always generated. When the ``circos`` executable is
    not found, a warning with manual instructions is logged instead of running it.

    :param output_dir: directory for the configuration, logs and the plot
        (created when missing)
    :param logger: object exposing ``warning(message)``
    :return: ``(circos_png_fpath, circos_legend_fpath)`` on success, otherwise
        ``(None, None)``

    The remaining parameters are forwarded unchanged to ``create_conf``.
    """
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths,
                                                  contig_report_fpath_pattern,
                                                  output_dir, gc_fpath,
                                                  features_containers,
                                                  cov_fpath, logger)
    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        logger.warning(
            'Circos is not installed!\n'
            'If you want to create Circos plots, install Circos as described at http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
            'and run the following command:\n\tcircos -conf ' + conf_fpath +
            '\n'
            'The plot legend is saved to ' + circos_legend_fpath + '\n')
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    # Both redirect targets are closed as soon as the subprocess returns.
    with open(log_fpath, 'w') as log_f, open(err_fpath, 'w') as err_f:
        return_code = qutils.call_subprocess(cmdline, stdout=log_f, stderr=err_f)

    # Success requires both a zero exit code and a non-empty image file.
    if return_code == 0 and is_non_empty_file(circos_png_fpath):
        return circos_png_fpath, circos_legend_fpath
    logger.warning('  Circos diagram was not created. See ' + log_fpath +
                   ' and ' + err_fpath + ' for details')
    return None, None
Exemple #12
0
def check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels):
    """Find assemblies whose earlier BLAST results can be reused.

    For each assembly a check file and a result file are located through
    ``get_blast_output_fpath``. Lines of the check file before the first line
    containing ``---`` describe assemblies (name in the second
    whitespace-separated field, size in the fourth); when name and size match
    ``files_sizes`` the assembly is removed from the returned to-do list.
    Later ``Downloaded:`` / ``Not_founded:`` lines are collected once a match
    has been found in that file.

    :param blast_check_fpath: path passed to ``get_blast_output_fpath`` for check files
    :param blast_res_fpath: path passed to ``get_blast_output_fpath`` for result files
    :param files_sizes: mapping from assembly name to expected size
    :param assemblies_fpaths: iterated for its length and indexed by assembly name
    :param assemblies: all assemblies; the argument itself is not modified
    :param labels: labels, parallel to ``assemblies_fpaths``
    :return: ``(assemblies still to process, set of downloaded organisms,
        set of not-found organisms)``
    """
    downloaded_organisms = []
    not_founded_organisms = []
    blast_assemblies = list(assemblies)
    for i, _ in enumerate(assemblies_fpaths):
        check_fpath = get_blast_output_fpath(blast_check_fpath, labels[i])
        res_fpath = get_blast_output_fpath(blast_res_fpath, labels[i])
        if not (os.path.exists(check_fpath) and is_non_empty_file(res_fpath)):
            continue
        existing_assembly = None
        assembly_info = True
        # The context manager closes the check file even if parsing fails.
        with open(check_fpath) as check_file:
            for line in check_file:
                if '---' in line:
                    # Separator reached: the lines from here on list organisms.
                    assembly_info = False
                if assembly_info:
                    fields = line.split()
                    assembly, size = fields[1], fields[3]
                    if assembly in files_sizes and int(size) == files_sizes[assembly]:
                        existing_assembly = assemblies_fpaths[assembly]
                        logger.main_info('  Using existing BLAST alignments for %s... ' % labels[i])
                        blast_assemblies.remove(existing_assembly)
                elif existing_assembly:
                    parts = line.split(' ')
                    if len(parts) > 1:
                        if parts[0] == 'Downloaded:':
                            downloaded_organisms += parts[1].rstrip().split(',')
                        elif parts[0] == 'Not_founded:':
                            not_founded_organisms += parts[1].rstrip().split(',')
    return blast_assemblies, set(downloaded_organisms), set(not_founded_organisms)
Exemple #13
0
def align_contigs(output_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads, log_out_fpath, log_err_fpath):
    """Align contigs to the reference with minimap, reusing a previous result when valid.

    :param output_fpath: destination of the parsed alignment
    :param out_basename: prefix of the ``.sf`` marker file
    :param ref_fpath: reference file
    :param contigs_fpath: contigs file passed to ``run_minimap``
    :param old_contigs_fpath: contigs path recorded in / compared with the marker file
    :param index: assembly index used as a prefix in log messages
    :param threads: thread count for ``run_minimap``
    :param log_out_fpath: progress log, overwritten on every call
    :param log_err_fpath: error log passed to ``run_minimap``
    :return: an ``AlignerStatus`` value (``OK``, ``ERROR``, ``FAILED`` or ``NOT_ALIGNED``)
    """
    successful_check_fpath = out_basename + '.sf'
    # The context manager closes the log on every return path.
    with open(log_out_fpath, 'w') as log_out_f:
        log_out_f.write('Aligning contigs to reference...\n')

        # Checking if there are existing previous alignments.
        # If they exist, using them to save time.
        using_existing_alignments = False
        if isfile(successful_check_fpath) and isfile(output_fpath):
            if check_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath):
                log_out_f.write('\tUsing existing alignments...\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Using existing alignments... ')
                using_existing_alignments = True

        if not using_existing_alignments:
            log_out_f.write('\tAligning contigs to the reference\n')
            logger.info('  ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

            tmp_output_fpath = output_fpath + '_tmp'
            exit_code = run_minimap(tmp_output_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, threads)
            if exit_code != 0:
                return AlignerStatus.ERROR

            # A missing output is a failure; an empty one means nothing aligned.
            if not isfile(tmp_output_fpath):
                return AlignerStatus.FAILED
            if not is_non_empty_file(tmp_output_fpath):
                return AlignerStatus.NOT_ALIGNED

            create_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath)
            log_out_f.write('Filtering alignments...\n')
            parse_minimap_output(tmp_output_fpath, output_fpath)
    return AlignerStatus.OK
Exemple #14
0
def create_genes_plot(features_containers, window_size, ref_len, output_dir):
    """Write per-window feature density tracks, one file per feature container.

    For every container with at least one region, ``<kind>.txt`` is written to
    ``output_dir`` with tab-separated lines: chromosome, window start, window
    end, number of regions overlapping the window. Regions whose chromosome
    (or, failing that, ``seqname``) is not a key of ``chr_names_dict`` are skipped.

    :param features_containers: iterable of containers, or a falsy value for none
    :param window_size: window width used for binning
    :param ref_len: length used to size each chromosome's list of windows
    :param output_dir: directory for the density files
    :return: ``(paths of non-empty density files, largest number of lines in one file)``
    """
    feature_fpaths = []
    max_points = 0
    for container in features_containers or []:
        feature_fpath = join(output_dir, container.kind + '.txt')
        if len(container.region_list) == 0:
            continue

        num_points = 0
        # Every chromosome gets ref_len // window_size + 1 zeroed windows on first use.
        density_by_chrom = defaultdict(lambda: [0] * (ref_len // window_size + 1))
        with open(feature_fpath, 'w') as out_f:
            for region in container.region_list:
                names = container.chr_names_dict
                key = region.chromosome
                if not key or key not in names:
                    key = region.seqname
                chrom = names.get(key)
                if not chrom:
                    continue
                first_bin = region.start // window_size
                last_bin = region.end // window_size + 1
                bins = density_by_chrom[chrom]
                # Clamp to the number of windows so regions past ref_len are truncated.
                for bin_idx in range(first_bin, min(last_bin, len(bins))):
                    bins[bin_idx] += 1
            for chrom, bins in density_by_chrom.items():
                for bin_idx, density in enumerate(bins):
                    window_start = bin_idx * window_size
                    window_end = (bin_idx + 1) * window_size
                    fields = [chrom, str(window_start), str(window_end), str(density)]
                    out_f.write('\t'.join(fields) + '\n')
                    num_points += 1
        if is_non_empty_file(feature_fpath):
            feature_fpaths.append(feature_fpath)
        max_points = max(max_points, num_points)
    return feature_fpaths, max_points
Exemple #15
0
def calculate_insert_size(sam_fpath, output_dir, ref_name):
    """Estimate the insert size from a SAM file and cache it in ``<ref_name>.is.txt``.

    A previously cached non-zero value is returned directly. Otherwise the
    ninth tab-separated field of records with one of the accepted flags is
    collected from the first 1,000,001 lines (headers included) of the SAM
    file; the estimate is ``int(mean + standard deviation)``, clamped to
    ``[qconfig.ideal_assembly_min_IS, qconfig.ideal_assembly_max_IS]``.

    :param sam_fpath: SAM file to scan
    :param output_dir: directory holding the cache file
    :param ref_name: base name of the cache file
    :return: the insert size, or ``None`` when no usable record was found or
        the mean is not positive
    """
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if is_non_empty_file(insert_size_fpath):
        # Best effort: an unreadable or malformed cache is ignored and recomputed.
        try:
            with open(insert_size_fpath) as in_f:
                cached_insert_size = int(in_f.read())
        except (IOError, OSError, ValueError):
            cached_insert_size = None
        if cached_insert_size:
            return cached_insert_size

    insert_sizes = []
    mapped_flags = ('99', '147', '83', '163')  # reads mapped in correct orientation and within insert size
    with open(sam_fpath) as sam_in:
        for i, l in enumerate(sam_in):
            if i > 1000000:
                break
            if l.startswith('@'):
                continue
            fs = l.split('\t')
            if fs[1] not in mapped_flags:
                continue
            insert_sizes.append(abs(int(fs[8])))

    if not insert_sizes:
        return None
    mean_is = sum(insert_sizes) * 1.0 / len(insert_sizes)
    if mean_is <= 0:
        return None
    stddev_is = sqrt(sum([(insert_size - mean_is) ** 2 for insert_size in insert_sizes]) / len(insert_sizes))
    insert_size = int(mean_is + stddev_is)
    insert_size = max(qconfig.ideal_assembly_min_IS, insert_size)
    insert_size = min(qconfig.ideal_assembly_max_IS, insert_size)
    with open(insert_size_fpath, 'w') as out_f:
        out_f.write(str(insert_size))
    return insert_size
Exemple #16
0
def align_contigs(output_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads, log_out_fpath, log_err_fpath):
    """Align contigs to the reference with minimap, reusing a previous result when valid.

    :param output_fpath: destination of the parsed alignment
    :param out_basename: prefix of the ``.sf`` marker file
    :param ref_fpath: reference file
    :param contigs_fpath: contigs file passed to ``run_minimap``
    :param old_contigs_fpath: contigs path recorded in / compared with the marker file
    :param index: assembly index used as a prefix in log messages
    :param threads: thread count for ``run_minimap``
    :param log_out_fpath: progress log, overwritten on every call
    :param log_err_fpath: error log passed to ``run_minimap``
    :return: an ``AlignerStatus`` value (``OK``, ``ERROR``, ``FAILED`` or ``NOT_ALIGNED``)
    """
    successful_check_fpath = out_basename + '.sf'
    # The context manager closes the log on every return path.
    with open(log_out_fpath, 'w') as log_out_f:
        log_out_f.write('Aligning contigs to reference...\n')

        # Checking if there are existing previous alignments.
        # If they exist, using them to save time.
        using_existing_alignments = False
        if isfile(successful_check_fpath) and isfile(output_fpath):
            if check_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath):
                log_out_f.write('\tUsing existing alignments...\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Using existing alignments... ')
                using_existing_alignments = True

        if not using_existing_alignments:
            log_out_f.write('\tAligning contigs to the reference\n')
            logger.info('  ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')

            tmp_output_fpath = output_fpath + '_tmp'
            exit_code = run_minimap(tmp_output_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, threads)
            if exit_code != 0:
                return AlignerStatus.ERROR

            # A missing output is a failure; an empty one means nothing aligned.
            if not isfile(tmp_output_fpath):
                return AlignerStatus.FAILED
            if not is_non_empty_file(tmp_output_fpath):
                return AlignerStatus.NOT_ALIGNED

            create_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath)
            log_out_f.write('Filtering alignments...\n')
            parse_minimap_output(tmp_output_fpath, output_fpath)
    return AlignerStatus.OK
Exemple #17
0
def correct_paired_reads_names(fpath, name_ending, output_dir, logger):
    """Write a copy of a FASTQ file whose read names end with ``name_ending``.

    Input with a ``.gz``/``.gzip`` extension is read through ``gzip`` and the
    output drops that extension. In each group of four lines the first keeps
    only its first whitespace-separated token plus ``name_ending``, the third
    becomes a bare ``+``, and the other two are copied unchanged. An existing
    non-empty output file is reused.

    :param fpath: input FASTQ file, optionally gzip-compressed
    :param name_ending: suffix appended to every read name
    :param output_dir: directory for the corrected file
    :param logger: object exposing ``info(message)``
    :return: path of the corrected file, or ``False`` when the input cannot be opened
    """
    name, ext = os.path.splitext(fpath)
    try:
        if ext in ['.gz', '.gzip']:
            handler = gzip.open(fpath, mode='rt')
            corrected_fpath = join(output_dir, basename(name))
        else:
            handler = open(fpath)
            corrected_fpath = join(output_dir, basename(fpath))
    except IOError:
        return False
    # Entering the context first guarantees the input handle is closed on every
    # path, including the early return for an already existing output.
    with handler as f:
        if is_non_empty_file(corrected_fpath):
            logger.info('Using existing FASTQ file ' + corrected_fpath)
            return corrected_fpath
        with open(corrected_fpath, 'w') as out_f:
            for i, line in enumerate(f):
                if i % 4 == 0:
                    full_read_name = line.split()[0] + name_ending
                    out_f.write(full_read_name + '\n')
                elif i % 2 == 0:
                    # Third line of a record: separator line.
                    out_f.write('+\n')
                else:
                    out_f.write(line)
    return corrected_fpath
Exemple #18
0
def check_blast(blast_check_fpath, blast_res_fpath, files_md5,
                assemblies_fpaths, assemblies, labels):
    """Find assemblies whose earlier BLAST results can be reused.

    For each assembly a check file and a result file are located through
    ``get_blast_output_fpath``. Lines of the check file before the first line
    containing ``---`` describe assemblies (name in the second
    whitespace-separated field, md5 in the last); when name and md5 match
    ``files_md5`` the assembly is removed from the returned to-do list. Later
    ``Downloaded:`` / ``Not_founded:`` lines are collected once a match has
    been found in that file.

    :param blast_check_fpath: path passed to ``get_blast_output_fpath`` for check files
    :param blast_res_fpath: path passed to ``get_blast_output_fpath`` for result files
    :param files_md5: mapping from assembly name to expected md5 string
    :param assemblies_fpaths: iterated for its length and indexed by assembly name
    :param assemblies: all assemblies; the argument itself is not modified
    :param labels: labels, parallel to ``assemblies_fpaths``
    :return: ``(assemblies still to process, set of downloaded organisms,
        set of not-found organisms)``
    """
    downloaded_organisms = []
    not_founded_organisms = []
    blast_assemblies = list(assemblies)
    for idx, _ in enumerate(assemblies_fpaths):
        check_fpath = get_blast_output_fpath(blast_check_fpath, labels[idx])
        res_fpath = get_blast_output_fpath(blast_res_fpath, labels[idx])
        if not os.path.exists(check_fpath) or not is_non_empty_file(res_fpath):
            continue

        existing_assembly = None
        in_assembly_section = True
        with open(check_fpath) as check_file:
            for line in check_file:
                if '---' in line:
                    # Separator reached: the lines from here on list organisms.
                    in_assembly_section = False
                if in_assembly_section:
                    tokens = line.split()
                    assembly, md5 = tokens[1], tokens[-1]
                    if assembly in files_md5.keys() and md5 == files_md5[assembly]:
                        existing_assembly = assemblies_fpaths[assembly]
                        logger.main_info('  Using existing BLAST alignments for %s... ' % labels[idx])
                        blast_assemblies.remove(existing_assembly)
                    continue
                if not existing_assembly:
                    continue
                parts = line.split(' ')
                if len(parts) <= 1:
                    continue
                tag = parts[0]
                if tag == 'Downloaded:':
                    downloaded_organisms.extend(parts[1].rstrip().split(','))
                elif tag == 'Not_founded:':
                    not_founded_organisms.extend(parts[1].rstrip().split(','))
    return blast_assemblies, set(downloaded_organisms), set(not_founded_organisms)
Exemple #19
0
def align_contigs(output_fpath, out_basename, ref_fpath, contigs_fpath,
                  old_contigs_fpath, index, threads, log_out_fpath,
                  log_err_fpath):
    """Align contigs to the reference with minimap, reusing earlier results when possible.

    Reuse is attempted in two ways before aligning: a coordinates file from
    ``qconfig.alignments_for_reuse_dirpath`` is symlinked to ``output_fpath``,
    or an existing ``output_fpath`` is kept when its ``.sf`` marker file is
    still valid. If the first kind of reuse does not apply,
    ``qconfig.alignments_for_reuse_dirpath`` is reset to ``None``.

    :param output_fpath: destination of the parsed alignment
    :param out_basename: prefix of the ``.sf`` marker file
    :param ref_fpath: reference file
    :param contigs_fpath: contigs file passed to ``run_minimap``
    :param old_contigs_fpath: contigs path recorded in / compared with the marker file
    :param index: assembly index used as a prefix in log messages
    :param threads: thread count for ``run_minimap``
    :param log_out_fpath: progress log, overwritten on every call
    :param log_err_fpath: error log passed to ``run_minimap``
    :return: an ``AlignerStatus`` value (``OK``, ``ERROR``, ``FAILED`` or ``NOT_ALIGNED``)
    """
    successful_check_fpath = out_basename + '.sf'
    # The context manager closes the log on every return path.
    with open(log_out_fpath, 'w') as log_out_f:
        log_out_f.write('Aligning contigs to reference...\n')

        # Special case: if there is a need to reuse alignments from the combined_reference stage
        if qconfig.alignments_for_reuse_dirpath is not None and os.path.isdir(
                qconfig.alignments_for_reuse_dirpath):
            _, coords_to_reuse_fname, _, _ = get_aux_out_fpaths(
                os.path.basename(out_basename))
            coords_to_reuse_fpath = os.path.join(
                qconfig.alignments_for_reuse_dirpath, coords_to_reuse_fname)
            if isfile(coords_to_reuse_fpath):
                # symlink coords.filtered from combined_reference stage to coords in the current run
                if isfile(output_fpath):
                    os.remove(output_fpath)
                os.symlink(
                    os.path.relpath(coords_to_reuse_fpath,
                                    os.path.dirname(output_fpath)), output_fpath)
                log_out_f.write(
                    '\tReusing alignments from the combined_reference stage...\n')
                logger.info(
                    '  ' + qutils.index_to_str(index) +
                    'Reusing alignments from the combined_reference stage... ')
                return AlignerStatus.OK
        qconfig.alignments_for_reuse_dirpath = None

        # Checking if there are existing previous alignments.
        # If they exist, using them to save time.
        if isfile(successful_check_fpath) and isfile(output_fpath):
            if check_successful_check(successful_check_fpath, old_contigs_fpath,
                                      ref_fpath):
                log_out_f.write('\tUsing existing alignments...\n')
                logger.info('  ' + qutils.index_to_str(index) +
                            'Using existing alignments... ')
                return AlignerStatus.OK

        log_out_f.write('\tAligning contigs to the reference\n')
        logger.info('  ' + qutils.index_to_str(index) +
                    'Aligning contigs to the reference')

        tmp_output_fpath = output_fpath + '_tmp'
        exit_code = run_minimap(tmp_output_fpath, ref_fpath, contigs_fpath,
                                log_err_fpath, index, threads)
        if exit_code != 0:
            return AlignerStatus.ERROR

        # A missing output is a failure; an empty one means nothing aligned.
        if not isfile(tmp_output_fpath):
            return AlignerStatus.FAILED
        if not is_non_empty_file(tmp_output_fpath):
            return AlignerStatus.NOT_ALIGNED

        create_successful_check(successful_check_fpath, old_contigs_fpath,
                                ref_fpath)
        log_out_f.write('Filtering alignments...\n')
        parse_minimap_output(tmp_output_fpath, output_fpath)
    return AlignerStatus.OK
Exemple #20
0
def create_genes_plot(features_containers, window_size, ref_len, output_dir):
    """Write per-window feature density tracks, one text file per container.

    For every container with a non-empty ``region_list`` a file named
    ``<kind>.txt`` is written into ``output_dir``. Each line holds
    ``chrom<TAB>window_start<TAB>window_end<TAB>count`` where ``count`` is
    the number of regions touching that window.

    Returns a tuple ``(fpaths, max_points)``: the list of non-empty files
    that were written and the largest number of lines written to one file.
    """
    plotted_fpaths = []
    largest_track = 0
    if not features_containers:
        return plotted_fpaths, largest_track

    for container in features_containers:
        track_fpath = join(output_dir, container.kind + '.txt')
        if len(container.region_list) == 0:
            continue

        names = container.chr_names_dict
        written = 0
        # one counter per window; the list is created lazily per chromosome
        density_by_chrom = defaultdict(lambda: [0] * (ref_len // window_size + 1))
        with open(track_fpath, 'w') as track:
            for region in container.region_list:
                # prefer the explicit chromosome name when it is a known one
                if region.chromosome and region.chromosome in names:
                    key = region.chromosome
                else:
                    key = region.seqname
                chrom = names[key] if key in names else None
                if not chrom:
                    continue
                bins = density_by_chrom[chrom]
                first_bin = region.start // window_size
                # clamp to the number of windows so the index stays in range
                stop_bin = min(region.end // window_size + 1, len(bins))
                for idx in range(first_bin, stop_bin):
                    bins[idx] += 1
            for chrom, bins in density_by_chrom.items():
                for idx, count in enumerate(bins):
                    fields = [chrom, str(idx * window_size), str((idx + 1) * window_size), str(count)]
                    track.write('\t'.join(fields) + '\n')
                    written += 1
        if is_non_empty_file(track_fpath):
            plotted_fpaths.append(track_fpath)
        largest_track = max(largest_track, written)
    return plotted_fpaths, largest_track
Exemple #21
0
def correct_paired_reads_names(fpath, name_ending, output_dir, logger):
    """Copy a FASTQ file into ``output_dir`` with ``name_ending`` added to read names.

    The input may be plain text or gzip-compressed (``.gz`` / ``.gzip``); the
    output is always plain text and, for compressed input, drops the
    compression extension from the file name.

    For every 4-line record the first whitespace-separated token of the header
    line is kept and ``name_ending`` is appended; the separator line is
    rewritten as a bare ``+``; sequence and quality lines are copied verbatim.

    Returns the path of the corrected file, or ``False`` when the input file
    cannot be opened. An existing non-empty corrected file is reused.
    """
    name, ext = os.path.splitext(fpath)
    try:
        if ext in ['.gz', '.gzip']:
            handler = gzip.open(fpath, mode='rt')
            corrected_fpath = join(output_dir, basename(name))
        else:
            handler = open(fpath)
            corrected_fpath = join(output_dir, basename(fpath))
    except IOError:
        return False
    # The reuse check lives inside the ``with`` so that the input handle is
    # closed on the early return as well (it used to stay open on that path).
    with handler as f:
        if is_non_empty_file(corrected_fpath):
            logger.info('Using existing FASTQ file ' + corrected_fpath)
            return corrected_fpath
        with open(corrected_fpath, 'w') as out_f:
            for i, line in enumerate(f):
                if i % 4 == 0:
                    # header line: keep only the read id and add the mate suffix
                    full_read_name = line.split()[0] + name_ending
                    out_f.write(full_read_name + '\n')
                elif i % 2 == 0:
                    # third line of the record: normalise the separator
                    out_f.write('+\n')
                else:
                    out_f.write(line)
    return corrected_fpath
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=False):
    """Mask long repeats in the reference and return the remaining unique regions.

    Steps:
      1. run the repeat masking binary (``binary_fpath``) on a fresh
         ``tmp_red`` directory holding a ``.fa`` symlink to the reference
         (skipped when ``<tmp_dir>/<ref_name>.rpt`` already exists);
      2. keep only repeats at least ``insert_size`` long;
      3. extract those repeats with ``bedtools getfasta`` and align them back
         to the reference with minimap (skipped when the coords file exists);
      4. pass the results to ``check_repeats_instances`` and
         ``remove_repeat_regions``.

    Returns ``(unique_covered_regions, repeats_regions)``, or ``(None, None)``
    when the masking tool failed or produced no repeats file.

    All tools log into ``log_fpath``. The masking tool truncates the log; the
    later tools append to it so earlier output is preserved.
    """
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info('  ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info('  ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        # One shared handle for stdout and stderr: two independent 'w' handles
        # on the same path would overwrite each other's output.
        with open(log_fpath, 'w') as log_f:
            return_code = qutils.call_subprocess(
                [binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                stdout=log_f, stderr=log_f, indent='    ')
    if return_code != 0 or not exists(repeats_fpath):
        return None, None

    long_repeats_fpath = os.path.join(tmp_dir, ref_name + '.long.rpt')
    with open(long_repeats_fpath, 'w') as out:
        with open(repeats_fpath) as in_f:
            for line in in_f:
                fields = line.split('\t')
                repeat_len = int(fields[2]) - int(fields[1])
                if repeat_len >= insert_size:
                    # drop the first character of the line
                    # NOTE(review): presumably a leading '>' marker in the
                    # masking tool's output - confirm against its format.
                    out.write(line[1:])

    repeats_fasta_fpath = os.path.join(tmp_dir, ref_name + '.fasta')
    coords_fpath = os.path.join(tmp_dir, ref_name + '.rpt.coords.txt')
    if not is_non_empty_file(coords_fpath):
        # remove the old FASTA index before bedtools reads the reference
        fasta_index_fpath = ref_fpath + '.fai'
        if exists(fasta_index_fpath):
            os.remove(fasta_index_fpath)
        # append, so that the masking tool's log written above is not wiped
        with open(log_fpath, 'a') as log_f:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath, '-bed',
                                    long_repeats_fpath, '-fo', repeats_fasta_fpath],
                                   stderr=log_f, indent='    ')
            cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1', '--no-long-join', '-r', '100',
                       '-t', str(qconfig.max_threads), '-z', '200', ref_fpath, repeats_fasta_fpath]
            with open(coords_fpath, 'w') as coords_out:
                qutils.call_subprocess(cmdline, stdout=coords_out, stderr=log_f)
    filtered_repeats_fpath, repeats_regions = check_repeats_instances(coords_fpath, long_repeats_fpath, use_long_reads)
    unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
    return unique_covered_regions, repeats_regions
Exemple #23
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variants for one reference with GRIDSS and return a BED path.

    Pipeline: SAM -> BAM of mapped reads (``not unmapped``) -> sorted BAM ->
    BAM index -> ``gridss.CallVariants`` (VCF) -> ``VcfBreakendToBedpe`` ->
    ``reformat_bedpe``. Every stage is skipped when its output already exists.

    When ``bam_fpath`` is not given, SAM/BAM paths are derived from the
    reference name inside ``output_dirpath``; ``bed_fpath`` defaults to
    ``<output_dirpath>/<ref_name>.bed``.

    The BED path is returned even when no VCF (and hence no BED) was produced.
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if bam_fpath:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    else:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    if not bed_fpath:
        bed_fpath = join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    # --- sorted and indexed BAM ---
    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    bam_index_fpath = bam_sorted_fpath + '.bai'
    if not is_non_empty_file(bam_index_fpath):
        index_cmd = [sambamba_fpath('sambamba'), 'index', bam_sorted_fpath]
        qutils.call_subprocess(index_cmd, stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)

    # --- variant calling ---
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # start from a clean working directory
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        # make the bundled bwa visible to the java process through PATH
        env = os.environ.copy()
        env["PATH"] = env["PATH"] + os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        gridss_cmd = [
            'java', '-ea', '-Xmx' + str(max_mem) + 'g',
            '-Dsamjdk.create_index=true',
            '-Dsamjdk.use_async_io_read_samtools=true',
            '-Dsamjdk.use_async_io_write_samtools=true',
            '-Dsamjdk.use_async_io_write_tribble=true',
            '-cp', get_gridss_fpath(), 'gridss.CallVariants',
            'I=' + bam_sorted_fpath,
            'O=' + vcf_fpath,
            'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
            'REFERENCE_SEQUENCE=' + cur_ref_fpath,
            'WORKER_THREADS=' + str(max_threads),
            'WORKING_DIR=' + vcf_output_dirpath,
        ]
        qutils.call_subprocess(gridss_cmd, stderr=open(err_fpath, 'a'), logger=logger, env=env)

    # --- VCF -> BEDPE -> BED ---
    if not is_non_empty_file(vcf_fpath):
        return bed_fpath
    raw_bed_fpath = add_suffix(bed_fpath, 'raw')
    filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
    bedpe_cmd = [
        'java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
        'I=' + vcf_fpath,
        'O=' + raw_bed_fpath,
        'OF=' + filtered_bed_fpath,
        'R=' + cur_ref_fpath,
        'INCLUDE_HEADER=TRUE',
    ]
    qutils.call_subprocess(bedpe_cmd, stderr=open(err_fpath, 'a'), logger=logger)
    reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
Exemple #24
0
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Run GRIDSS on the properly paired, mapped reads of one reference.

    Stages (each skipped when its output file already exists):
      * filter the SAM to a BAM with ``not unmapped and proper_pair``, sort it;
      * index the sorted BAM and create the reference ``.fai``;
      * run ``gridss.CallVariants`` in ``<output_dirpath>/<ref_name>_gridss``;
      * convert the VCF with ``VcfBreakendToBedpe`` and ``reformat_bedpe``.

    Paths not supplied by the caller are derived from the reference name
    inside ``output_dirpath``. Returns ``bed_fpath`` whether or not the BED
    file could actually be produced.
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if bam_fpath:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    else:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    if not bed_fpath:
        bed_fpath = join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        sambamba_index = [sambamba_fpath('sambamba'), 'index', bam_sorted_fpath]
        qutils.call_subprocess(sambamba_index, stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)

    gridss_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(gridss_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # GRIDSS gets a fresh working directory on every (re)run
        if isdir(gridss_dirpath):
            shutil.rmtree(gridss_dirpath, ignore_errors=True)
        os.makedirs(gridss_dirpath)
        max_mem = get_gridss_memory()
        # extend PATH with the directory of the bundled bwa
        env = os.environ.copy()
        env["PATH"] = env["PATH"] + os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        jvm_flags = [
            '-ea', '-Xmx' + str(max_mem) + 'g',
            '-Dsamjdk.create_index=true',
            '-Dsamjdk.use_async_io_read_samtools=true',
            '-Dsamjdk.use_async_io_write_samtools=true',
            '-Dsamjdk.use_async_io_write_tribble=true',
        ]
        call_variants = ['java'] + jvm_flags + [
            '-cp', get_gridss_fpath(), 'gridss.CallVariants',
            'I=' + bam_sorted_fpath,
            'O=' + vcf_fpath,
            'ASSEMBLY=' + join(gridss_dirpath, ref_name + '.gridss.bam'),
            'REFERENCE_SEQUENCE=' + cur_ref_fpath,
            'WORKER_THREADS=' + str(max_threads),
            'WORKING_DIR=' + gridss_dirpath,
        ]
        qutils.call_subprocess(call_variants, stderr=open(err_fpath, 'a'), logger=logger, env=env)

    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        to_bedpe = [
            'java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
            'I=' + vcf_fpath,
            'O=' + raw_bed_fpath,
            'OF=' + filtered_bed_fpath,
            'R=' + cur_ref_fpath,
            'INCLUDE_HEADER=TRUE',
        ]
        qutils.call_subprocess(to_bedpe, stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
Exemple #25
0
def bwa_index(ref_fpath, err_path, logger):
    """Build a BWA index for ``ref_fpath`` unless ``<ref_fpath>.bwt`` already exists.

    The index prefix is the reference path itself. For references larger than
    2 GB the ``bwtsw`` algorithm is requested explicitly. Both stdout and
    stderr of ``bwa index`` are appended to ``err_path``.
    """
    if is_non_empty_file(ref_fpath + '.bwt'):
        return

    cmd = [bwa_fpath('bwa'), 'index', '-p', ref_fpath]
    if getsize(ref_fpath) > 2 * 1024**3:  # if reference size bigger than 2GB
        cmd += ['-a', 'bwtsw']
    # Options must precede the input file: option parsers that do not permute
    # arguments stop at the first positional and would ignore a trailing '-a'.
    cmd.append(ref_fpath)
    # a single append-mode handle shared by both streams, closed on exit
    with open(err_path, 'a') as err_f:
        qutils.call_subprocess(cmd, stdout=err_f, stderr=err_f, logger=logger)
Exemple #26
0
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    """Merge several SAM files into one duplicate-free BAM and a final SAM.

    Every non-empty SAM in ``tmp_sam_fpaths`` is converted to a sorted BAM
    (reused when already present). The sorted BAMs are merged with
    ``sambamba merge`` into ``<bam_fpath>`` with a ``merged`` suffix, then
    ``sambamba markdup -r`` writes ``bam_fpath``, which is finally passed to
    ``sambamba_view`` to produce ``sam_fpath``.

    Returns the path of the merged (pre-markdup) BAM.
    """
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')
    sorted_parts = []
    for part_sam_fpath in tmp_sam_fpaths:
        if not is_non_empty_file(part_sam_fpath):
            continue
        part_bam_fpath = part_sam_fpath.replace('.sam', '.bam')
        part_sorted_fpath = add_suffix(part_bam_fpath, 'sorted')
        if not is_non_empty_file(part_sorted_fpath):
            sambamba_view(part_sam_fpath, part_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            sort_bam(part_bam_fpath, part_sorted_fpath, err_fpath, logger)
        sorted_parts.append(part_sorted_fpath)

    merge_cmd = [sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), merged_bam_fpath]
    merge_cmd = merge_cmd + sorted_parts
    qutils.call_subprocess(merge_cmd, stderr=open(err_fpath, 'a'), logger=logger)

    markdup_cmd = [
        sambamba_fpath('sambamba'), 'markdup', '-r',
        '-t', str(max_threads),
        '--tmpdir', output_dir,
        merged_bam_fpath, bam_fpath,
    ]
    qutils.call_subprocess(markdup_cmd, stderr=open(err_fpath, 'a'), logger=logger)

    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath
Exemple #27
0
def run(contigs_fpath, gff_fpath, log_fpath, threads, kingdom):
    """Run barrnap on ``contigs_fpath`` and store its stdout in ``gff_fpath``.

    Nothing is done when ``gff_fpath`` already exists and is non-empty.
    barrnap's stderr is appended to ``log_fpath``.
    """
    barrnap_fpath = join(qconfig.LIBS_LOCATION, 'barrnap', 'bin', 'barrnap')
    if not is_non_empty_file(gff_fpath):
        barrnap_cmd = [barrnap_fpath, '--quiet',
                       '-k', kingdom,
                       '--threads', str(threads),
                       contigs_fpath]
        call_subprocess(barrnap_cmd,
                        stdout=open(gff_fpath, 'w'),
                        stderr=open(log_fpath, 'a'))
Exemple #28
0
def get_joiners(ref_name, sam_fpath, bam_fpath, output_dirpath, err_fpath,
                using_reads):
    """Collect read-derived intervals per sequence from a BAM file.

    The BAM is filtered (mapped, primary, non-supplementary alignments),
    sorted with ``sort_rule='-n'`` and converted to BED (BEDPE for mate-pairs,
    i.e. ``using_reads == 'mp'``). Intermediate files are reused when present.

    For mate-pairs, intervals are kept only when their length lies within one
    standard deviation of the estimated insert size. When the insert size
    cannot be estimated, no length filtering is applied.

    Returns a ``defaultdict(list)`` mapping the first BED column to a list of
    ``(start, end)`` integer tuples.
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    if not is_non_empty_file(bam_filtered_fpath):
        filter_rule = 'not unmapped and not supplementary and not secondary_alignment'
        sambamba_view(bam_fpath,
                      bam_filtered_fpath,
                      qconfig.max_threads,
                      err_fpath,
                      logger,
                      filter_rule=filter_rule)
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_sorted_fpath):
        sort_bam(bam_filtered_fpath,
                 bam_sorted_fpath,
                 err_fpath,
                 logger,
                 sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath,
                           using_reads,
                           bam_sorted_fpath,
                           err_fpath,
                           logger,
                           bedpe=using_reads == 'mp')

    # Length bounds stay None unless a usable insert size is available.
    # calculate_insert_size() may return (None, None); doing arithmetic on that
    # result unconditionally used to raise TypeError.
    min_is = max_is = None
    if using_reads == 'mp':
        insert_size, std_dev = calculate_insert_size(sam_fpath,
                                                     output_dirpath,
                                                     ref_name,
                                                     reads_suffix='mp')
        if insert_size and std_dev is not None:
            min_is = insert_size - std_dev
            max_is = insert_size + std_dev

    intervals = defaultdict(list)
    with open(bed_fpath) as bed:
        for line in bed:
            fs = line.split()
            if len(fs) < 3:
                # blank or truncated line: nothing to record
                continue
            start, end = int(fs[1]), int(fs[2])
            if min_is is not None and not (min_is <= abs(end - start) <= max_is):
                continue
            intervals[fs[0]].append((start, end))
    return intervals
Exemple #29
0
def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every read library to the reference and record the resulting SAMs.

    ``read_fpaths`` items are either a single path (string) or a pair of
    paths. Single-path PacBio/Nanopore libraries are aligned with minimap
    (``map-pb`` / ``map-ont`` presets); everything else goes through
    ``bwa mem`` (with ``-p`` for single-file paired-end input).

    For each library the SAM path (``sam_fpath`` with a
    ``<reads_type><index>`` suffix) is appended to ``out_sam_fpaths``; a BAM
    is created next to it, and for paired-end reads duplicates are removed
    with ``sambamba markdup -r``. Existing non-empty outputs are reused.

    For paired-end libraries the largest estimated insert size below
    ``qconfig.optimal_assembly_max_IS`` is stored in
    ``qconfig.optimal_assembly_insert_size`` and written to
    ``<output_dir>/../<ref_name>.is.txt``.
    """
    # Commands are built as argument lists so that paths containing spaces or
    # shell metacharacters are passed through unchanged.
    bwa_cmd = [bwa_fpath('bwa'), 'mem', '-t', str(max_threads)]
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type in ('pacbio', 'nanopore'):
                preset = 'map-pb' if reads_type == 'pacbio' else 'map-ont'
                cmdline = [minimap_fpath(), '-t', str(max_threads), '-ax', preset, ref_fpath, reads]
            else:
                interleaved = ['-p'] if reads_type == 'pe' else []
                cmdline = bwa_cmd + interleaved + [ref_fpath, reads]
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + [ref_fpath, read1, read2]

        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            with open(output_fpath, 'w') as sam_out, open(err_fpath, 'a') as err_f:
                qutils.call_subprocess(cmdline, stdout=sam_out, stderr=err_f, logger=logger)
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                with open(err_fpath, 'a') as err_f:
                    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads), '--tmpdir',
                                            output_dir, bam_fpath, bam_dedup_fpath],
                                           stderr=err_f, logger=logger)
                # replace the BAM with its deduplicated version when available
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            # insert_size is None when it could not be estimated; comparing
            # None with a number raises TypeError on Python 3
            if insert_size and insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))
Exemple #30
0
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath, correct_chr_names,
                 cov_fpath, physical_cov_fpath=None, uncovered_fpath=None, create_cov_files=True):
    """Compute read coverage (and optionally physical coverage) files.

    Read coverage: unless ``cov_fpath`` already exists, the raw coverage file
    (``cov_fpath + '_raw'``) is produced from the sorted BAM (sorting it first
    if needed); uncovered regions are written to ``uncovered_fpath`` when it
    is given, and the raw file is post-processed into ``cov_fpath`` when
    ``create_cov_files`` is true.

    Physical coverage: computed only when ``create_cov_files`` is true, a
    ``physical_cov_fpath`` was supplied and that file does not exist yet.

    Returns ``(cov_fpath, physical_cov_fpath)`` exactly as passed in.
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info('  Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                # NOTE(review): every other sort_bam call in this file passes
                # (bam, sorted_bam, err_fpath, logger); the extra log_path
                # argument that used to be passed here shifted err_fpath and
                # logger by one position. Confirm against sort_bam's signature.
                sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger)
            calculate_genome_cov(bam_sorted_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        if uncovered_fpath:
            print_uncovered_regions(raw_cov_fpath, uncovered_fpath, correct_chr_names)
        if create_cov_files:
            proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    # Physical coverage is optional: skip it when no output path was requested.
    if create_cov_files and physical_cov_fpath and not is_non_empty_file(physical_cov_fpath):
        raw_physical_cov_fpath = get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath,
                                                       physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage() returns None when it cannot run; there is
        # then no raw file to post-process.
        if raw_physical_cov_fpath:
            proceed_cov_file(raw_physical_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath
Exemple #31
0
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath, correct_chr_names,
                 cov_fpath, physical_cov_fpath=None, uncovered_fpath=None, create_cov_files=True):
    """Compute read coverage (and optionally physical coverage) files.

    Read coverage: unless ``cov_fpath`` already exists, the raw coverage file
    (``cov_fpath + '_raw'``) is produced from the sorted BAM (sorting it first
    if needed); uncovered regions are written to ``uncovered_fpath`` when it
    is given, and the raw file is post-processed into ``cov_fpath`` when
    ``create_cov_files`` is true.

    Physical coverage: computed only when ``create_cov_files`` is true, a
    ``physical_cov_fpath`` was supplied and that file does not exist yet.

    Returns ``(cov_fpath, physical_cov_fpath)`` exactly as passed in.
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info('  Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                # NOTE(review): every other sort_bam call in this file passes
                # (bam, sorted_bam, err_fpath, logger); the extra log_path
                # argument that used to be passed here shifted err_fpath and
                # logger by one position. Confirm against sort_bam's signature.
                sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger)
            calculate_genome_cov(bam_sorted_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        if uncovered_fpath:
            print_uncovered_regions(raw_cov_fpath, uncovered_fpath, correct_chr_names)
        if create_cov_files:
            proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    # Physical coverage is optional: skip it when no output path was requested.
    if create_cov_files and physical_cov_fpath and not is_non_empty_file(physical_cov_fpath):
        raw_physical_cov_fpath = get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath,
                                                       physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage() returns None when it cannot run; there is
        # then no raw file to post-process.
        if raw_physical_cov_fpath:
            proceed_cov_file(raw_physical_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath
Exemple #32
0
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Align an assembly to the reference and report the uncovered regions file.

    The assembly is aligned with ``bwa mem``, converted to BAM, reduced to
    mapped records and sorted; each product is reused when it already exists.
    ``get_coverage`` is then called with ``create_cov_files=False`` and an
    ``uncovered_fpath`` derived from ``<assembly name>.cov``.

    Returns the path of the uncovered-regions file.
    """
    assembly_name = basename(assembly_fpath)
    sam_fpath = join(output_dir, assembly_name + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')

    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        align_cmd = [bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads), ref_fpath, assembly_fpath]
        qutils.call_subprocess(align_cmd, stdout=open(sam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        # SAM -> BAM
        to_bam_cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                      '-h', '-f', 'bam', '-S', sam_fpath]
        qutils.call_subprocess(to_bam_cmd, stdout=open(bam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)

    if not is_non_empty_file(bam_sorted_fpath):
        # keep mapped records only, then sort
        mapped_cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads),
                      '-h', '-f', 'bam', '-F', 'not unmapped', bam_fpath]
        qutils.call_subprocess(mapped_cmd, stdout=open(bam_mapped_fpath, 'w'), stderr=open(err_fpath, 'a'),
                               logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)

    cov_fpath = join(output_dir, assembly_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath, assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
Exemple #33
0
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    """Produce the raw physical coverage file and return its path.

    Returns ``None`` (after logging a message) when the ``bamToBed`` tool is
    not available. An existing non-empty raw file (``cov_fpath`` with a
    ``raw`` suffix) is returned without recomputation.
    """
    if not isfile(bedtools_fpath('bamToBed')):
        logger.info('  Failed calculating physical coverage...')
        return None

    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    if is_non_empty_file(raw_cov_fpath):
        return raw_cov_fpath

    logger.info('  Calculating physical coverage...')
    ## keep properly mapped, unique, and non-duplicate read pairs only
    filtered_fpath = join(output_dirpath, ref_name + '.filtered.bam')
    sambamba_view(bam_fpath, filtered_fpath, qconfig.max_threads, err_fpath, logger,
                  filter_rule='proper_pair and not supplementary and not duplicate')
    ## sort by read names
    name_sorted_fpath = join(output_dirpath, ref_name + '.filtered.sorted.bam')
    sort_bam(filtered_fpath, name_sorted_fpath, err_fpath, logger, sort_rule='-n')
    ## paired BED -> genome coverage
    bed_fpath = bam_to_bed(output_dirpath, ref_name, name_sorted_fpath, err_fpath, logger, bedpe=True)
    calculate_genome_cov(bed_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
    return raw_cov_fpath
Exemple #34
0
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    """Return the path of the raw physical coverage file, creating it if needed.

    When ``bamToBed`` cannot be found a message is logged and ``None`` is
    returned. Otherwise the BAM is filtered, sorted with ``sort_rule='-n'``,
    converted to BEDPE and fed to ``calculate_genome_cov``, unless the raw
    file (``cov_fpath`` with a ``raw`` suffix) is already present.
    """
    bam_to_bed_tool = bedtools_fpath('bamToBed')
    if not isfile(bam_to_bed_tool):
        logger.info('  Failed calculating physical coverage...')
        return None

    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    need_to_compute = not is_non_empty_file(raw_cov_fpath)
    if need_to_compute:
        logger.info('  Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        pairs_rule = 'proper_pair and not supplementary and not duplicate'
        bam_pairs_fpath = join(output_dirpath, ref_name + '.filtered.bam')
        sambamba_view(bam_fpath, bam_pairs_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule=pairs_rule)
        ## sort by read names
        bam_pairs_sorted_fpath = join(output_dirpath, ref_name + '.filtered.sorted.bam')
        sort_bam(bam_pairs_fpath, bam_pairs_sorted_fpath, err_fpath, logger, sort_rule='-n')
        bedpe_fpath = bam_to_bed(output_dirpath, ref_name, bam_pairs_sorted_fpath, err_fpath, logger, bedpe=True)
        calculate_genome_cov(bedpe_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
    return raw_cov_fpath
Exemple #35
0
def run_bwa(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    """Align every read library with ``bwa mem`` and collect the SAM paths.

    A library is either one path (string; ``-p`` is added unless
    ``reads_type`` is ``'single'``) or a pair of paths. Each SAM is named
    after ``sam_fpath`` with a ``<reads_type><index>`` suffix and appended to
    ``out_sam_fpaths``; existing non-empty SAMs are not regenerated.

    For freshly aligned ``'paired_end'`` libraries the insert size is
    estimated, and the largest one below ``qconfig.ideal_assembly_max_IS`` is
    stored in ``qconfig.ideal_assembly_insert_size``.
    """
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for lib_number, reads in enumerate(read_fpaths, 1):
        if isinstance(reads, str):
            paired_flag = ' ' if reads_type == 'single' else ' -p '
            cmd = bwa_cmd + paired_flag + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmd = ' '.join([bwa_cmd, ref_fpath, read1, read2])

        output_fpath = add_suffix(sam_fpath, reads_type + str(lib_number))
        if not is_non_empty_file(output_fpath):
            qutils.call_subprocess(shlex.split(cmd), stdout=open(output_fpath, 'w'), stderr=open(err_fpath, 'a'),
                                   logger=logger)
            if reads_type == 'paired_end':
                insert_size = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
                if insert_size < qconfig.ideal_assembly_max_IS:
                    insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.ideal_assembly_insert_size = max(insert_sizes)
Exemple #36
0
def calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath):
    """Compute the LAP score file for one assembly (or for the reference).

    Does nothing when there are no reads or no SAM file. Otherwise
    ``calc_prob.py`` writes ``<filename>.prob`` and ``sum_prob.py`` turns it
    into ``<filename>.lap.out`` (placed in the parent of ``output_dirpath``).
    An existing non-empty ``.lap.out`` file is reused.

    ``index`` being ``None`` selects the "reference" wording of log messages.
    """
    if not (reads_fpaths and sam_fpath):
        return

    for_reference = index is None
    lap_out_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.lap.out')
    if is_non_empty_file(lap_out_fpath):
        if for_reference:
            logger.info('  Using existing file with LAP score for reference...')
        else:
            logger.info('  ' + index_str + 'Using existing file with LAP score...')
        return

    if for_reference:
        logger.info('  Running LAP for reference...')
    else:
        logger.info('  ' + index_str + 'Running LAP...')
    prob_out_fpath = get_safe_fpath(output_dirpath, filename + '.prob')
    calc_cmd = [lap_fpath('calc_prob.py'), '-a', fpath, '-i', ','.join(reads_fpaths), '-q', '-s', sam_fpath]
    qutils.call_subprocess(calc_cmd, stdout=open(prob_out_fpath, 'w'), stderr=open(err_fpath, 'a'))
    sum_cmd = [lap_fpath('sum_prob.py'), '-i', prob_out_fpath]
    qutils.call_subprocess(sum_cmd, stdout=open(lap_out_fpath, 'w'), stderr=open(err_fpath, 'a'))
Exemple #37
0
def calculate_insert_size(sam_fpath, output_dir, ref_name, reads_suffix=''):
    """Estimate the insert size of a paired library from a SAM file.

    The result is cached in ``<output_dir>/<ref_name><reads_suffix>.is.txt``
    (insert size on the first line, deviation on the second) and the cache is
    reused when it can be parsed and holds a non-zero insert size.

    Only the first ~1,000,000 lines of the SAM file are inspected, and only
    records whose FLAG is 99, 147, 83 or 163 contribute their absolute TLEN
    (9th column). The insert size is the median of those values, but never
    less than ``qconfig.optimal_assembly_min_IS``; the deviation is the root
    mean square distance from the median.

    Returns ``(insert_size, std_dev)``, or ``(None, None)`` when no suitable
    read pairs were found or the median is not positive.
    """
    insert_size_fpath = join(output_dir, ref_name + reads_suffix + '.is.txt')
    if is_non_empty_file(insert_size_fpath):
        try:
            with open(insert_size_fpath) as f:
                insert_size = int(f.readline())
                # the deviation is written as a float below, so it must be
                # read back as one (int() always failed on e.g. '35.2' and
                # the cache was silently ignored)
                std_dev = float(f.readline())
            if insert_size:
                return insert_size, std_dev
        except (IOError, ValueError):
            # unreadable or malformed cache: fall through and recompute
            pass

    insert_sizes = []
    mapped_flags = ['99', '147', '83', '163']  # reads mapped in correct orientation and within insert size
    with open(sam_fpath) as sam_in:
        for i, line in enumerate(sam_in):
            if i > 1000000:
                break
            if line.startswith('@'):
                continue
            fs = line.split('\t')
            if len(fs) < 9:
                # truncated record without a TLEN column
                continue
            if fs[1] not in mapped_flags:
                continue
            insert_sizes.append(abs(int(fs[8])))

    if not insert_sizes:
        return None, None

    insert_sizes.sort()
    middle = len(insert_sizes) // 2
    if len(insert_sizes) % 2 == 1:  # odd number of values
        median_is = insert_sizes[middle]
    else:  # even number of values - take the avg of central
        median_is = (insert_sizes[middle] + insert_sizes[middle - 1]) // 2
    if median_is <= 0:
        return None, None
    std_dev = sqrt(sum([(value - median_is) ** 2 for value in insert_sizes]) / len(insert_sizes))
    insert_size = max(qconfig.optimal_assembly_min_IS, median_is)
    with open(insert_size_fpath, 'w') as out_f:
        out_f.write(str(insert_size) + '\n')
        out_f.write(str(std_dev))
    return insert_size, std_dev
Exemple #38
0
def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath, features_containers, cov_fpath, output_dir, logger):
    """Generate the Circos configuration and, if Circos is available, the plot.

    ``output_dir`` is created when missing and ``create_conf`` prepares the
    configuration and the legend file. When the ``circos`` executable is not
    found a warning with manual instructions is logged.

    Returns ``(png_fpath, legend_fpath)`` on success and ``(None, None)`` when
    Circos is missing, exits with a non-zero code, or leaves no image behind.
    """
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir,
                                                  gc_fpath, features_containers, cov_fpath, logger)

    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        how_to_install = ('Circos is not installed!\n'
                          'If you want to create Circos plots, install Circos as described at http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
                          'and run the following command:\n circos -conf ')
        legend_note = ('.\n '
                       'The plot annotation is saved to ')
        logger.warning(how_to_install + conf_fpath + legend_note + circos_legend_fpath)
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    return_code = qutils.call_subprocess(cmdline, stdout=open(log_fpath, 'w'), stderr=open(err_fpath, 'w'))
    if return_code != 0 or not is_non_empty_file(circos_png_fpath):
        logger.warning('  Circos diagram was not created. See ' + log_fpath + ' and ' + err_fpath + ' for details')
        return None, None
    return circos_png_fpath, circos_legend_fpath
Exemple #39
0
def download_ref(organism, ref_fpath):
    """Download a reference genome for *organism* from NCBI E-utilities into *ref_fpath*.

    Steps:
      1. ``esearch`` the ``assembly`` database for the organism (up to 100 ids).
      2. For each assembly id, ``elink`` to ``nuccore`` (RefSeq first, then INSDC)
         and keep the non-empty link set with the fewest links; stop early once
         a set with fewer than 3 links is found.
      3. ``efetch`` every linked sequence as FASTA and write all of them,
         ordered with ``natural_sort_key``, to *ref_fpath*.

    Args:
        organism: organism name; underscores are replaced with ``+`` for the query.
        ref_fpath: destination path of the combined FASTA file.

    Returns:
        *ref_fpath* on success; ``None`` if a request fails, the organism or its
        sequences are not found, the best link set has more than 500 entries,
        or nothing was written (an empty output file is removed).
    """
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    quast_fields = '&tool=quast&[email protected]'
    organism = organism.replace('_', '+')
    response = try_send_request(
        ncbi_url +
        'esearch.fcgi?db=assembly&term=%s+[Organism]&retmax=100' % organism +
        quast_fields)
    if not response:
        return None
    xml_tree = ET.fromstring(response)

    # Error replies may lack <Count>/<IdList>; treat them like "not found"
    # instead of failing with AttributeError on None.
    count = xml_tree.find('Count')
    if count is None or count.text == '0':  # Organism is not found
        return None
    id_list = xml_tree.find('IdList')
    if id_list is None:
        return None

    databases = ['assembly_nuccore_refseq', 'assembly_nuccore_insdc']
    best_ref_links = []
    for assembly_id in id_list.findall('Id'):
        for db in databases:
            response = try_send_request(
                ncbi_url +
                'elink.fcgi?dbfrom=assembly&db=nuccore&id=%s&linkname="%s"' %
                (assembly_id.text, db) + quast_fields)
            if not response:
                continue
            link_set = ET.fromstring(response).find('LinkSet')
            if link_set is None:
                continue
            link_db = link_set.find('LinkSetDb')
            if link_db is None:
                continue
            ref_links = link_db.findall('Link')
            if not ref_links:
                # An empty link list must never replace a usable candidate.
                continue
            if best_ref_links and len(ref_links) > len(best_ref_links):
                continue
            best_ref_links = ref_links
            break
        if best_ref_links and len(best_ref_links) < 3:
            break

    if not best_ref_links:
        return None

    if len(best_ref_links) > 500:
        logger.info(
            '%s has too fragmented reference genome! It will not be downloaded.'
            % organism.replace('+', ' '))
        return None

    ref_ids = sorted(link.find('Id').text for link in best_ref_links)
    fasta_files = []
    for ref_id in ref_ids:
        fasta = try_send_request(
            ncbi_url +
            'efetch.fcgi?db=sequences&id=%s&rettype=fasta&retmode=text' %
            ref_id)
        # Keep only replies that look like FASTA records.
        if fasta and fasta[0] == '>':
            fasta_files.append(fasta)
    fasta_names = [f.split('|')[-1] for f in fasta_files]
    with open(ref_fpath, "w") as fasta_file:
        ordered = sorted(zip(fasta_names, fasta_files), key=natural_sort_key)
        # Records are separated by exactly one newline, with no trailing newline.
        fasta_file.write('\n'.join(fasta.rstrip() for _, fasta in ordered))

    if not os.path.isfile(ref_fpath):
        return None
    if not is_non_empty_file(ref_fpath):
        os.remove(ref_fpath)
        return None

    return ref_fpath
Exemple #40
0
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None, bam_fpath=None,
                      index=None, required_files=None, is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads against one FASTA file (or reuse existing alignments) and collect statistics.

    Existing SAM/BAM files are reused when ``get_correct_names_for_chroms``
    returns a result for them; otherwise, if ``qconfig.reads_fpaths`` is set,
    BWA is run (``bwa_index`` + ``align_reads``) with ``output_dirpath`` as the
    working directory. Afterwards a BAM file is produced from the SAM file and,
    unless *alignment_only* is set, ``sambamba flagstat`` output is written to
    the ``.stat`` file and ``analyse_coverage`` is called.

    Args:
        fpath: FASTA file to align to.
        main_output_dir: passed through to ``align_reads``.
        output_dirpath: directory for SAM/BAM/intermediate files.
        log_path: log file path, only mentioned in the error message.
        err_fpath: file that receives stderr of the external tools.
        max_threads: thread count handed to the external tools.
        sam_fpath, bam_fpath: optional pre-existing alignment files.
        index: assembly index used as a log prefix (``None`` for no prefix).
        required_files: optional list of files that must exist for an early
            return; ``sam_fpath`` may be appended to it in place.
        is_reference: whether *fpath* is the reference (affects log messages).
        alignment_only: skip flag statistics and coverage analysis.
        using_reads: reads subset label; anything other than ``'all'`` is
            embedded in the SAM/BAM file names.

    Returns:
        ``(correct_chr_names, sam_fpath, bam_fpath)`` on success, or
        ``(None, None, None)`` when nothing can be reused and no alignment
        could be produced.
    """
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        # With alignment_only=True the caller may have left required_files at
        # its None default; appending to None used to raise AttributeError.
        if required_files is None:
            required_files = []
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    if correct_chr_names and (not required_files or all(isfile(fpath) for fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                flagstat_cmd = [sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath]
                # Context managers close the redirect targets once flagstat is done.
                with open(stats_fpath, 'w') as stats_f:
                    with open(err_fpath, 'a') as err_f:
                        qutils.call_subprocess(flagstat_cmd, stdout=stats_f, stderr=err_f)
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info('  ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info('  Running BWA for reference...')
        else:
            logger.info('  ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            bwa_index(fpath, err_fpath, logger)
            sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)

            if len(sam_fpaths) > 1:
                merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath)
            elif len(sam_fpaths) == 1:
                shutil.move(sam_fpaths[0], sam_fpath)
                tmp_bam_fpath = sam_fpaths[0].replace('.sam', '.bam')
                if is_non_empty_file(tmp_bam_fpath):
                    shutil.move(tmp_bam_fpath, bam_fpath)

            logger.info('  ' + index_str + 'Done.')
        finally:
            # Always restore the working directory, even if BWA handling raised.
            os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error('  Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)

    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None
    if is_reference:
        logger.info('  Sorting SAM-file for reference...')
    else:
        logger.info('  ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            flagstat_cmd = [sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath]
            with open(stats_fpath, 'w') as stats_f:
                with open(err_fpath, 'a') as err_f:
                    qutils.call_subprocess(flagstat_cmd, stdout=stats_f, stderr=err_f)
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if is_reference:
            logger.info('  Analysis for reference is finished.')
        else:
            logger.info('  ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
Exemple #41
0
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    """Align reads to all assemblies (and the reference) and derive SV/coverage files.

    ``align_single_file`` is run through ``run_parallel`` for every assembly
    and, when given, for the reference. The resulting SAM/BAM paths are stored
    in ``qconfig``; read statistics are added to the reports. When a reference
    is present and some of the BED/coverage files still have to be produced,
    the reference BAM is filtered and sorted, coverage is computed with
    ``get_coverage`` and structural variations are searched for
    (trivial deletions, plus GRIDSS when it is available).

    Args:
        contigs_fpaths: assembly FASTA paths.
        main_ref_fpath: reference FASTA path, or a falsy value for no reference.
        meta_ref_fpaths: per-reference FASTA paths; when set, the SAM header is
            split into one SAM file per reference.
        ref_labels: mapping from sequence name to reference name.
        temp_output_dir: directory for intermediate files.
        output_dir: directory for BED/coverage/stat files.
        log_path: log file path passed to the helpers.
        err_fpath: file that receives stderr of the external tools.

    Returns:
        ``(bed_fpath, cov_fpath, physical_cov_fpath)``; entries are ``None``
        for results that were disabled or could not be produced, and all three
        are ``None`` when there is no reference or the reference alignment
        failed.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info('  Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info('  Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info('  Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info('  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # Nothing left to compute: every requested output already exists or is disabled.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index) for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # The reference job goes last, so its results are at index -1 below.
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info('  Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        # Collect the SAM header lines and the sequence lengths from @SQ records.
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    # The handle stays open on purpose: it is handed to
                    # search_trivial_deletions via ref_files.
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception as e:
                # GRIDSS is best-effort, so keep going, but say why it failed and
                # let KeyboardInterrupt/SystemExit propagate (a bare except hid both).
                logger.info('  Failed searching structural variations with GRIDSS: ' + str(e))
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        # Report a failure only when coverage was actually requested. The
        # condition used to be negated, so the message appeared exactly when
        # coverage had been disabled on purpose.
        if qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Exemple #42
0
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    """Copy read-mapping statistics from ``.stat`` / ``.lap.out`` files into the reports.

    Reference statistics (when ``<ref>.stat`` exists in *output_dir*) are added
    to the report of every assembly. Assembly-specific fields are added only
    for assemblies that have their own ``.stat`` file; the LAP score fields are
    filled from the first token of the first line of the ``.lap.out`` files.
    """
    from quast_libs import reporting

    def _read_lap_score(lap_fpath):
        # First whitespace-separated token of the first line, as a float.
        with open(lap_fpath) as lap_file:
            first_line = lap_file.readline()
            return float(first_line.split()[0]) if first_line else None

    def _format_lap(score):
        return '%.3f' % score if score is not None else None

    # (report field name, statistics key) pairs, in the order they are reported.
    ref_fields = (
        ('REF_MAPPED_READS', 'mapped'),
        ('REF_MAPPED_READS_PCNT', 'mapped_pcnt'),
        ('REF_PROPERLY_PAIRED_READS', 'paired'),
        ('REF_PROPERLY_PAIRED_READS_PCNT', 'paired_pcnt'),
        ('REF_SINGLETONS', 'singletons'),
        ('REF_SINGLETONS_PCNT', 'singletons_pcnt'),
        ('REF_MISJOINT_READS', 'misjoint'),
        ('REF_MISJOINT_READS_PCNT', 'misjoint_pcnt'),
        ('REF_DEPTH', 'depth'),
    )
    mapping_fields = (
        ('TOTAL_READS', 'total'),
        ('LEFT_READS', 'left'),
        ('RIGHT_READS', 'right'),
        ('MAPPED_READS', 'mapped'),
        ('MAPPED_READS_PCNT', 'mapped_pcnt'),
        ('PROPERLY_PAIRED_READS', 'paired'),
        ('PROPERLY_PAIRED_READS_PCNT', 'paired_pcnt'),
    )
    pairing_fields = (
        ('SINGLETONS', 'singletons'),
        ('SINGLETONS_PCNT', 'singletons_pcnt'),
        ('MISJOINT_READS', 'misjoint'),
        ('MISJOINT_READS_PCNT', 'misjoint_pcnt'),
        ('DEPTH', 'depth'),
    )

    ref_reads_stats = None
    ref_lap_score = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        ref_stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(ref_stats_fpath):
            ref_reads_stats = parse_reads_stats(ref_stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info('  BWA: nothing aligned for reference.')
        ref_lap_fpath = get_safe_fpath(output_dir, ref_name + '.lap.out')
        if is_non_empty_file(ref_lap_fpath):
            ref_lap_score = _read_lap_score(ref_lap_fpath)

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')

        if ref_reads_stats:
            for field_name, stat_key in ref_fields:
                report.add_field(getattr(reporting.Fields, field_name), ref_reads_stats[stat_key])
            ref_coverage = ref_reads_stats['coverage_thresholds']
            if ref_coverage and len(ref_coverage) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_coverage[i] for i in range(len(qconfig.coverage_thresholds))])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_coverage[0])

        # Assemblies without their own statistics file get reference fields only.
        if not isfile(stats_fpath):
            continue

        reads_stats = parse_reads_stats(stats_fpath)
        for field_name, stat_key in mapping_fields:
            report.add_field(getattr(reporting.Fields, field_name), reads_stats[stat_key])
        if int(reads_stats['mapped']) == 0:
            logger.info('  ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        for field_name, stat_key in pairing_fields:
            report.add_field(getattr(reporting.Fields, field_name), reads_stats[stat_key])
        coverage = reads_stats['coverage_thresholds']
        if coverage and len(coverage) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                             [coverage[i] for i in range(len(qconfig.coverage_thresholds))])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, coverage[0])

        lap_out_fpath = get_safe_fpath(output_dir, assembly_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            report.add_field(reporting.Fields.LAP_SCORE, _format_lap(_read_lap_score(lap_out_fpath)))
        report.add_field(reporting.Fields.REF_LAP_SCORE, _format_lap(ref_lap_score))
Exemple #43
0
def download_refs(organism, ref_fpath):
    """Download a reference genome for *organism* from NCBI E-utilities into *ref_fpath*.

    The ``assembly`` database is searched for the organism; for every found
    assembly the linked ``nuccore`` records are looked up (RefSeq first, then
    INSDC) and the non-empty link set with the fewest links is kept, stopping
    early once a set with fewer than 3 links is found. All linked sequences are
    fetched as FASTA and written, ordered with ``natural_sort_key``, to
    *ref_fpath*.

    Args:
        organism: organism name; underscores are replaced with ``+`` for the query.
        ref_fpath: destination path of the combined FASTA file.

    Returns:
        *ref_fpath* on success; ``None`` if a request fails, the organism or its
        sequences are not found, the best link set has more than 500 entries,
        or nothing was written (an empty output file is removed).
    """
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    quast_fields = '&tool=quast&[email protected]'
    organism = organism.replace('_', '+')
    response = try_send_request(ncbi_url + 'esearch.fcgi?db=assembly&term=%s+[Organism]&retmax=100' % organism + quast_fields)
    if not response:
        return None
    xml_tree = ET.fromstring(response)

    # Error replies may lack <Count>/<IdList>; treat them like "not found"
    # instead of failing with AttributeError on None.
    count = xml_tree.find('Count')
    if count is None or count.text == '0':  # Organism is not found
        return None
    id_list = xml_tree.find('IdList')
    if id_list is None:
        return None

    databases = ['assembly_nuccore_refseq', 'assembly_nuccore_insdc']
    best_ref_links = []
    for assembly_id in id_list.findall('Id'):
        for db in databases:
            response = try_send_request(
                ncbi_url + 'elink.fcgi?dbfrom=assembly&db=nuccore&id=%s&linkname="%s"' % (assembly_id.text, db) + quast_fields)
            if not response:
                continue
            link_set = ET.fromstring(response).find('LinkSet')
            if link_set is None:
                continue
            link_db = link_set.find('LinkSetDb')
            if link_db is None:
                continue
            ref_links = link_db.findall('Link')
            if not ref_links:
                # An empty link list must never replace a usable candidate.
                continue
            if best_ref_links and len(ref_links) > len(best_ref_links):
                continue
            best_ref_links = ref_links
            break
        if best_ref_links and len(best_ref_links) < 3:
            break

    if not best_ref_links:
        return None

    if len(best_ref_links) > 500:
        logger.info('%s has too fragmented reference genome! It will not be downloaded.' % organism.replace('+', ' '))
        return None

    ref_ids = sorted(link.find('Id').text for link in best_ref_links)
    fasta_files = []
    for ref_id in ref_ids:
        fasta = try_send_request(ncbi_url + 'efetch.fcgi?db=sequences&id=%s&rettype=fasta&retmode=text' % ref_id)
        # Keep only replies that look like FASTA records.
        if fasta and fasta[0] == '>':
            fasta_files.append(fasta)
    fasta_names = [f.split('|')[-1] for f in fasta_files]
    with open(ref_fpath, "w") as fasta_file:
        ordered = sorted(zip(fasta_names, fasta_files), key=natural_sort_key)
        # Records are separated by exactly one newline, with no trailing newline.
        fasta_file.write('\n'.join(fasta.rstrip() for _, fasta in ordered))

    if not os.path.isfile(ref_fpath):
        return None
    if not is_non_empty_file(ref_fpath):
        os.remove(ref_fpath)
        return None

    return ref_fpath
Exemple #44
0
def bwa_index(ref_fpath, err_path, logger):
    """Run ``bwa index`` for *ref_fpath* unless a non-empty ``<ref_fpath>.bwt`` exists.

    The index prefix equals *ref_fpath*. For references larger than 2 GB the
    ``bwtsw`` algorithm is requested explicitly. Both stdout and stderr of BWA
    are appended to *err_path*.
    """
    cmd = [bwa_fpath('bwa'), 'index']
    if getsize(ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
        cmd += ['-a', 'bwtsw']
    # Options go before the input file: POSIX-style getopt stops parsing at the
    # first operand, so a trailing "-a bwtsw" may be ignored on some platforms.
    cmd += ['-p', ref_fpath, ref_fpath]
    if not is_non_empty_file(ref_fpath + '.bwt'):
        # One handle for both streams, closed when the subprocess is done
        # (previously two handles were opened and never closed).
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(cmd, stdout=err_f, stderr=err_f, logger=logger)
Exemple #45
0
def main(in_fpath, out_fname):
    """
    Run a BUSCO genome analysis for ``in_fpath`` and return the path of its short summary.

    The configuration file is taken from the ``BUSCO_CONFIG_FILE`` environment
    variable when it points to a readable file, otherwise ``config.ini.default``
    next to this module is used. If a non-empty summary already exists in the
    run directory, it is reused and no analysis is started.

    :param in_fpath: input file passed to the BUSCO configuration as ``in``
    :param out_fname: run name passed to the BUSCO configuration as ``out``
    :returns: path of ``short_summary_<out_fname>.txt``
    :raises SystemExit: if any errors occur
    """
    started_at = time.time()

    # 1) Pick the config file; BuscoConfig merges it with the arguments given here
    env_config = os.environ.get('BUSCO_CONFIG_FILE')
    if env_config and os.access(env_config, os.R_OK):
        config_file = env_config
    else:
        config_file = '%s//config.ini.default' % os.path.dirname(os.path.realpath(__file__))
    config = BuscoConfig(config_file, args={'in': in_fpath, 'out': out_fname})

    assembly_dirpath = os.path.join(config.get('busco', 'out_path'), 'run_%s' % out_fname)
    if not isdir(assembly_dirpath):
        os.makedirs(assembly_dirpath)
    summary_path = os.path.join(assembly_dirpath, 'short_summary_%s.txt' % out_fname)

    # The run directory is set on PipeLogger before the analysis modules are imported.
    from quast_libs.busco import pipebricks
    pipebricks.PipeLogger.run_dirpath = assembly_dirpath
    from quast_libs.busco.GenomeAnalysis import GenomeAnalysis
    from quast_libs.busco.BuscoAnalysis import BuscoAnalysis
    from quast_libs.busco.pipebricks.Toolset import ToolException
    BuscoAnalysis._logger.reload_log()
    logger = BuscoAnalysis._logger

    if is_non_empty_file(summary_path):
        logger.info('Using existing BUSCO files for ' + out_fname + '...')
        return summary_path

    def _report_failure():
        # Common tail of every failure path.
        logger.error('BUSCO analysis failed !')
        logger.error(
            'Check the logs, read the user guide, if you still need technical '
            'support, then please contact %s\n' % BuscoConfig.CONTACT)

    try:
        try:
            logger.info(
                '****************** Start a BUSCO %s analysis, current time: %s **'
                '****************' % (BuscoConfig.VERSION, time.strftime('%m/%d/%Y %H:%M:%S')))
            logger.info('Configuration loaded from %s' % config_file)

            # 2) Build the analysis object, 3) run it
            analysis = GenomeAnalysis(config)
            analysis.run_analysis()

            if logger.has_warning():
                done_template = 'BUSCO analysis done with WARNING(s). Total running time: %s seconds'
            else:
                done_template = 'BUSCO analysis done. Total running time: %s seconds'
            logger.info(done_template % str(time.time() - started_at))

            logger.info('Results written in %s\n' % analysis.mainout)

        except ToolException as e:
            # Tool failures are logged, then handled like any other exit request below.
            logger.error(e)
            raise SystemExit

    except SystemExit:
        _report_failure()
        raise SystemExit

    except KeyboardInterrupt:
        logger.error('A signal was sent to kill the process')
        _report_failure()
        raise SystemExit

    except BaseException:
        logger.critical('Unhandled exception occurred: %s\n' % traceback.format_exception(*sys.exc_info()))
        _report_failure()
        raise SystemExit

    return summary_path
Exemple #46
0
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None, bam_fpath=None,
                      index=None, required_files=None, is_reference=False, alignment_only=False, using_reads='all'):
    """Align reads against one FASTA file (or reuse existing alignments) and collect statistics.

    Existing SAM/BAM files are reused when ``get_correct_names_for_chroms``
    returns a result for them; otherwise, if ``qconfig.reads_fpaths`` is set,
    BWA is run (``bwa_index`` + ``align_reads``) with ``output_dirpath`` as the
    working directory. Afterwards a BAM file is produced from the SAM file and,
    unless *alignment_only* is set, ``sambamba flagstat`` output is written to
    the ``.stat`` file, ``analyse_coverage`` is called and the LAP score is
    computed with ``calc_lap_score``.

    Args:
        fpath: FASTA file to align to.
        main_output_dir: passed through to ``align_reads`` and ``merge_sam_files``.
        output_dirpath: directory for SAM/BAM/intermediate files.
        log_path: log file path, only mentioned in the error message.
        err_fpath: file that receives stderr of the external tools.
        max_threads: thread count handed to the external tools.
        sam_fpath, bam_fpath: optional pre-existing alignment files.
        index: assembly index used as a log prefix (``None`` for no prefix).
        required_files: optional list of files that must exist for an early
            return; ``sam_fpath`` may be appended to it in place.
        is_reference: whether *fpath* is the reference (affects log messages).
        alignment_only: skip flag statistics, coverage analysis and LAP score.
        using_reads: reads subset label; anything other than ``'all'`` is
            embedded in the SAM/BAM file names.

    Returns:
        ``(correct_chr_names, sam_fpath, bam_fpath)`` on success, or
        ``(None, None, None)`` when nothing can be reused and no alignment
        could be produced.
    """
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        # With alignment_only=True the caller may have left required_files at
        # its None default; appending to None used to raise AttributeError.
        if required_files is None:
            required_files = []
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''

    reads_fpaths = qconfig.reads_fpaths
    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    if correct_chr_names and (not required_files or all(isfile(fpath) for fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                flagstat_cmd = [sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath]
                # Context managers close the redirect targets once flagstat is done.
                with open(stats_fpath, 'w') as stats_f:
                    with open(err_fpath, 'a') as err_f:
                        qutils.call_subprocess(flagstat_cmd, stdout=stats_f, stderr=err_f)
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
            calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info('  ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info('  Running BWA for reference...')
        else:
            logger.info('  ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            bwa_index(fpath, err_fpath, logger)
            sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)

            if len(sam_fpaths) > 1:
                merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, main_output_dir, max_threads, err_fpath)
            elif len(sam_fpaths) == 1:
                shutil.move(sam_fpaths[0], sam_fpath)
                sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

            logger.info('  ' + index_str + 'Done.')
        finally:
            # Always restore the working directory, even if BWA handling raised.
            os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error('  Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None
    if is_reference:
        logger.info('  Sorting SAM-file for reference...')
    else:
        logger.info('  ' + index_str + 'Sorting SAM-file...')

    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info('  ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info('  ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            flagstat_cmd = [sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath]
            with open(stats_fpath, 'w') as stats_f:
                with open(err_fpath, 'a') as err_f:
                    qutils.call_subprocess(flagstat_cmd, stdout=stats_f, stderr=err_f)
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
        if is_reference:
            logger.info('  Analysis for reference is finished.')
        else:
            logger.info('  ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
Exemple #47
0
def download_ref(organism, ref_fpath, max_ref_fragments):
    """Try to download a reference genome for ``organism`` from NCBI.

    Underscores in ``organism`` are replaced with ``+``; an embedded
    ``+isolate+`` / ``+strain+`` marker splits off an isolate / strain name
    that is sent as a separate ``[Isolate]`` / ``[Strain]`` search term.

    Assembly ids are taken from an ``esearch`` request against the
    ``assembly`` database.  Sequence links are first looked up with the
    RefSeq/INSDC link name and, if that yields nothing, with the WGS master
    link name.  RefSeq/INSDC sequences are fetched with ``efetch`` in chunks
    of 200 ids and written to ``ref_fpath``; for WGS master the download is
    delegated to ``download_wgsmaster_contigs``.

    Returns ``ref_fpath`` if a non-empty file exists there afterwards,
    otherwise ``None`` (request failed, organism not found, more than
    ``max_ref_fragments`` links, or nothing usable was downloaded).  An empty
    file left at ``ref_fpath`` is removed.
    """
    organism = organism.replace('_', '+')
    isolate = ''
    strain = ''
    # maxsplit=1 keeps the two-way unpacking valid even if a marker repeats
    if '+isolate+' in organism:
        organism, isolate = organism.split('+isolate+', 1)
    if '+strain+' in organism:
        organism, strain = organism.split('+strain+', 1)

    response = try_send_request(
        ncbi_url +
        'esearch.fcgi?db=assembly&term=%s+[Organism]%s%s&retmax=100%s' %
        (organism, (isolate + '+[Isolate]') if isolate else '',
         (strain + '+[Strain]') if strain else '', quast_fields))
    if not response:
        return None
    xml_tree = ET.fromstring(response)

    if xml_tree.find('Count').text == '0':  # Organism is not found
        return None

    ref_id_list = xml_tree.find('IdList').findall('Id')
    best_ref_links = get_download_links(
        ref_id_list, "assembly_nuccore_refseq+OR+assembly_nuccore_insdc")
    used_db = "refseq"
    if not best_ref_links:
        # nothing in RefSeq/INSDC: fall back to the WGS master record
        used_db = "wgsmaster"
        best_ref_links = get_download_links(ref_id_list,
                                            "assembly_nuccore_wgsmaster")

    if len(best_ref_links) > max_ref_fragments:
        logger.info(
            '%s has too fragmented reference genome! It will not be downloaded.'
            % organism.replace('+', ' '))
        return None

    if used_db == "refseq" and best_ref_links:
        ref_ids = sorted(link.find('Id').text for link in best_ref_links)
        wrote_first_piece = False
        fasta_files = []
        chunk_size = 200  # number of ids sent per efetch request
        for i in range(0, len(ref_ids), chunk_size):
            fasta = try_send_request(
                ncbi_url +
                'efetch.fcgi?db=sequences&id=%s&rettype=fasta&retmode=text' %
                ','.join(ref_ids[i:i + chunk_size]))
            # only accept responses that start like a FASTA record
            if fasta and fasta[0] == '>':
                fasta_files.extend(fasta.rstrip().split('\n\n'))
        # the text before the first space of each record is used as its name
        fasta_names = [f.split(' ')[0] for f in fasta_files]
        with open(ref_fpath, "w") as fasta_file:
            for name, fasta in sorted(zip(fasta_names, fasta_files),
                                      key=natural_sort_key):
                # records after the first one are separated by a single newline
                if not wrote_first_piece:
                    wrote_first_piece = True
                else:
                    fasta = '\n' + fasta.rstrip()
                fasta_file.write(fasta.rstrip())
    elif best_ref_links:  ## download WGS assembly
        try:
            download_wgsmaster_contigs(best_ref_links[0].find('Id').text,
                                       ref_fpath)
        except Exception:
            # best effort: a failed download is reported and handled below,
            # while KeyboardInterrupt/SystemExit are allowed to propagate
            logger.info('Failed downloading %s!' % organism.replace('+', ' '))

    if not os.path.isfile(ref_fpath):
        return None
    if not is_non_empty_file(ref_fpath):
        os.remove(ref_fpath)
        return None

    return ref_fpath
Exemple #48
0
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    """Align reads to every assembly (and the reference) and derive SV/coverage files.

    Steps, in order:
      1. If ``main_ref_fpath`` is given, choose the BED / coverage / physical
         coverage paths (from ``qconfig`` or ``output_dir``) and decide which of
         them still have to be produced (``required_files``).
      2. Run ``align_single_file`` through ``run_parallel`` for each contigs file
         and, if present, for the reference; the resulting SAM/BAM paths are
         stored back into ``qconfig``.
      3. For the reference, when some required file is missing: build the
         mapped/sorted BAM and sorted SAM, compute coverage via ``get_coverage``
         and search structural variations (trivial deletions, plus GRIDSS when
         ``get_gridss_fpath()`` points to an existing file).

    Returns a tuple ``(bed_fpath, cov_fpath, physical_cov_fpath)``; individual
    items are ``None`` when the corresponding file is not produced, and the
    whole tuple is ``(None, None, None)`` when there is no reference or the
    reference alignment did not yield both a SAM and a BAM path.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        # user-supplied paths from qconfig win over the default output locations
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info('  Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info('  Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info('  Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info('  Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info('  Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info('  Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # nothing left to produce: SV result is available/disabled and coverage is available/disabled
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # one job per assembly plus one slot for the reference, capped by the thread budget
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index) for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # the reference is appended last, so its results are the last items of the returned lists
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info('  Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,  filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info('  Splitting SAM-file by references...')
        # collect the SAM header lines and the sequence lengths declared in @SQ records
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info('    Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    # The handle stays open and is handed to search_trivial_deletions via ref_files.
                    # NOTE(review): it is never closed in this function - presumably the callee closes it; confirm.
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    # write the first header line (unless it is an @SQ record), then only this
                    # reference's @SQ records, then the last header line
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception:
                # GRIDSS is optional: keep going with trivial deletions only, but leave a trace
                # instead of silently hiding the failure (and let Ctrl-C/SystemExit through).
                logger.info('  Failed running GRIDSS; falling back to trivial deletions only.')
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info('  Structural variations are in ' + bed_fpath)
        else:
            # an existing but empty BED file means the search ran and found nothing
            if isfile(bed_fpath):
                logger.main_info('  No structural variations were found.')
            else:
                logger.main_info('  Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info('  Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        # Report a failure only when coverage was actually requested; when it is disabled
        # the "Will not calculate coverage" message above has already been printed.
        if qconfig.create_icarus_html:
            logger.main_info('  Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
Exemple #49
0
def download_blastdb(logger=logger, only_clean=False):
    """Make sure the SILVA BLAST database is available locally (or remove it).

    The target directory comes from ``get_dir_for_download``; the module-level
    ``blastdb_dirpath`` and ``db_fpath`` globals are set as a side effect.

    With ``only_clean=True`` the directory is removed (if it exists) and
    ``True`` is returned.  Otherwise, unless a ``.nsq`` file of at least
    ``db_nsq_fsize`` bytes is already present, the gzipped SILVA file is
    downloaded (if missing), unpacked with spaces replaced by underscores,
    and turned into a BLAST database with ``makeblastdb``.

    Returns ``True`` on success and ``False`` if no directory is available,
    the download fails, or the resulting ``.nsq`` file is missing/too small.
    """
    global blastdb_dirpath, db_fpath
    blastdb_dirpath = get_dir_for_download('silva', 'Silva', [silva_downloaded_fname + '.nsq'], logger, only_clean=only_clean)
    if not blastdb_dirpath:
        return False

    if only_clean:
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True

    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    # an already built database of plausible size is reused as is
    if os.path.isfile(db_fpath + '.nsq') and os.path.getsize(db_fpath + '.nsq') >= db_nsq_fsize:
        return True
    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()

    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        # download under a temporary name so an interrupted transfer is never mistaken for a complete one
        silva_download_in_progress_path = db_gz_fpath + '.download'
        try:
            silva_download.retrieve(silva_remote_fpath, silva_download_in_progress_path, show_progress)
            if not qutils.is_non_empty_file(silva_download_in_progress_path, min_size=1024*1024):
                raise ValueError
        except Exception:
            logger.error(
                'Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                'Try to download it manually, put under %s/ and restart your command.' % (silva_remote_fpath, blastdb_dirpath))
            return False
        shutil.move(silva_download_in_progress_path, db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not qutils.is_non_empty_file(silva_fpath):
        logger.info('Unpacking and replacing " " with "_"...')

        unpacked_fpath = silva_fpath + ".unpacked"
        # Pass argv as a list (safe for paths with spaces) and close the redirect
        # targets as soon as the subprocess call returns.
        with open(unpacked_fpath, 'w') as unpacked_file:
            with open(log_fpath, 'a') as log_file:
                qutils.call_subprocess(['gunzip', '-c', db_gz_fpath], stdout=unpacked_file, stderr=log_file, logger=logger)

        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    makeblastdb_args = [get_blast_fpath('makeblastdb'), '-in', silva_fpath, '-dbtype', 'nucl', '-out', db_fpath]
    # human-readable form of the command, shown to the user if the build fails
    cmd = ' '.join(makeblastdb_args)
    with open(log_fpath, 'a') as log_file:
        qutils.call_subprocess(makeblastdb_args, stdout=log_file, stderr=log_file, logger=logger)
    if not os.path.exists(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return False
    elif not qconfig.debug:
        # intermediate files are kept only in debug mode
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True
Exemple #50
0
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer based analysis of the assemblies against the reference.

    For every assembly for which ``check_kmc_successful_check`` succeeds, the
    values stored in ``<label>.stat`` are loaded into the report and the
    assembly is not processed again.  For the remaining assemblies the function

      * counts k-mers of the reference and of each assembly and reports the
        percentage of reference k-mers found in the assembly
        (``KMER_COMPLETENESS``);
      * intersects the per-assembly results, downsamples the shared k-mers and
        uses them as markers to find out to how many reference sequences each
        long contig maps (``KMER_SCAFFOLDS_ONE_CHROM`` / ``MULTI_CHROM`` /
        ``NONE_CHROM``), unless the assembly or the reference is considered
        too fragmented;
      * writes a stats file per assembly via ``create_kmc_stats_file``.

    Intermediate files live in ``<output_dir>/tmp``, which is removed at the
    end unless ``qconfig.debug`` is set.  Always returns ``None``.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            with open(kmc_stats_fpath) as stats_file:
                stats_content = stats_file.read().split('\n')
            # str.split() always yields at least one item, so check the first line itself;
            # an empty stats file means the assembly has to be analysed again
            if not stats_content[0].strip():
                continue
            logger.info('  Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            # each stats line is parsed by taking the text after its last ': '
            report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    # only assemblies without reusable results are processed below
    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning('  Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # truncate (or create) the log files
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    # k-mers shared by the reference and all processed assemblies
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    # reference files of 500 MiB or more get a larger kmer_fraction value
    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000

    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)

    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    # dump the downsampled shared k-mers found in each reference sequence as FASTA records
    with open(shared_kmers_fpath, 'w') as out_f:
        for name, seq in ref_contigs.items():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq, intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')

    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)
    # one marker database per reference sequence: its k-mers intersected with the shared markers
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db], log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        # these stay None when scaffolding accuracy is not assessed for this assembly
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue

                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db], log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                # contigs with too few markers are ignored (they end up in the "none" bucket)
                if kmers_cnt < MIN_MARKERS:
                    continue
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db], log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')