def create_housekeeping_file(chr_lengths, max_points, root_dir, output_dir, logger):
    max_ideograms = len(chr_lengths.keys())
    template_fpath = None
    circos_bin_fpath = get_path_to_program('circos')
    if circos_bin_fpath:
        circos_dirpath = dirname(realpath(get_path_to_program('circos')))
        template_fpath = join(circos_dirpath, '..', 'libexec', 'etc', 'housekeeping.conf')
        if not is_non_empty_file(template_fpath):
            template_fpath = join(circos_dirpath, '..', 'etc', 'housekeeping.conf')
    if not is_non_empty_file(template_fpath):
        if not get_path_to_program('circos'):
            msg = 'Circos is not found.'
        else:
            msg = 'File etc/housekeeping.conf is not found.'
        logger.warning(msg + ' You will have to manually edit etc/housekeeping.conf: '
                             'set max_points_per_track to ' + str(max_points) +
                             ' and max_ideograms to ' + str(max_ideograms))
        return '<<include %s>>\n' % join('etc', 'housekeeping.conf')

    housekeeping_fpath = join(output_dir, 'housekeeping.conf')
    with open(template_fpath) as f:
        with open(housekeeping_fpath, 'w') as out_f:
            for line in f:
                if 'max_points_per_track' in line:
                    out_f.write('max_points_per_track = %d\n' % max_points)
                elif 'max_ideograms' in line:
                    out_f.write('max_ideograms = %d\n' % max_ideograms)
                else:
                    out_f.write(line)
    return '<<include %s>>\n' % relpath(housekeeping_fpath, root_dir)

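# The function above patches two limits in Circos' housekeeping.conf by plain
# line substitution. A minimal, self-contained sketch of that pattern
# (patch_housekeeping is a hypothetical helper, not part of QUAST):
def patch_housekeeping(lines, max_points, max_ideograms):
    patched = []
    for line in lines:
        if 'max_points_per_track' in line:
            patched.append('max_points_per_track = %d\n' % max_points)
        elif 'max_ideograms' in line:
            patched.append('max_ideograms = %d\n' % max_ideograms)
        else:
            patched.append(line)
    return patched

# Example: patch_housekeeping(['max_ideograms = 200\n'], 25000, 3)
# -> ['max_ideograms = 3\n']
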
def prepare_regular_quast_args(quast_py_args, combined_output_dirpath):
    opts_with_args_to_remove = ['--contig-thresholds', '--sv-bed']
    opts_to_remove = ['-s', '--scaffolds', '--combined-ref']
    for opt in opts_with_args_to_remove:
        remove_from_quast_py_args(quast_py_args, opt, arg=True)
    for opt in opts_to_remove:
        remove_from_quast_py_args(quast_py_args, opt)

    quast_py_args += ['--no-check-meta']
    qconfig.contig_thresholds = ','.join([str(threshold) for threshold in qconfig.contig_thresholds
                                          if threshold >= qconfig.min_contig])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]

    reads_stats_dirpath = os.path.join(combined_output_dirpath, qconfig.reads_stats_dirname)
    reference_name = qutils.name_from_fpath(qconfig.combined_ref_name)
    qconfig.bed = qconfig.bed or os.path.join(reads_stats_dirpath, reference_name + '.bed')
    qconfig.cov_fpath = qconfig.cov_fpath or os.path.join(reads_stats_dirpath, reference_name + '.cov')
    qconfig.phys_cov_fpath = qconfig.phys_cov_fpath or os.path.join(reads_stats_dirpath, reference_name + '.physical.cov')
    if qconfig.bed and is_non_empty_file(qconfig.bed):
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]
    if qconfig.cov_fpath and is_non_empty_file(qconfig.cov_fpath):
        quast_py_args += ['--cov']
        quast_py_args += [qconfig.cov_fpath]
    if qconfig.phys_cov_fpath and is_non_empty_file(qconfig.phys_cov_fpath):
        quast_py_args += ['--phys-cov']
        quast_py_args += [qconfig.phys_cov_fpath]

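# remove_from_quast_py_args above strips options (and, when arg=True, their values)
# from the argv re-passed to per-reference QUAST runs. A rough stdlib-only sketch of
# the idea (remove_option is hypothetical; the real helper may handle more forms):
def remove_option(argv, opt, has_arg=False):
    while opt in argv:
        idx = argv.index(opt)
        del argv[idx:idx + (2 if has_arg else 1)]
    return argv

# remove_option(['-s', '--sv-bed', 'sv.bed', 'contigs.fa'], '--sv-bed', has_arg=True)
# -> ['-s', 'contigs.fa']
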
def bam_to_bed(output_dirpath, name, bam_fpath, err_path, logger, bedpe=False):
    raw_bed_fpath = join(output_dirpath, name + '.bed')
    if bedpe:
        bedpe_fpath = join(output_dirpath, name + '.bedpe')
        if not is_non_empty_file(bedpe_fpath):
            qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath, '-bedpe'],
                                   stdout=open(bedpe_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        with open(bedpe_fpath, 'r') as bedpe_f:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe_f:
                    fs = line.split()
                    start, end = fs[1], fs[5]
                    bed_file.write('\t'.join([fs[0], start, end + '\n']))
    else:
        if not is_non_empty_file(raw_bed_fpath):
            qutils.call_subprocess([bedtools_fpath('bamToBed'), '-i', bam_fpath],
                                   stdout=open(raw_bed_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    sorted_bed_fpath = join(output_dirpath, name + '.sorted.bed')
    if not is_non_empty_file(sorted_bed_fpath):
        qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', raw_bed_fpath],
                               stdout=open(sorted_bed_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
    return sorted_bed_fpath

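# In the bedpe branch above, each BEDPE record lists both mates
# (chrom1 start1 end1 chrom2 start2 end2 ...); taking columns 0, 1 and 5 collapses
# the pair into one fragment interval, assuming both mates hit the same chromosome
# (true for the proper pairs this pipeline feeds in). Standalone illustration
# (bedpe_record_to_bed is a hypothetical helper):
def bedpe_record_to_bed(line):
    fs = line.split()
    return '\t'.join([fs[0], fs[1], fs[5]])

# bedpe_record_to_bed('chr1\t100\t200\tchr1\t400\t500\tread1\t60\t+\t-')
# -> 'chr1\t100\t500'
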
def count_kmers(tmp_dirpath, fpath, log_fpath, err_fpath, can_reuse=True):
    kmc_out_fpath = join(tmp_dirpath, basename(fpath) + '.kmc')
    if can_reuse and is_non_empty_file(kmc_out_fpath + '.kmc_pre') and is_non_empty_file(kmc_out_fpath + '.kmc_suf'):
        return kmc_out_fpath
    max_mem = max(2, get_total_memory() // 4)
    run_kmc(kmc_bin_fpath,
            ['-m' + str(max_mem), '-k' + str(KMERS_LEN), '-fm', '-cx1', '-ci1', fpath, kmc_out_fpath, tmp_dirpath],
            log_fpath, err_fpath)
    return kmc_out_fpath

def count_kmers(tmp_dirpath, fpath, log_fpath, err_fpath, can_reuse=True):
    kmc_out_fpath = join(tmp_dirpath, basename(fpath) + '.kmc')
    if can_reuse and is_non_empty_file(kmc_out_fpath + '.kmc_pre') and is_non_empty_file(kmc_out_fpath + '.kmc_suf'):
        return kmc_out_fpath
    max_mem = max(2, get_free_memory())
    run_kmc(['-m' + str(max_mem), '-k' + str(KMERS_LEN), '-fm', '-cx1', '-ci1', fpath, kmc_out_fpath, tmp_dirpath],
            log_fpath, err_fpath, use_kmc_tools=False)
    return kmc_out_fpath

def align_reference(ref_fpath, output_dir, using_reads='all', calculate_coverage=False):
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'pe'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    if not qconfig.optimal_assembly_insert_size or qconfig.optimal_assembly_insert_size == 'auto':
        if using_reads == 'pe' and sam_fpath:
            insert_size, std_dev = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info(' Failed calculating insert size.')
            else:
                qconfig.optimal_assembly_insert_size = insert_size
        elif using_reads == 'all' and is_non_empty_file(insert_size_fpath):
            try:
                insert_size = int(open(insert_size_fpath).readline())
                if insert_size:
                    qconfig.optimal_assembly_insert_size = insert_size
            except:
                pass

    if not required_files:
        return sam_fpath, bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        return None, None, None

    if calculate_coverage:
        bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
        bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
        if is_non_empty_file(bam_sorted_fpath):
            logger.info(' Using existing sorted BAM-file: ' + bam_sorted_fpath)
        else:
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        if not is_non_empty_file(uncovered_fpath):
            get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                         correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return sam_fpath, bam_fpath, uncovered_fpath

def align_reference(ref_fpath, output_dir, using_reads='all'):
    required_files = []
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and (using_reads == 'all' or using_reads == 'paired_end'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                                                qconfig.max_threads, sam_fpath=qconfig.reference_sam,
                                                                bam_fpath=qconfig.reference_bam, required_files=required_files,
                                                                is_reference=True, alignment_only=True, using_reads=using_reads)
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not qconfig.ideal_assembly_insert_size or qconfig.ideal_assembly_insert_size == 'auto':
        if using_reads == 'paired_end' and sam_fpath:
            insert_size = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info(' Failed calculating insert size.')
            else:
                qconfig.ideal_assembly_insert_size = insert_size

    if not required_files:
        return bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        return None, None

    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(bam_sorted_fpath):
        logger.info(' Using existing sorted BAM-file: ' + bam_sorted_fpath)
    else:
        sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    if not is_non_empty_file(uncovered_fpath):
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                     correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return bam_fpath, uncovered_fpath

def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), bam_fpath] + tmp_bam_fpaths,
                           stderr=open(err_fpath, 'a'), logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath

def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath, features_containers, cov_fpath, output_dir, logger):
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern,
                                                  output_dir, gc_fpath, features_containers, cov_fpath, logger)
    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        logger.warning('Circos is not installed!\n'
                       'If you want to create Circos plots, install Circos as described at '
                       'http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
                       'and run the following command:\n\tcircos -conf ' + conf_fpath + '\n'
                       'The plot legend is saved to ' + circos_legend_fpath + '\n')
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    return_code = qutils.call_subprocess(cmdline, stdout=open(log_fpath, 'w'), stderr=open(err_fpath, 'w'))
    if return_code == 0 and is_non_empty_file(circos_png_fpath):
        return circos_png_fpath, circos_legend_fpath
    else:
        logger.warning(' Circos diagram was not created. See ' + log_fpath + ' and ' + err_fpath + ' for details')
        return None, None

def check_blast(blast_check_fpath, blast_res_fpath, files_sizes, assemblies_fpaths, assemblies, labels):
    downloaded_organisms = []
    not_founded_organisms = []
    blast_assemblies = [assembly for assembly in assemblies]
    for i, assembly_fpath in enumerate(assemblies_fpaths):
        check_fpath = get_blast_output_fpath(blast_check_fpath, labels[i])
        res_fpath = get_blast_output_fpath(blast_res_fpath, labels[i])
        existing_assembly = None
        assembly_info = True
        if os.path.exists(check_fpath) and is_non_empty_file(res_fpath):
            for line in open(check_fpath):
                if '---' in line:
                    assembly_info = False
                if line and assembly_info:
                    assembly, size = line.split()[1], line.split()[3]
                    if assembly in files_sizes.keys() and int(size) == files_sizes[assembly]:
                        existing_assembly = assemblies_fpaths[assembly]
                        logger.main_info(' Using existing BLAST alignments for %s... ' % labels[i])
                        blast_assemblies.remove(existing_assembly)
                elif line and existing_assembly:
                    line = line.split(' ')
                    if len(line) > 1:
                        if line[0] == 'Downloaded:':
                            downloaded_organisms += line[1].rstrip().split(',')
                        elif line[0] == 'Not_founded:':
                            not_founded_organisms += line[1].rstrip().split(',')
    return blast_assemblies, set(downloaded_organisms), set(not_founded_organisms)

def align_contigs(output_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads, log_out_fpath, log_err_fpath):
    log_out_f = open(log_out_fpath, 'w')
    successful_check_fpath = out_basename + '.sf'
    log_out_f.write('Aligning contigs to reference...\n')

    # Checking if there are existing previous alignments.
    # If they exist, using them to save time.
    using_existing_alignments = False
    if isfile(successful_check_fpath) and isfile(output_fpath):
        if check_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath):
            log_out_f.write('\tUsing existing alignments...\n')
            logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
            using_existing_alignments = True

    if not using_existing_alignments:
        log_out_f.write('\tAligning contigs to the reference\n')
        logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')
        tmp_output_fpath = output_fpath + '_tmp'
        exit_code = run_minimap(tmp_output_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, threads)
        if exit_code != 0:
            return AlignerStatus.ERROR
        if not isfile(tmp_output_fpath):
            return AlignerStatus.FAILED
        if not is_non_empty_file(tmp_output_fpath):
            return AlignerStatus.NOT_ALIGNED

        create_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath)
        log_out_f.write('Filtering alignments...\n')
        parse_minimap_output(tmp_output_fpath, output_fpath)
    return AlignerStatus.OK

def create_genes_plot(features_containers, window_size, ref_len, output_dir):
    feature_fpaths = []
    max_points = 0
    if not features_containers:
        return feature_fpaths, max_points

    for feature_container in features_containers:
        feature_fpath = join(output_dir, feature_container.kind + '.txt')
        if len(feature_container.region_list) == 0:
            continue
        num_points = 0
        gene_density_by_chrom = defaultdict(lambda: [0] * (ref_len // window_size + 1))
        with open(feature_fpath, 'w') as out_f:
            for region in feature_container.region_list:
                chrom = region.chromosome if region.chromosome and region.chromosome in feature_container.chr_names_dict \
                    else region.seqname
                chrom = feature_container.chr_names_dict[chrom] if chrom in feature_container.chr_names_dict else None
                if not chrom:
                    continue
                for i in range(region.start // window_size,
                               min(region.end // window_size + 1, len(gene_density_by_chrom[chrom]))):
                    if i < len(gene_density_by_chrom[chrom]):
                        gene_density_by_chrom[chrom][i] += 1
            for chrom, gene_density_list in gene_density_by_chrom.items():
                for i, density in enumerate(gene_density_list):
                    out_f.write('\t'.join([chrom, str(i * window_size), str((i + 1) * window_size), str(density)]) + '\n')
                    num_points += 1
        if is_non_empty_file(feature_fpath):
            feature_fpaths.append(feature_fpath)
        max_points = max(max_points, num_points)
    return feature_fpaths, max_points

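# The density computation above bins features into fixed-size windows along each
# chromosome. A self-contained sketch of the same binning with hypothetical inputs:
from collections import defaultdict

def bin_features(regions, window_size, ref_len):
    density = defaultdict(lambda: [0] * (ref_len // window_size + 1))
    for chrom, start, end in regions:
        last = len(density[chrom])
        for i in range(start // window_size, min(end // window_size + 1, last)):
            density[chrom][i] += 1
    return density

# bin_features([('chr1', 50, 250)], 100, 1000)['chr1'][:3] -> [1, 1, 1]
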
def calculate_insert_size(sam_fpath, output_dir, ref_name):
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')
    if is_non_empty_file(insert_size_fpath):
        try:
            insert_size = int(open(insert_size_fpath).read())
            if insert_size:
                return insert_size
        except:
            pass
    insert_sizes = []
    mapped_flags = ['99', '147', '83', '163']  # reads mapped in correct orientation and within insert size
    with open(sam_fpath) as sam_in:
        for i, l in enumerate(sam_in):
            if i > 1000000:
                break
            if l.startswith('@'):
                continue
            fs = l.split('\t')
            flag = fs[1]
            if flag not in mapped_flags:
                continue
            insert_size = abs(int(fs[8]))
            insert_sizes.append(insert_size)
    if insert_sizes:
        mean_is = sum(insert_sizes) * 1.0 / len(insert_sizes)
        if mean_is <= 0:
            return None
        stddev_is = sqrt(sum([(insert_size - mean_is) ** 2 for insert_size in insert_sizes]) / len(insert_sizes))
        insert_size = int(mean_is + stddev_is)
        insert_size = max(qconfig.ideal_assembly_min_IS, insert_size)
        insert_size = min(qconfig.ideal_assembly_max_IS, insert_size)
        with open(insert_size_fpath, 'w') as out_f:
            out_f.write(str(insert_size))
        return insert_size

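# The estimate above is mean(|TLEN|) plus one standard deviation over properly
# paired reads (SAM flags 99/147/83/163), clamped to configured bounds. The core
# arithmetic in isolation (estimate_insert_size is a hypothetical helper):
from math import sqrt

def estimate_insert_size(tlens):
    mean = sum(tlens) / float(len(tlens))
    std = sqrt(sum((t - mean) ** 2 for t in tlens) / len(tlens))
    return int(mean + std)

# estimate_insert_size([250, 300, 350]) -> 340
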
def correct_paired_reads_names(fpath, name_ending, output_dir, logger):
    name, ext = os.path.splitext(fpath)
    try:
        if ext in ['.gz', '.gzip']:
            handler = gzip.open(fpath, mode='rt')
            corrected_fpath = join(output_dir, basename(name))
        else:
            handler = open(fpath)
            corrected_fpath = join(output_dir, basename(fpath))
    except IOError:
        return False
    if is_non_empty_file(corrected_fpath):
        logger.info('Using existing FASTQ file ' + corrected_fpath)
        return corrected_fpath
    with handler as f:
        with open(corrected_fpath, 'w') as out_f:
            for i, line in enumerate(f):
                if i % 4 == 0:
                    full_read_name = line.split()[0] + name_ending
                    out_f.write(full_read_name + '\n')
                elif i % 2 == 0:
                    out_f.write('+\n')
                else:
                    out_f.write(line)
    return corrected_fpath

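# FASTQ records are four lines each: name, sequence, '+', qualities. The rewrite
# above renames line 0 of every record and normalizes the separator line; note that
# 'i % 2 == 0' after the 'i % 4 == 0' branch is equivalent to 'i % 4 == 2'.
# A generator restating the same record walk (hypothetical helper):
def rename_fastq_reads(lines, name_ending):
    for i, line in enumerate(lines):
        if i % 4 == 0:
            yield line.split()[0] + name_ending + '\n'
        elif i % 4 == 2:
            yield '+\n'
        else:
            yield line
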
def check_blast(blast_check_fpath, blast_res_fpath, files_md5, assemblies_fpaths, assemblies, labels):
    downloaded_organisms = []
    not_founded_organisms = []
    blast_assemblies = [assembly for assembly in assemblies]
    for i, assembly_fpath in enumerate(assemblies_fpaths):
        check_fpath = get_blast_output_fpath(blast_check_fpath, labels[i])
        res_fpath = get_blast_output_fpath(blast_res_fpath, labels[i])
        existing_assembly = None
        assembly_info = True
        if os.path.exists(check_fpath) and is_non_empty_file(res_fpath):
            with open(check_fpath) as check_file:
                for line in check_file:
                    if '---' in line:
                        assembly_info = False
                    if line and assembly_info:
                        assembly, md5 = line.split()[1], line.split()[-1]
                        if assembly in files_md5.keys() and md5 == files_md5[assembly]:
                            existing_assembly = assemblies_fpaths[assembly]
                            logger.main_info(' Using existing BLAST alignments for %s... ' % labels[i])
                            blast_assemblies.remove(existing_assembly)
                    elif line and existing_assembly:
                        line = line.split(' ')
                        if len(line) > 1:
                            if line[0] == 'Downloaded:':
                                downloaded_organisms += line[1].rstrip().split(',')
                            elif line[0] == 'Not_founded:':
                                not_founded_organisms += line[1].rstrip().split(',')
    return blast_assemblies, set(downloaded_organisms), set(not_founded_organisms)

def align_contigs(output_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads, log_out_fpath, log_err_fpath):
    log_out_f = open(log_out_fpath, 'w')
    successful_check_fpath = out_basename + '.sf'
    log_out_f.write('Aligning contigs to reference...\n')

    # Special case: if there is a need to reuse alignments from the combined_reference stage
    if qconfig.alignments_for_reuse_dirpath is not None and os.path.isdir(qconfig.alignments_for_reuse_dirpath):
        _, coords_to_reuse_fname, _, _ = get_aux_out_fpaths(os.path.basename(out_basename))
        coords_to_reuse_fpath = os.path.join(qconfig.alignments_for_reuse_dirpath, coords_to_reuse_fname)
        if isfile(coords_to_reuse_fpath):
            # symlink coords.filtered from combined_reference stage to coords in the current run
            if isfile(output_fpath):
                os.remove(output_fpath)
            os.symlink(os.path.relpath(coords_to_reuse_fpath, os.path.dirname(output_fpath)), output_fpath)
            log_out_f.write('\tReusing alignments from the combined_reference stage...\n')
            logger.info(' ' + qutils.index_to_str(index) + 'Reusing alignments from the combined_reference stage... ')
            return AlignerStatus.OK
    qconfig.alignments_for_reuse_dirpath = None

    # Checking if there are existing previous alignments.
    # If they exist, using them to save time.
    if isfile(successful_check_fpath) and isfile(output_fpath):
        if check_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath):
            log_out_f.write('\tUsing existing alignments...\n')
            logger.info(' ' + qutils.index_to_str(index) + 'Using existing alignments... ')
            return AlignerStatus.OK

    log_out_f.write('\tAligning contigs to the reference\n')
    logger.info(' ' + qutils.index_to_str(index) + 'Aligning contigs to the reference')
    tmp_output_fpath = output_fpath + '_tmp'
    exit_code = run_minimap(tmp_output_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, threads)
    if exit_code != 0:
        return AlignerStatus.ERROR
    if not isfile(tmp_output_fpath):
        return AlignerStatus.FAILED
    if not is_non_empty_file(tmp_output_fpath):
        return AlignerStatus.NOT_ALIGNED

    create_successful_check(successful_check_fpath, old_contigs_fpath, ref_fpath)
    log_out_f.write('Filtering alignments...\n')
    parse_minimap_output(tmp_output_fpath, output_fpath)
    return AlignerStatus.OK

def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=False):
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info(' ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info(' ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        return_code = qutils.call_subprocess([binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                                             stdout=open(log_fpath, 'w'), stderr=open(log_fpath, 'w'), indent=' ')
    if return_code == 0 and repeats_fpath and exists(repeats_fpath):
        long_repeats_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.long.rpt')
        with open(long_repeats_fpath, 'w') as out:
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    l = line.split('\t')
                    repeat_len = int(l[2]) - int(l[1])
                    if repeat_len >= insert_size:
                        out.write(line[1:])

        repeats_fasta_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.fasta')
        coords_fpath = os.path.join(tmp_dir, qutils.name_from_fpath(ref_fpath) + '.rpt.coords.txt')
        if not is_non_empty_file(coords_fpath):
            fasta_index_fpath = ref_fpath + '.fai'
            if exists(fasta_index_fpath):
                os.remove(fasta_index_fpath)
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath, '-bed',
                                    long_repeats_fpath, '-fo', repeats_fasta_fpath],
                                   stderr=open(log_fpath, 'w'), indent=' ')
            cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1', '--no-long-join',
                       '-r', '100', '-t', str(qconfig.max_threads), '-z', '200', ref_fpath, repeats_fasta_fpath]
            qutils.call_subprocess(cmdline, stdout=open(coords_fpath, 'w'), stderr=open(log_fpath, 'a'))

        filtered_repeats_fpath, repeats_regions = check_repeats_instances(coords_fpath, long_repeats_fpath, use_long_reads)
        unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
        return unique_covered_regions, repeats_regions
    return None, None

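# The parsing above implies Red (run with '-frm 2') reports repeats as
# '>chrom<TAB>start<TAB>end' lines, so line[1:] drops the leading '>' and leaves
# plain BED. The length filter in isolation (hypothetical helper and input):
def filter_long_repeats(lines, min_len):
    for line in lines:
        l = line.split('\t')
        if int(l[2]) - int(l[1]) >= min_len:
            yield line[1:]

# list(filter_long_repeats(['>chr1\t0\t500\n'], 300)) -> ['chr1\t0\t500\n']
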
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true',
                                '-Dsamjdk.use_async_io_read_samtools=true', '-Dsamjdk.use_async_io_write_samtools=true',
                                '-Dsamjdk.use_async_io_write_tribble=true', '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                                'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                                'REFERENCE_SEQUENCE=' + cur_ref_fpath, 'WORKER_THREADS=' + str(max_threads),
                                'WORKING_DIR=' + vcf_output_dirpath],
                               stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                                'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE'],
                               stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath

def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                               stderr=open(err_fpath, 'a'), logger=logger)
    create_fai_file(cur_ref_fpath)
    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        env["PATH"] += os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        qutils.call_subprocess(['java', '-ea', '-Xmx' + str(max_mem) + 'g', '-Dsamjdk.create_index=true',
                                '-Dsamjdk.use_async_io_read_samtools=true', '-Dsamjdk.use_async_io_write_samtools=true',
                                '-Dsamjdk.use_async_io_write_tribble=true', '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                                'I=' + bam_sorted_fpath, 'O=' + vcf_fpath,
                                'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                                'REFERENCE_SEQUENCE=' + cur_ref_fpath, 'WORKER_THREADS=' + str(max_threads),
                                'WORKING_DIR=' + vcf_output_dirpath],
                               stderr=open(err_fpath, 'a'), logger=logger, env=env)
    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        qutils.call_subprocess(['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                                'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                                'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE'],
                               stderr=open(err_fpath, 'a'), logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath

def bwa_index(ref_fpath, err_path, logger):
    cmd = [bwa_fpath('bwa'), 'index', '-p', ref_fpath, ref_fpath]
    if getsize(ref_fpath) > 2 * 1024 ** 3:  # if the reference is bigger than 2 GB
        cmd += ['-a', 'bwtsw']
    if not is_non_empty_file(ref_fpath + '.bwt'):
        qutils.call_subprocess(cmd, stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

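# BWA's default 'is' indexing algorithm cannot handle genomes larger than ~2 GB,
# hence the switch to '-a bwtsw' above. The size check in isolation:
from os.path import getsize

def needs_bwtsw(ref_fpath):
    return getsize(ref_fpath) > 2 * 1024 ** 3  # reference larger than 2 GB
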
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if is_non_empty_file(tmp_sam_fpath):
            tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
            tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
            if not is_non_empty_file(tmp_bam_sorted_fpath):
                sambamba_view(tmp_sam_fpath, tmp_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
                sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
            tmp_bam_fpaths.append(tmp_bam_sorted_fpath)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), merged_bam_fpath] + tmp_bam_fpaths,
                           stderr=open(err_fpath, 'a'), logger=logger)
    qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                            '--tmpdir', output_dir, merged_bam_fpath, bam_fpath],
                           stderr=open(err_fpath, 'a'), logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath

def run(contigs_fpath, gff_fpath, log_fpath, threads, kingdom):
    barrnap_fpath = join(qconfig.LIBS_LOCATION, 'barrnap', 'bin', 'barrnap')
    if is_non_empty_file(gff_fpath):
        return
    call_subprocess([barrnap_fpath, '--quiet', '-k', kingdom, '--threads', str(threads), contigs_fpath],
                    stdout=open(gff_fpath, 'w'), stderr=open(log_fpath, 'a'))

def get_joiners(ref_name, sam_fpath, bam_fpath, output_dirpath, err_fpath, using_reads):
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    if not is_non_empty_file(bam_filtered_fpath):
        filter_rule = 'not unmapped and not supplementary and not secondary_alignment'
        sambamba_view(bam_fpath, bam_filtered_fpath, qconfig.max_threads, err_fpath, logger, filter_rule=filter_rule)
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_sorted_fpath):
        sort_bam(bam_filtered_fpath, bam_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, using_reads, bam_sorted_fpath, err_fpath, logger, bedpe=using_reads == 'mp')
    intervals = defaultdict(list)
    if using_reads == 'mp':
        insert_size, std_dev = calculate_insert_size(sam_fpath, output_dirpath, ref_name, reads_suffix='mp')
        min_is = insert_size - std_dev
        max_is = insert_size + std_dev
    with open(bed_fpath) as bed:
        for l in bed:
            fs = l.split()
            if using_reads == 'mp' and insert_size:
                interval_len = int(fs[2]) - int(fs[1])
                if min_is <= abs(interval_len) <= max_is:
                    intervals[fs[0]].append((int(fs[1]), int(fs[2])))
            else:
                intervals[fs[0]].append((int(fs[1]), int(fs[2])))
    return intervals

def run_aligner(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            if reads_type == 'pacbio' or reads_type == 'nanopore':
                if reads_type == 'pacbio':
                    preset = ' -ax map-pb '
                else:
                    preset = ' -ax map-ont '
                cmdline = minimap_fpath() + ' -t ' + str(max_threads) + preset + ref_fpath + ' ' + reads
            else:
                cmdline = bwa_cmd + (' -p ' if reads_type == 'pe' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmdline = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        bam_fpath = output_fpath.replace('.sam', '.bam')
        if not is_non_empty_file(output_fpath):
            qutils.call_subprocess(shlex.split(cmdline), stdout=open(output_fpath, 'w'),
                                   stderr=open(err_fpath, 'a'), logger=logger)
        if not is_non_empty_file(bam_fpath):
            sambamba_view(output_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            if reads_type == 'pe':
                bam_dedup_fpath = add_suffix(bam_fpath, 'dedup')
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                                        '--tmpdir', output_dir, bam_fpath, bam_dedup_fpath],
                                       stderr=open(err_fpath, 'a'), logger=logger)
                if exists(bam_dedup_fpath):
                    shutil.move(bam_dedup_fpath, bam_fpath)
        if reads_type == 'pe':
            insert_size, std_dev = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size and insert_size < qconfig.optimal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.optimal_assembly_insert_size = max(insert_sizes)
        ref_name = qutils.name_from_fpath(ref_fpath)
        insert_size_fpath = join(output_dir, '..', ref_name + '.is.txt')
        with open(insert_size_fpath, 'w') as out:
            out.write(str(qconfig.optimal_assembly_insert_size))

def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath, correct_chr_names,
                 cov_fpath, physical_cov_fpath=None, uncovered_fpath=None, create_cov_files=True):
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger)
            calculate_genome_cov(bam_sorted_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        if uncovered_fpath:
            print_uncovered_regions(raw_cov_fpath, uncovered_fpath, correct_chr_names)
        if create_cov_files:
            proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    if not is_non_empty_file(physical_cov_fpath) and create_cov_files:
        raw_cov_fpath = get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath,
                                              physical_cov_fpath, chr_len_fpath)
        proceed_cov_file(raw_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath

def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', str(qconfig.max_threads), ref_fpath, assembly_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-S', sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
    if not is_non_empty_file(bam_sorted_fpath):
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', bam_fpath],
                               stdout=open(bam_mapped_fpath, 'w'), stderr=open(err_fpath, 'a'), logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath, assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath

def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    if not isfile(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None
    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    if not is_non_empty_file(raw_cov_fpath):
        logger.info(' Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        bam_filtered_fpath = join(output_dirpath, ref_name + '.filtered.bam')
        sambamba_view(bam_fpath, bam_filtered_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='proper_pair and not supplementary and not duplicate')
        ## sort by read names
        bam_filtered_sorted_fpath = join(output_dirpath, ref_name + '.filtered.sorted.bam')
        sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
        bed_fpath = bam_to_bed(output_dirpath, ref_name, bam_filtered_sorted_fpath, err_fpath, logger, bedpe=True)
        calculate_genome_cov(bed_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
    return raw_cov_fpath

def run_bwa(read_fpaths, ref_fpath, sam_fpath, out_sam_fpaths, output_dir, err_fpath, max_threads, reads_type):
    bwa_cmd = bwa_fpath('bwa') + ' mem -t ' + str(max_threads)
    insert_sizes = []
    for idx, reads in enumerate(read_fpaths):
        if isinstance(reads, str):
            cmd = bwa_cmd + (' -p ' if reads_type != 'single' else ' ') + ref_fpath + ' ' + reads
        else:
            read1, read2 = reads
            cmd = bwa_cmd + ' ' + ref_fpath + ' ' + read1 + ' ' + read2
        output_fpath = add_suffix(sam_fpath, reads_type + str(idx + 1))
        if not is_non_empty_file(output_fpath):
            qutils.call_subprocess(shlex.split(cmd), stdout=open(output_fpath, 'w'),
                                   stderr=open(err_fpath, 'a'), logger=logger)
        if reads_type == 'paired_end':
            insert_size = calculate_insert_size(output_fpath, output_dir, basename(sam_fpath))
            if insert_size and insert_size < qconfig.ideal_assembly_max_IS:
                insert_sizes.append(insert_size)
        out_sam_fpaths.append(output_fpath)

    if insert_sizes:
        qconfig.ideal_assembly_insert_size = max(insert_sizes)

def calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath):
    if not reads_fpaths or not sam_fpath:
        return
    lap_out_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.lap.out')
    if not is_non_empty_file(lap_out_fpath):
        if index is not None:
            logger.info(' ' + index_str + 'Running LAP...')
        else:
            logger.info(' Running LAP for reference...')
        prob_out_fpath = get_safe_fpath(output_dirpath, filename + '.prob')
        qutils.call_subprocess([lap_fpath('calc_prob.py'), '-a', fpath, '-i', ','.join(reads_fpaths), '-q', '-s', sam_fpath],
                               stdout=open(prob_out_fpath, 'w'), stderr=open(err_fpath, 'a'))
        qutils.call_subprocess([lap_fpath('sum_prob.py'), '-i', prob_out_fpath],
                               stdout=open(lap_out_fpath, 'w'), stderr=open(err_fpath, 'a'))
    else:
        if index is not None:
            logger.info(' ' + index_str + 'Using existing file with LAP score...')
        else:
            logger.info(' Using existing file with LAP score for reference...')

def calculate_insert_size(sam_fpath, output_dir, ref_name, reads_suffix=''):
    insert_size_fpath = join(output_dir, ref_name + reads_suffix + '.is.txt')
    if is_non_empty_file(insert_size_fpath):
        try:
            with open(insert_size_fpath) as f:
                insert_size = int(f.readline())
                std_dev = int(f.readline())
                if insert_size:
                    return insert_size, std_dev
        except:
            pass
    insert_sizes = []
    mapped_flags = ['99', '147', '83', '163']  # reads mapped in correct orientation and within insert size
    with open(sam_fpath) as sam_in:
        for i, l in enumerate(sam_in):
            if i > 1000000:
                break
            if l.startswith('@'):
                continue
            fs = l.split('\t')
            flag = fs[1]
            if flag not in mapped_flags:
                continue
            insert_size = abs(int(fs[8]))
            insert_sizes.append(insert_size)
    if insert_sizes:
        insert_sizes = sorted(insert_sizes)
        if len(insert_sizes) % 2 == 1:  # odd number of values
            median_is = insert_sizes[(len(insert_sizes) - 1) // 2]
        else:  # even number of values - take the avg of central
            median_is = (insert_sizes[len(insert_sizes) // 2] + insert_sizes[len(insert_sizes) // 2 - 1]) // 2
        if median_is <= 0:
            return None, None
        std_dev = sqrt(sum([(insert_size - median_is) ** 2 for insert_size in insert_sizes]) / len(insert_sizes))
        insert_size = max(qconfig.optimal_assembly_min_IS, median_is)
        with open(insert_size_fpath, 'w') as out_f:
            out_f.write(str(insert_size) + '\n')
            out_f.write(str(std_dev))
        return insert_size, std_dev
    return None, None

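# The median above matches the stdlib definition: for an even count it averages the
# two central values (the manual code floors the average with '//', while
# statistics.median keeps the float):
import statistics
assert statistics.median([250, 300, 350, 400]) == 325.0  # the '//' version above yields 325
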
def do(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, gc_fpath, features_containers, cov_fpath, output_dir, logger):
    if not exists(output_dir):
        os.makedirs(output_dir)
    conf_fpath, circos_legend_fpath = create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern,
                                                  output_dir, gc_fpath, features_containers, cov_fpath, logger)
    circos_exec = get_path_to_program('circos')
    if not circos_exec:
        logger.warning('Circos is not installed!\n'
                       'If you want to create Circos plots, install Circos as described at '
                       'http://circos.ca/tutorials/lessons/configuration/distribution_and_installation '
                       'and run the following command:\n circos -conf ' + conf_fpath + '.\n '
                       'The plot annotation is saved to ' + circos_legend_fpath)
        return None, None

    cmdline = [circos_exec, '-conf', conf_fpath]
    log_fpath = join(output_dir, 'circos.log')
    err_fpath = join(output_dir, 'circos.err')
    circos_png_fpath = join(output_dir, circos_png_fname)
    return_code = qutils.call_subprocess(cmdline, stdout=open(log_fpath, 'w'), stderr=open(err_fpath, 'w'))
    if return_code == 0 and is_non_empty_file(circos_png_fpath):
        return circos_png_fpath, circos_legend_fpath
    else:
        logger.warning(' Circos diagram was not created. See ' + log_fpath + ' and ' + err_fpath + ' for details')
        return None, None

def download_ref(organism, ref_fpath):
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    quast_fields = '&tool=quast&[email protected]'
    organism = organism.replace('_', '+')
    response = try_send_request(ncbi_url + 'esearch.fcgi?db=assembly&term=%s+[Organism]&retmax=100' % organism + quast_fields)
    if not response:
        return None
    xml_tree = ET.fromstring(response)
    if xml_tree.find('Count').text == '0':  # Organism is not found
        return None

    ref_id_list = xml_tree.find('IdList').findall('Id')
    best_ref_links = []
    for id in ref_id_list:
        databases = ['assembly_nuccore_refseq', 'assembly_nuccore_insdc']
        for db in databases:
            response = try_send_request(
                ncbi_url + 'elink.fcgi?dbfrom=assembly&db=nuccore&id=%s&linkname="%s"' % (id.text, db) + quast_fields)
            if not response:
                continue
            xml_tree = ET.fromstring(response)
            link_set = xml_tree.find('LinkSet')
            if link_set is None:
                continue
            link_db = xml_tree.find('LinkSet').find('LinkSetDb')
            if link_db is None:
                continue
            ref_links = link_db.findall('Link')
            if best_ref_links and len(ref_links) > len(best_ref_links):
                continue
            best_ref_links = ref_links
            if best_ref_links:
                break
        if best_ref_links and len(best_ref_links) < 3:
            break

    if not best_ref_links:
        return None
    if len(best_ref_links) > 500:
        logger.info('%s has too fragmented reference genome! It will not be downloaded.' % organism.replace('+', ' '))
        return None

    ref_ids = sorted(link.find('Id').text for link in best_ref_links)
    is_first_piece = False
    fasta_files = []
    for ref_id in ref_ids:
        fasta = try_send_request(ncbi_url + 'efetch.fcgi?db=sequences&id=%s&rettype=fasta&retmode=text' % ref_id)
        if fasta and fasta[0] == '>':
            fasta_files.append(fasta)
    fasta_names = [f.split('|')[-1] for f in fasta_files]
    with open(ref_fpath, "w") as fasta_file:
        for name, fasta in sorted(zip(fasta_names, fasta_files), key=natural_sort_key):
            if not is_first_piece:
                is_first_piece = True
            else:
                fasta = '\n' + fasta.rstrip()
            fasta_file.write(fasta.rstrip())
    if not os.path.isfile(ref_fpath):
        return None
    if not is_non_empty_file(ref_fpath):
        os.remove(ref_fpath)
        return None
    return ref_fpath

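# try_send_request above wraps NCBI E-utilities calls (esearch -> elink -> efetch).
# A minimal stdlib-only sketch of the first request, with an illustrative organism;
# real code should also honor NCBI rate limits and retry on failure:
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

def esearch_assembly(organism, retmax=100):
    url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
           'esearch.fcgi?db=assembly&term=%s+[Organism]&retmax=%d'
           % (organism.replace('_', '+'), retmax))
    return urlopen(url).read()  # XML with <Count> and <IdList>
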
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None,
                      bam_fpath=None, index=None, required_files=None, is_reference=False,
                      alignment_only=False, using_reads='all'):
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''
    reads_fpaths = qconfig.reads_fpaths

    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    if correct_chr_names and (not required_files or all(isfile(fpath) for fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                       stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
                analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info(' ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info(' Running BWA for reference...')
        else:
            logger.info(' ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        bwa_index(fpath, err_fpath, logger)
        sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)
        if len(sam_fpaths) > 1:
            merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath)
        elif len(sam_fpaths) == 1:
            shutil.move(sam_fpaths[0], sam_fpath)
            tmp_bam_fpath = sam_fpaths[0].replace('.sam', '.bam')
            if is_non_empty_file(tmp_bam_fpath):
                shutil.move(tmp_bam_fpath, bam_fpath)
        logger.info(' ' + index_str + 'Done.')
        os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error(' Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None

    if is_reference:
        logger.info(' Sorting SAM-file for reference...')
    else:
        logger.info(' ' + index_str + 'Sorting SAM-file...')
    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                   stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        if is_reference:
            logger.info(' Analysis for reference is finished.')
        else:
            logger.info(' ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath

def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir, log_path, err_fpath):
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info(' Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info(' Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True

        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info(' Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index)
                           for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None,
                                    required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)

    if not main_ref_fpath:
        return None, None, None

    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info(' Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger, filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_fpath, correct_chr_names, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths, temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath

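# The splitting step above routes records by parsing SAM '@SQ' header lines, which
# carry the sequence name ('SN:') and length ('LN:'). The header parse in isolation
# (parse_sq_header is a hypothetical helper):
def parse_sq_header(line):
    seq_name = line.split('\tSN:')[1].split('\t')[0]
    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
    return seq_name, seq_length

# parse_sq_header('@SQ\tSN:chr1\tLN:248956422\n') -> ('chr1', 248956422)
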
def add_statistics_to_report(output_dir, contigs_fpaths, ref_fpath):
    from quast_libs import reporting

    ref_reads_stats = None
    ref_lap_score = None
    if ref_fpath:
        ref_name = qutils.name_from_fpath(ref_fpath)
        stats_fpath = join(output_dir, ref_name + '.stat')
        if isfile(stats_fpath):
            ref_reads_stats = parse_reads_stats(stats_fpath)
            if int(ref_reads_stats['mapped']) == 0:
                logger.info(' BWA: nothing aligned for reference.')
        lap_out_fpath = get_safe_fpath(output_dir, ref_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            with open(lap_out_fpath) as f:
                l = f.readline()
                ref_lap_score = float(l.split()[0]) if l else None

    # process all contigs files
    for index, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        stats_fpath = join(output_dir, assembly_name + '.stat')
        if ref_reads_stats:
            report.add_field(reporting.Fields.REF_MAPPED_READS, ref_reads_stats['mapped'])
            report.add_field(reporting.Fields.REF_MAPPED_READS_PCNT, ref_reads_stats['mapped_pcnt'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS, ref_reads_stats['paired'])
            report.add_field(reporting.Fields.REF_PROPERLY_PAIRED_READS_PCNT, ref_reads_stats['paired_pcnt'])
            report.add_field(reporting.Fields.REF_SINGLETONS, ref_reads_stats['singletons'])
            report.add_field(reporting.Fields.REF_SINGLETONS_PCNT, ref_reads_stats['singletons_pcnt'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS, ref_reads_stats['misjoint'])
            report.add_field(reporting.Fields.REF_MISJOINT_READS_PCNT, ref_reads_stats['misjoint_pcnt'])
            report.add_field(reporting.Fields.REF_DEPTH, ref_reads_stats['depth'])
            if ref_reads_stats['coverage_thresholds'] and \
                    len(ref_reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
                report.add_field(reporting.Fields.REF_COVERAGE__FOR_THRESHOLDS,
                                 [ref_reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
                report.add_field(reporting.Fields.REF_COVERAGE_1X_THRESHOLD, ref_reads_stats['coverage_thresholds'][0])
        if not isfile(stats_fpath):
            continue
        reads_stats = parse_reads_stats(stats_fpath)
        report.add_field(reporting.Fields.TOTAL_READS, reads_stats['total'])
        report.add_field(reporting.Fields.LEFT_READS, reads_stats['left'])
        report.add_field(reporting.Fields.RIGHT_READS, reads_stats['right'])
        report.add_field(reporting.Fields.MAPPED_READS, reads_stats['mapped'])
        report.add_field(reporting.Fields.MAPPED_READS_PCNT, reads_stats['mapped_pcnt'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS, reads_stats['paired'])
        report.add_field(reporting.Fields.PROPERLY_PAIRED_READS_PCNT, reads_stats['paired_pcnt'])
        if int(reads_stats['mapped']) == 0:
            logger.info(' ' + qutils.index_to_str(index) + 'BWA: nothing aligned for ' + '\'' + assembly_label + '\'.')
        report.add_field(reporting.Fields.SINGLETONS, reads_stats['singletons'])
        report.add_field(reporting.Fields.SINGLETONS_PCNT, reads_stats['singletons_pcnt'])
        report.add_field(reporting.Fields.MISJOINT_READS, reads_stats['misjoint'])
        report.add_field(reporting.Fields.MISJOINT_READS_PCNT, reads_stats['misjoint_pcnt'])
        report.add_field(reporting.Fields.DEPTH, reads_stats['depth'])
        if reads_stats['coverage_thresholds'] and \
                len(reads_stats['coverage_thresholds']) == len(qconfig.coverage_thresholds):
            report.add_field(reporting.Fields.COVERAGE__FOR_THRESHOLDS,
                             [reads_stats['coverage_thresholds'][i] for i, threshold in enumerate(qconfig.coverage_thresholds)])
            report.add_field(reporting.Fields.COVERAGE_1X_THRESHOLD, reads_stats['coverage_thresholds'][0])
        lap_out_fpath = get_safe_fpath(output_dir, assembly_name + '.lap.out')
        if is_non_empty_file(lap_out_fpath):
            with open(lap_out_fpath) as f:
                l = f.readline()
                lap_score = float(l.split()[0]) if l else None
            report.add_field(reporting.Fields.LAP_SCORE, ('%.3f' % lap_score if lap_score is not None else None))
        report.add_field(reporting.Fields.REF_LAP_SCORE, ('%.3f' % ref_lap_score if ref_lap_score is not None else None))
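
# --- Illustrative sketch (not part of QUAST): the LAP score is read as the first
# whitespace-separated token on the first line of a '<name>.lap.out' file, the
# pattern used twice in add_statistics_to_report() above. Hypothetical helper;
# the empty-line guard is slightly stricter than the original's 'if l'.
def read_lap_score(lap_out_fpath):
    with open(lap_out_fpath) as f:
        first_line = f.readline()
    return float(first_line.split()[0]) if first_line.strip() else None
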
def download_refs(organism, ref_fpath):
    ncbi_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    quast_fields = '&tool=quast&[email protected]'
    organism = organism.replace('_', '+')
    response = try_send_request(ncbi_url + 'esearch.fcgi?db=assembly&term=%s+[Organism]&retmax=100' % organism + quast_fields)
    if not response:
        return None
    xml_tree = ET.fromstring(response)

    if xml_tree.find('Count').text == '0':  # Organism is not found
        return None

    ref_id_list = xml_tree.find('IdList').findall('Id')
    best_ref_links = []
    for ref_id in ref_id_list:
        databases = ['assembly_nuccore_refseq', 'assembly_nuccore_insdc']
        for db in databases:
            response = try_send_request(
                ncbi_url + 'elink.fcgi?dbfrom=assembly&db=nuccore&id=%s&linkname="%s"' % (ref_id.text, db) + quast_fields)
            if not response:
                continue
            xml_tree = ET.fromstring(response)
            link_set = xml_tree.find('LinkSet')
            if link_set is None:
                continue
            link_db = link_set.find('LinkSetDb')
            if link_db is None:
                continue
            ref_links = link_db.findall('Link')
            if best_ref_links and len(ref_links) > len(best_ref_links):
                continue  # prefer the less fragmented (smaller) link set
            best_ref_links = ref_links
            if best_ref_links:
                break
        if best_ref_links and len(best_ref_links) < 3:
            break

    if not best_ref_links:
        return None
    if len(best_ref_links) > 500:
        logger.info('%s has too fragmented reference genome! It will not be downloaded.' % organism.replace('+', ' '))
        return None

    ref_ids = sorted(link.find('Id').text for link in best_ref_links)
    is_first_piece = False
    fasta_files = []
    for ref_id in ref_ids:
        fasta = try_send_request(ncbi_url + 'efetch.fcgi?db=sequences&id=%s&rettype=fasta&retmode=text' % ref_id)
        if fasta and fasta[0] == '>':
            fasta_files.append(fasta)
    fasta_names = [f.split('|')[-1] for f in fasta_files]
    with open(ref_fpath, "w") as fasta_file:
        for name, fasta in sorted(zip(fasta_names, fasta_files), key=natural_sort_key):
            if not is_first_piece:
                is_first_piece = True
            else:
                fasta = '\n' + fasta.rstrip()
            fasta_file.write(fasta.rstrip())
    if not os.path.isfile(ref_fpath):
        return None
    if not is_non_empty_file(ref_fpath):
        os.remove(ref_fpath)
        return None
    return ref_fpath
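
# --- Illustrative sketch (not part of QUAST): download_refs() follows the NCBI
# E-utilities flow esearch (find assembly IDs) -> elink (map assemblies to
# nuccore sequence IDs) -> efetch (pull FASTA). A minimal, stdlib-only esearch
# call under the same conventions; count_assemblies() is a hypothetical name.
import xml.etree.ElementTree as ET
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

def count_assemblies(organism):
    url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
           '?db=assembly&term=%s+[Organism]&retmax=100' % organism.replace(' ', '+'))
    xml_tree = ET.fromstring(urlopen(url).read())
    return int(xml_tree.find('Count').text)
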
def bwa_index(ref_fpath, err_path, logger):
    cmd = [bwa_fpath('bwa'), 'index', '-p', ref_fpath, ref_fpath]
    if getsize(ref_fpath) > 2 * 1024 ** 3:  # references larger than 2 GB require the bwtsw indexing algorithm
        cmd += ['-a', 'bwtsw']
    if not is_non_empty_file(ref_fpath + '.bwt'):
        qutils.call_subprocess(cmd, stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
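
# --- Illustrative sketch (not part of QUAST): the same indexing decision with
# plain subprocess, assuming a 'bwa' binary on PATH. BWA's default IS algorithm
# cannot index references over ~2 GB, hence the switch to '-a bwtsw'.
import os
import subprocess

def index_reference(ref_fpath):
    cmd = ['bwa', 'index', '-p', ref_fpath, ref_fpath]
    if os.path.getsize(ref_fpath) > 2 * 1024 ** 3:
        cmd += ['-a', 'bwtsw']
    if not os.path.isfile(ref_fpath + '.bwt'):  # skip if an index already exists
        subprocess.check_call(cmd)
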
def main(in_fpath, out_fname):
    """
    This function runs a BUSCO analysis according to the provided parameters.
    See the help for more details: ``python run_BUSCO.py -h``
    :raises SystemExit: if any errors occur
    """
    start_time = time.time()
    # 1) Load a BUSCO config file that will figure out all the params from all sources,
    # i.e. the provided config file, the dataset cfg, and user args
    if os.environ.get('BUSCO_CONFIG_FILE') and os.access(os.environ.get('BUSCO_CONFIG_FILE'), os.R_OK):
        config_file = os.environ.get('BUSCO_CONFIG_FILE')
    else:
        config_file = '%s/config.ini.default' % os.path.dirname(os.path.realpath(__file__))
    config = BuscoConfig(config_file, args={'in': in_fpath, 'out': out_fname})

    # Define a logger; the config is passed to tell the logger if quiet mode was requested
    assembly_dirpath = os.path.join(config.get('busco', 'out_path'), 'run_%s' % out_fname)
    if not isdir(assembly_dirpath):
        os.makedirs(assembly_dirpath)
    summary_path = os.path.join(assembly_dirpath, 'short_summary_%s.txt' % out_fname)

    from quast_libs.busco import pipebricks
    pipebricks.PipeLogger.run_dirpath = assembly_dirpath
    from quast_libs.busco.GenomeAnalysis import GenomeAnalysis
    from quast_libs.busco.BuscoAnalysis import BuscoAnalysis
    from quast_libs.busco.pipebricks.Toolset import ToolException
    BuscoAnalysis._logger.reload_log()
    logger = BuscoAnalysis._logger
    if is_non_empty_file(summary_path):
        logger.info('Using existing BUSCO files for ' + out_fname + '...')
        return summary_path

    try:
        try:
            logger.info(
                '****************** Start a BUSCO %s analysis, current time: %s **'
                '****************' % (BuscoConfig.VERSION, time.strftime('%m/%d/%Y %H:%M:%S')))
            logger.info('Configuration loaded from %s' % config_file)
            # 2) Load the analysis; this will check the dependencies and return the appropriate analysis object
            analysis = GenomeAnalysis(config)
            # 3) Run the analysis
            analysis.run_analysis()

            if not logger.has_warning():
                logger.info('BUSCO analysis done. Total running time: %s seconds' % str(time.time() - start_time))
            else:
                logger.info('BUSCO analysis done with WARNING(s). Total running time: %s seconds'
                            % str(time.time() - start_time))
            logger.info('Results written in %s\n' % analysis.mainout)
        except ToolException as e:
            # logger.error(e)
            raise SystemExit
    except SystemExit:
        logger.error('BUSCO analysis failed!')
        logger.error('Check the logs, read the user guide; if you still need technical '
                     'support, then please contact %s\n' % BuscoConfig.CONTACT)
        raise SystemExit
    except KeyboardInterrupt:
        logger.error('A signal was sent to kill the process')
        logger.error('BUSCO analysis failed!')
        logger.error('Check the logs, read the user guide; if you still need technical '
                     'support, then please contact %s\n' % BuscoConfig.CONTACT)
        raise SystemExit
    except BaseException:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        logger.critical('Unhandled exception occurred: %s\n'
                        % ''.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
        logger.error('BUSCO analysis failed!')
        logger.error('Check the logs, read the user guide; if you still need technical '
                     'support, then please contact %s\n' % BuscoConfig.CONTACT)
        raise SystemExit
    return summary_path
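
# --- Illustrative sketch (not part of QUAST/BUSCO): the config resolution order
# used by main() above. A readable file named by $BUSCO_CONFIG_FILE wins;
# otherwise the bundled config.ini.default next to the script is used.
import os

def resolve_busco_config(script_dir):
    env_cfg = os.environ.get('BUSCO_CONFIG_FILE')
    if env_cfg and os.access(env_cfg, os.R_OK):
        return env_cfg
    return os.path.join(script_dir, 'config.ini.default')
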
def align_single_file(fpath, main_output_dir, output_dirpath, log_path, err_fpath, max_threads, sam_fpath=None,
                      bam_fpath=None, index=None, required_files=None, is_reference=False,
                      alignment_only=False, using_reads='all'):
    filename = qutils.name_from_fpath(fpath)
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or join(output_dirpath, filename + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    if using_reads != 'all':
        sam_fpath = join(output_dirpath, filename + '.' + using_reads + '.sam')
        bam_fpath = sam_fpath.replace('.sam', '.bam')
    if alignment_only or (is_reference and required_files and any(f.endswith('bed') for f in required_files)):
        required_files.append(sam_fpath)

    stats_fpath = get_safe_fpath(dirname(output_dirpath), filename + '.stat')
    index_str = qutils.index_to_str(index) if index is not None else ''
    reads_fpaths = qconfig.reads_fpaths

    correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    can_reuse = correct_chr_names is not None
    if not can_reuse and not reads_fpaths:
        return None, None, None
    if correct_chr_names and (not required_files or all(isfile(fpath) for fpath in required_files)):
        if not alignment_only:
            if isfile(stats_fpath):
                logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
            elif isfile(bam_fpath):
                qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                       stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
            analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
            calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
        if isfile(stats_fpath) or alignment_only:
            return correct_chr_names, sam_fpath, bam_fpath

    logger.info(' ' + index_str + 'Pre-processing reads...')
    if is_non_empty_file(sam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif is_non_empty_file(bam_fpath) and can_reuse:
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
        sambamba_view(bam_fpath, sam_fpath, qconfig.max_threads, err_fpath, logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    if (not correct_chr_names or not is_non_empty_file(sam_fpath)) and reads_fpaths:
        if is_reference:
            logger.info(' Running BWA for reference...')
        else:
            logger.info(' ' + index_str + 'Running BWA...')
        # use absolute paths because we will change workdir
        fpath = abspath(fpath)
        sam_fpath = abspath(sam_fpath)
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        bwa_index(fpath, err_fpath, logger)
        sam_fpaths = align_reads(fpath, sam_fpath, using_reads, main_output_dir, err_fpath, max_threads)
        if len(sam_fpaths) > 1:
            merge_sam_files(sam_fpaths, sam_fpath, bam_fpath, main_output_dir, max_threads, err_fpath)
        elif len(sam_fpaths) == 1:
            shutil.move(sam_fpaths[0], sam_fpath)
            sambamba_view(sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

        logger.info(' ' + index_str + 'Done.')
        os.chdir(prev_dir)
        if not is_non_empty_file(sam_fpath):
            logger.error(' Failed running BWA for ' + fpath + '. See ' + log_path + ' for information.')
            return None, None, None
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, fpath, sam_fpath, err_fpath, reads_fpaths, logger, is_reference)
    elif not correct_chr_names or not is_non_empty_file(sam_fpath):
        return None, None, None

    if is_reference:
        logger.info(' Sorting SAM-file for reference...')
    else:
        logger.info(' ' + index_str + 'Sorting SAM-file...')
    if can_reuse and is_non_empty_file(bam_fpath) and all_read_names_correct(sam_fpath):
        logger.info(' ' + index_str + 'Using existing BAM-file: ' + bam_fpath)
    else:
        correct_sam_fpath = join(output_dirpath, filename + '.correct.sam')  # write in output dir
        sam_fpath = clean_read_names(sam_fpath, correct_sam_fpath)
        sambamba_view(correct_sam_fpath, bam_fpath, max_threads, err_fpath, logger, filter_rule=None)

    qutils.assert_file_exists(bam_fpath, 'bam file')
    if not alignment_only:
        if isfile(stats_fpath):
            logger.info(' ' + index_str + 'Using existing flag statistics file ' + stats_fpath)
        elif isfile(bam_fpath):
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'flagstat', '-t', str(max_threads), bam_fpath],
                                   stdout=open(stats_fpath, 'w'), stderr=open(err_fpath, 'a'))
        analyse_coverage(output_dirpath, fpath, correct_chr_names, bam_fpath, stats_fpath, err_fpath, logger)
        calc_lap_score(reads_fpaths, sam_fpath, index, index_str, output_dirpath, fpath, filename, err_fpath)
    if is_reference:
        logger.info(' Analysis for reference is finished.')
    else:
        logger.info(' ' + index_str + 'Analysis is finished.')
    return correct_chr_names, sam_fpath, bam_fpath
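
# --- Illustrative sketch (not part of QUAST): computing flag statistics for an
# existing BAM the way align_single_file() does, assuming a 'sambamba' binary on
# PATH. 'sambamba flagstat' prints samtools-style mapping statistics to stdout.
import subprocess

def write_flagstat(bam_fpath, stats_fpath, threads=4):
    with open(stats_fpath, 'w') as out:
        subprocess.check_call(['sambamba', 'flagstat', '-t', str(threads), bam_fpath], stdout=out)
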
def download_ref(organism, ref_fpath, max_ref_fragments):
    organism = organism.replace('_', '+')
    isolate = ''
    strain = ''
    if '+isolate+' in organism:
        organism, isolate = organism.split('+isolate+')
    if '+strain+' in organism:
        organism, strain = organism.split('+strain+')
    response = try_send_request(
        ncbi_url + 'esearch.fcgi?db=assembly&term=%s+[Organism]%s%s&retmax=100%s' %
        (organism, (isolate + '+[Isolate]') if isolate else '', (strain + '+[Strain]') if strain else '', quast_fields))
    if not response:
        return None
    xml_tree = ET.fromstring(response)

    if xml_tree.find('Count').text == '0':  # Organism is not found
        return None

    ref_id_list = xml_tree.find('IdList').findall('Id')
    best_ref_links = get_download_links(ref_id_list, "assembly_nuccore_refseq+OR+assembly_nuccore_insdc")
    used_db = "refseq"
    if not best_ref_links:
        used_db = "wgsmaster"
        best_ref_links = get_download_links(ref_id_list, "assembly_nuccore_wgsmaster")

    if len(best_ref_links) > max_ref_fragments:
        logger.info('%s has too fragmented reference genome! It will not be downloaded.' % organism.replace('+', ' '))
        return None

    if used_db == "refseq" and best_ref_links:
        ref_ids = sorted(link.find('Id').text for link in best_ref_links)
        is_first_piece = False
        fasta_files = []
        chunk_size = 200
        for i in range(0, len(ref_ids), chunk_size):
            fasta = try_send_request(ncbi_url + 'efetch.fcgi?db=sequences&id=%s&rettype=fasta&retmode=text' %
                                     ','.join(ref_ids[i:i + chunk_size]))
            if fasta and fasta[0] == '>':
                fasta_files.extend(fasta.rstrip().split('\n\n'))
        fasta_names = [f.split(' ')[0] for f in fasta_files]
        with open(ref_fpath, "w") as fasta_file:
            for name, fasta in sorted(zip(fasta_names, fasta_files), key=natural_sort_key):
                if not is_first_piece:
                    is_first_piece = True
                else:
                    fasta = '\n' + fasta.rstrip()
                fasta_file.write(fasta.rstrip())
    elif best_ref_links:
        # download the WGS master record (the assembly is available only as a WGS project)
        try:
            download_wgsmaster_contigs(best_ref_links[0].find('Id').text, ref_fpath)
        except Exception:
            logger.info('Failed downloading %s!' % organism.replace('+', ' '))

    if not os.path.isfile(ref_fpath):
        return None
    if not is_non_empty_file(ref_fpath):
        os.remove(ref_fpath)
        return None
    return ref_fpath
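
# --- Illustrative sketch (not part of QUAST): batching sequence IDs into efetch
# requests of at most chunk_size IDs, as download_ref() does above. Chunking
# keeps request URLs short and bounds the number of NCBI round trips.
def iter_id_chunks(ref_ids, chunk_size=200):
    for i in range(0, len(ref_ids), chunk_size):
        yield ','.join(ref_ids[i:i + chunk_size])

# e.g. for ids in iter_id_chunks(['11111', '22222', '33333'], chunk_size=2): build one efetch URL per chunk
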
def download_blastdb(logger=logger, only_clean=False):
    global blastdb_dirpath
    blastdb_dirpath = get_dir_for_download('silva', 'Silva', [silva_downloaded_fname + '.nsq'], logger, only_clean=only_clean)
    if not blastdb_dirpath:
        return False

    if only_clean:
        if os.path.isdir(blastdb_dirpath):
            logger.info('Removing ' + blastdb_dirpath)
            shutil.rmtree(blastdb_dirpath)
        return True

    global db_fpath
    db_fpath = join(blastdb_dirpath, silva_downloaded_fname)
    if os.path.isfile(db_fpath + '.nsq') and os.path.getsize(db_fpath + '.nsq') >= db_nsq_fsize:
        return True
    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()
    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.makedirs(blastdb_dirpath)
        silva_download = urllib.FancyURLopener()
        silva_remote_fpath = silva_db_url + silva_fname + '.gz'
        silva_download_in_progress_path = db_gz_fpath + '.download'
        try:
            silva_download.retrieve(silva_remote_fpath, silva_download_in_progress_path, show_progress)
            if not qutils.is_non_empty_file(silva_download_in_progress_path, min_size=1024 * 1024):
                raise ValueError
        except Exception:
            logger.error('Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                         'Try to download it manually, put it under %s/ and restart your command.' % (silva_remote_fpath, blastdb_dirpath))
            return False
        shutil.move(silva_download_in_progress_path, db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not qutils.is_non_empty_file(silva_fpath):
        logger.info('Unpacking and replacing " " with "_"...')
        unpacked_fpath = silva_fpath + ".unpacked"
        cmd = "gunzip -c %s" % db_gz_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_fpath, 'w'), stderr=open(log_fpath, 'a'), logger=logger)

        substituted_fpath = silva_fpath + ".substituted"
        with open(unpacked_fpath) as in_file:
            with open(substituted_fpath, 'w') as out_file:
                for line in in_file:
                    out_file.write(line.replace(' ', '_'))
        os.remove(unpacked_fpath)
        shutil.move(substituted_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    cmd = get_blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' % (silva_fpath, db_fpath))
    qutils.call_subprocess(shlex.split(cmd), stdout=open(log_fpath, 'a'), stderr=open(log_fpath, 'a'), logger=logger)
    if not os.path.exists(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return False
    elif not qconfig.debug:
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return True
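
# --- Illustrative sketch (not part of QUAST): the "download to a temporary name,
# validate, then move into place" pattern used above, with stdlib urlretrieve in
# place of FancyURLopener. safe_download() and its threshold are hypothetical.
import os
import shutil
try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2

def safe_download(url, dest_fpath, min_size=1024 * 1024):
    tmp_fpath = dest_fpath + '.download'
    urlretrieve(url, tmp_fpath)
    if os.path.getsize(tmp_fpath) < min_size:  # a too-small file indicates a failed or truncated download
        os.remove(tmp_fpath)
        raise ValueError('Downloaded file is suspiciously small: ' + url)
    shutil.move(tmp_fpath, dest_fpath)  # only a complete download reaches the final name
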
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            stats_content = open(kmc_stats_fpath).read().split('\n')
            if not stats_content or not stats_content[0].strip():  # skip empty stats files
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(stats_content[1].strip().split(': ')[-1])
                len_map_to_multi_chrom = int(stats_content[2].strip().split(': ')[-1])
                len_map_to_none_chrom = int(stats_content[3].strip().split(': ')[-1])
                total_len = int(stats_content[4].strip().split(': ')[-1])
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]
    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000
    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath, kmer_fraction=kmer_fraction)

    shared_kmers_by_chrom = dict()
    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    with open(shared_kmers_fpath, 'w') as out_f:
        for name, seq in ref_contigs.items():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq, intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                shared_kmers_by_chrom[str(kmer)] = name
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')

    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db], log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue
                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db], log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                if kmers_cnt < MIN_MARKERS:
                    continue
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db], log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
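
# --- Illustrative sketch (not part of QUAST): the two summary formulas used in
# do() above. Completeness is the percentage of distinct reference k-mers found
# in the assembly; the length breakdown classifies each contig by how many
# reference chromosomes its marker k-mers hit. Function names are hypothetical.
def kmer_completeness(matched_kmers, unique_ref_kmers):
    return matched_kmers * 100.0 / unique_ref_kmers

def length_breakdown(contig_lens, contig_markers, total_len):
    # contigs whose markers hit exactly one chromosome vs. several; the rest
    # (no markers at all) fall into the "none" bucket by subtraction
    one = sum(l for name, l in contig_lens.items() if len(contig_markers.get(name, [])) == 1)
    multi = sum(l for name, l in contig_lens.items() if len(contig_markers.get(name, [])) > 1)
    return one, multi, total_len - one - multi
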