def run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Run minimap2 for AGB mode, aligning contigs against a reference.

    :param out_fpath: file that receives the tool's stdout (overwritten).
    :param ref_fpath: reference file passed to the tool as the target.
    :param contigs_fpath: contigs file passed to the tool as the query.
    :param log_err_fpath: file the tool's stderr is appended to.
    :param index: passed through ``qutils.index_to_str`` to build the log indent.
    :param max_threads: value for the tool's ``-t`` option.
    :return: whatever ``qutils.call_subprocess`` returns for the command.
    """
    # run minimap2 for AGB
    mask_level = '1' if qconfig.min_IDY < 95 else '0.9'
    cmdline = [
        minimap_fpath(), '-cx', 'asm20', '--mask-level', mask_level,
        '-N', '100', '--score-N', '0', '-E', '1,0', '-f', '200', '--cs',
        '-t', str(max_threads), ref_fpath, contigs_fpath,
    ]
    # Open both streams in a context manager so the handles are closed once the
    # call returns (assumes call_subprocess blocks until the tool exits).
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        return_code = qutils.call_subprocess(
            cmdline, stdout=out_f, stderr=err_f,
            indent=' ' + qutils.index_to_str(index))
    return return_code
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Align contigs to a reference with minimap2 (AGV-dispatching variant).

    NOTE(review): another ``run_minimap`` is defined later in this file and
    replaces this one when the module is loaded; confirm whether this variant
    (which dispatches on ``qconfig.is_agv_mode`` to ``run_minimap_agv``) is
    still needed.

    :param out_fpath: file that receives the tool's stdout (overwritten).
    :param ref_fpath: reference file passed to the tool as the target.
    :param contigs_fpath: contigs file passed to the tool as the query.
    :param log_err_fpath: file the tool's stderr is appended to.
    :param index: passed through ``qutils.index_to_str`` to build the log indent.
    :param max_threads: value for the tool's ``-t`` option.
    :return: whatever the delegated call / ``qutils.call_subprocess`` returns.
    """
    if qconfig.is_agv_mode:
        return run_minimap_agv(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads)

    strict_identity = qconfig.min_IDY >= 95 and not qconfig.is_combined_ref
    preset = 'asm5' if strict_identity else 'asm10'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = [
        '-B5', '-O4,16', '--no-long-join',
        '-r', str(qconfig.MAX_INDEL_LENGTH),
        '-N', num_alignments,
        '-s', str(qconfig.min_alignment),
        '-z', '200',
    ]
    # The tuning options above are dropped entirely for large genomes.
    cmdline = [minimap_fpath(), '-c', '-x', preset] + \
              (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500',
               '--score-N', '2', '--cs', '-t', str(max_threads),
               ref_fpath, contigs_fpath]
    # Open both streams in a context manager so the handles are closed once the
    # call returns (assumes call_subprocess blocks until the tool exits).
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        return_code = qutils.call_subprocess(
            cmdline, stdout=out_f, stderr=err_f,
            indent=' ' + qutils.index_to_str(index))
    return return_code
def run_minimap(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads):
    """Align contigs to a reference with minimap2.

    In AGB mode the work is delegated to ``run_minimap_agb``. Otherwise the
    minimap2 preset is picked from ``qconfig.min_IDY`` (``asm20`` below 90,
    ``asm10`` below 95, ``asm5`` otherwise).

    :param out_fpath: file that receives the tool's stdout (overwritten).
    :param ref_fpath: reference file passed to the tool as the target.
    :param contigs_fpath: contigs file passed to the tool as the query.
    :param log_err_fpath: file the tool's stderr is appended to.
    :param index: passed through ``qutils.index_to_str`` to build the log indent.
    :param max_threads: value for the tool's ``-t`` option.
    :return: whatever the delegated call / ``qutils.call_subprocess`` returns.
    """
    if qconfig.is_agb_mode:
        return run_minimap_agb(out_fpath, ref_fpath, contigs_fpath, log_err_fpath, index, max_threads)

    if qconfig.min_IDY < 90:
        preset = 'asm20'
    elif qconfig.min_IDY < 95:
        preset = 'asm10'
    else:
        preset = 'asm5'
    # -s -- min CIGAR score, -z -- affects how often to stop alignment extension, -B -- mismatch penalty
    # -O -- gap penalty, -r -- max gap size
    mask_level = '1' if qconfig.is_combined_ref else '0.9'
    num_alignments = '100' if qconfig.is_combined_ref else '50'
    additional_options = [
        '-B5', '-O4,16', '--no-long-join',
        '-r', str(qconfig.MAX_INDEL_LENGTH),
        '-N', num_alignments,
        '-s', str(qconfig.min_alignment),
        '-z', '200',
    ]
    # The tuning options above are dropped entirely for large genomes.
    cmdline = [minimap_fpath(), '-c', '-x', preset] + \
              (additional_options if not qconfig.large_genome else []) + \
              ['--mask-level', mask_level, '--min-occ', '200', '-g', '2500',
               '--score-N', '2', '--cs', '-t', str(max_threads),
               ref_fpath, contigs_fpath]
    # Open both streams in a context manager so the handles are closed once the
    # call returns (assumes call_subprocess blocks until the tool exits).
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        return_code = qutils.call_subprocess(
            cmdline, stdout=out_f, stderr=err_f,
            indent=' ' + qutils.index_to_str(index))
    return return_code
def align_kmers(output_dir, ref_fpath, kmers_fpath, log_err_fpath, max_threads):
    """Align k-mers to the reference and group the hits by target sequence.

    Runs minimap2 with the ``sr`` preset, writes its stdout to
    ``<output_dir>/kmers.coords`` and then parses that file.

    :param output_dir: directory where ``kmers.coords`` is written.
    :param ref_fpath: reference file passed to the tool as the target.
    :param kmers_fpath: k-mers file passed to the tool as the query.
    :param log_err_fpath: file the tool's stderr is appended to.
    :param max_threads: value for the tool's ``-t`` option.
    :return: ``(kmers_by_chrom, kmers_pos_by_chrom)`` -- two
        ``defaultdict(list)`` objects keyed by column 6 of each output line,
        holding ``int(column 1)`` and ``int(column 8)`` respectively, in file
        order.
    """
    out_fpath = join(output_dir, 'kmers.coords')
    cmdline = [
        minimap_fpath(), '-cx', 'sr',
        '-s' + str(qconfig.unique_kmer_len * 2),
        '--frag=no', '-t', str(max_threads),
        ref_fpath, kmers_fpath,
    ]
    # Close the output handle before the file is read back below (assumes
    # call_subprocess blocks until the tool exits).
    with open(out_fpath, 'w') as out_f, open(log_err_fpath, 'a') as err_f:
        qutils.call_subprocess(cmdline, stdout=out_f, stderr=err_f, indent=' ')

    kmers_pos_by_chrom = defaultdict(list)
    kmers_by_chrom = defaultdict(list)
    with open(out_fpath) as f:
        for line in f:
            fs = line.split('\t')
            # Skip lines that do not have at least 10 tab-separated fields.
            if len(fs) < 10:
                continue
            # Presumably PAF columns: query name, target name, target start
            # -- TODO confirm against the minimap2 output format.
            contig, chrom, pos = fs[0], fs[5], fs[7]
            kmers_pos_by_chrom[chrom].append(int(pos))
            # The first column must parse as an integer, or int() raises ValueError.
            kmers_by_chrom[chrom].append(int(contig))
    return kmers_by_chrom, kmers_pos_by_chrom
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath,
                               use_long_reads=False):
    """Mask repeats in the reference and return the remaining covered regions.

    Steps, as implemented below:
      1. Run the repeat-masking binary on a symlinked copy of the reference
         (skipped when ``<tmp_dir>/<ref_name>.rpt`` already exists and is
         non-empty).
      2. Keep only repeats at least ``insert_size`` long
         (``<ref_name>.long.rpt``).
      3. Unless ``<ref_name>.rpt.coords.txt`` is already non-empty, extract
         the repeat sequences with bedtools and align them back to the
         reference with minimap2.
      4. Hand the results to ``check_repeats_instances`` and
         ``remove_repeat_regions``.

    :param ref_fpath: path to the reference file.
    :param tmp_dir: working directory for all intermediate files.
    :param log_fpath: log file receiving the tools' output.
    :param binary_fpath: path to the repeat-masking executable.
    :param insert_size: minimal repeat length (end - start) to keep.
    :param uncovered_fpath: passed through to ``remove_repeat_regions``.
    :param use_long_reads: passed through to ``check_repeats_instances``.
    :return: ``(unique_covered_regions, repeats_regions)``, or
        ``(None, None)`` when repeat masking failed or produced no file.
    """
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info(' ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info(' ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        # One shared handle for stdout and stderr: two separate 'w' handles on
        # the same file would overwrite each other's output.
        with open(log_fpath, 'w') as log_f:
            return_code = qutils.call_subprocess(
                [binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                stdout=log_f, stderr=log_f, indent=' ')

    if return_code != 0 or not exists(repeats_fpath):
        return None, None

    long_repeats_fpath = os.path.join(tmp_dir, ref_name + '.long.rpt')
    with open(long_repeats_fpath, 'w') as out, open(repeats_fpath) as in_f:
        for line in in_f:
            fields = line.split('\t')
            repeat_len = int(fields[2]) - int(fields[1])
            if repeat_len >= insert_size:
                # Drop the first character of the record (presumably a leading
                # '>' marker in the tool's output -- TODO confirm).
                out.write(line[1:])

    repeats_fasta_fpath = os.path.join(tmp_dir, ref_name + '.fasta')
    coords_fpath = os.path.join(tmp_dir, ref_name + '.rpt.coords.txt')
    if not is_non_empty_file(coords_fpath):
        # Remove an existing FASTA index so it is not reused by bedtools.
        fasta_index_fpath = ref_fpath + '.fai'
        if exists(fasta_index_fpath):
            os.remove(fasta_index_fpath)
        # NOTE(review): mode 'w' truncates the log written by the masking
        # step above; kept as in the original -- confirm this is intended.
        with open(log_fpath, 'w') as log_f:
            qutils.call_subprocess(
                [bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath,
                 '-bed', long_repeats_fpath, '-fo', repeats_fasta_fpath],
                stderr=log_f, indent=' ')
        cmdline = [
            minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1',
            '--no-long-join', '-r', '100', '-t', str(qconfig.max_threads),
            '-z', '200', ref_fpath, repeats_fasta_fpath,
        ]
        with open(coords_fpath, 'w') as coords_f, open(log_fpath, 'a') as log_f:
            qutils.call_subprocess(cmdline, stdout=coords_f, stderr=log_f)

    filtered_repeats_fpath, repeats_regions = check_repeats_instances(
        coords_fpath, long_repeats_fpath, use_long_reads)
    unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
    return unique_covered_regions, repeats_regions