def get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs):
    """Select the references that have enough aligned sequence.

    The genome info report is read line by line (its first line is skipped,
    presumably a header); reading stops at the first empty line. Only lines
    containing exactly two ``length: <digits>`` entries are used: the first
    whitespace-separated token is the chromosome name, the first length is the
    total length and the second one is the aligned length.

    :param genome_info_fpath: path to the genome info report.
    :param ref_fpaths: iterable of reference file paths.
    :param chromosomes_by_refs: mapping from a reference name (file name
        without FASTA extension) to a sequence of entries whose first item
        is the chromosome name.
    :return: list of reference paths whose summed aligned length is non-zero
        and exceeds ``qconfig.downloaded_ref_min_aligned_rate`` of the summed
        total length.
    """
    length_pattern = r'length: (\d+)'
    lengths_by_chrom = {}
    with open(genome_info_fpath, 'r') as report:
        report.readline()  # the first line carries no per-chromosome data
        for record in report:
            if not record or record == '\n':
                break
            found = re.findall(length_pattern, record)
            if len(found) != 2:
                continue
            total_len, aligned_len = found
            lengths_by_chrom[record.split()[0]] = (total_len, aligned_len)

    selected = []
    min_rate_owner = qconfig  # threshold is read lazily, only when needed
    for fpath in ref_fpaths:
        ref_label, _ = qutils.splitext_for_fasta_file(os.path.basename(fpath))
        aligned_total = 0
        full_total = 0
        for chrom in chromosomes_by_refs[ref_label]:
            chrom_name = chrom[0]
            if chrom_name not in lengths_by_chrom:
                continue
            chrom_len, chrom_aligned = lengths_by_chrom[chrom_name]
            aligned_total += int(chrom_aligned)
            full_total += int(chrom_len)
        if aligned_total and aligned_total > full_total * min_rate_owner.downloaded_ref_min_aligned_rate:
            selected.append(fpath)
    return selected
def get_downloaded_refs_with_alignments(genome_info_fpath, ref_fpaths, chromosomes_by_refs):
    """Select the references that have enough aligned sequence.

    The genome info report is read line by line (its first line is skipped,
    presumably a header); reading stops at the first empty line. Each line is
    split on whitespace: field 0 is the chromosome name, field 3 is used as
    the total length and field 8 as the aligned length.

    :param genome_info_fpath: path to the genome info report.
    :param ref_fpaths: iterable of reference file paths.
    :param chromosomes_by_refs: mapping from a reference name (file name
        without FASTA extension) to a sequence of entries whose first item
        is the chromosome name.
    :return: list of reference paths whose summed aligned length is non-zero
        and exceeds ``qconfig.downloaded_ref_min_aligned_rate`` of the summed
        total length.
    """
    lengths_by_chrom = {}
    with open(genome_info_fpath, 'r') as report:
        report.readline()  # the first line carries no per-chromosome data
        for record in report:
            if not record or record == '\n':
                break
            fields = record.split()
            lengths_by_chrom[fields[0]] = (fields[3], fields[8])

    selected = []
    for fpath in ref_fpaths:
        ref_label, _ = qutils.splitext_for_fasta_file(os.path.basename(fpath))
        aligned_total = 0
        full_total = 0
        for chrom in chromosomes_by_refs[ref_label]:
            chrom_name = chrom[0]
            if chrom_name not in lengths_by_chrom:
                continue
            chrom_len, chrom_aligned = lengths_by_chrom[chrom_name]
            # Lengths are kept as text in the table and converted on use.
            aligned_total += int(chrom_aligned)
            full_total += int(chrom_len)
        if aligned_total and aligned_total > full_total * qconfig.downloaded_ref_min_aligned_rate:
            selected.append(fpath)
    return selected
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of the references and combine them into one file.

    For every reference, each sequence is renamed (corrected file name + '_' +
    unique sequence name) and appended to a per-reference corrected file inside
    ``corrected_dirpath``. When all sequences of a reference were accepted, the
    corrected file is appended to the combined reference file.

    A reference is rejected when it yields no sequences or when
    ``correct_seq`` returns a falsy value for one of them (only checked if
    ``qconfig.no_check`` is not set). With ``downloaded_refs`` a rejected
    reference is skipped with a warning and removed from ``ref_fpaths``
    (the list is modified in place); otherwise an error is logged with
    ``exit_with_code=1``.

    :param ref_fpaths: list of reference file paths (may be modified in place).
    :param corrected_dirpath: directory for corrected and combined files.
    :param downloaded_refs: whether rejected references are skipped instead of
        being treated as a fatal error.
    :return: tuple ``(corrected_ref_fpaths, combined_ref_fpath,
        chromosomes_by_refs, ref_fpaths)`` where ``chromosomes_by_refs`` maps
        a reference name to a list of ``(corrected sequence name, length)``.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Validate one sequence and append it to the reference's corrected file.
        seq_fname = ref_name
        seq_fname += ref_fasta_ext
        if total_references > 1:
            # Later sequences of the same reference reuse the file created
            # for the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)
        corr_seq_fpath = None
        # Remember how many corrected paths existed before this reference so
        # that cleanup removes only what this reference added (nothing at all
        # for an empty file).
        n_corrected_before = len(corrected_ref_fpaths)
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext,
                                                         total_references, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            del corrected_ref_fpaths[n_corrected_before:]
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)
    if chromosomes_by_refs:
        logger.main_info(' All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')
    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Generate the Upper Bound Assembly for a reference and return its path.

    :param ref_fpath: reference FASTA used for alignment and region extraction.
    :param original_ref_fpath: original reference path; its directory is
        checked for an already prepared assembly and is mentioned in the
        final notice.
    :param output_dirpath: directory for the result, the log file and a
        temporary ``tmp`` sub-directory.
    :return: path to the generated (or reused) assembly FASTA, or ``None``
        when the required tools are unavailable, the platform is
        ``linux_32``, or the unique covered regions could not be computed.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    # Bail out early when the required third-party tools are unavailable.
    if not reads_analyzer.compile_reads_analyzer_tools(logger):
        logger.warning(
            ' Sorry, can\'t create Upper Bound Assembly '
            '(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...'
        )
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            ' Sorry, can\'t create Upper Bound Assembly on this platform '
            '(only linux64 and macOS are supported), skipping...')
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(
            ' Sorry, can\'t create Upper Bound Assembly '
            '(failed to install/download third-party repeat finding tool [Red]), skipping...'
        )
        return None

    # Fall back to the default insert size when none is configured yet.
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # Result file name: <reference>.<basename>.is<insert size>.fasta, plus a
    # suffix describing the reads used for joining (long reads or mate pairs).
    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # Same naming scheme for a prepared assembly stored next to the original
    # reference file.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    if long_reads:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        prepared_optimal_assembly_basename = add_suffix(
            prepared_optimal_assembly_basename, mp_polished_suffix)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
    already_done_fpath = check_prepared_optimal_assembly(
        insert_size, result_fpath, ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)

    # The configured insert size is re-read here; presumably align_reference
    # may have updated it with an estimated value -- TODO confirm. If it now
    # differs, all insert-size-dependent paths are rebuilt and the reuse
    # check is repeated.
    if qconfig.optimal_assembly_insert_size != 'auto' and qconfig.optimal_assembly_insert_size != insert_size:
        calculated_insert_size = qconfig.optimal_assembly_insert_size
        result_fpath = result_fpath.replace('is' + str(insert_size), 'is' + str(calculated_insert_size))
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(
            'is' + str(insert_size), 'is' + str(calculated_insert_size))
        insert_size = calculated_insert_size
        ref_prepared_optimal_assembly = os.path.join(
            os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(
            insert_size, result_fpath, ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    # Start from a clean temporary directory.
    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(
            ' Failed to create Upper Bound Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Joining mode: regions are paired using long reads or mate pairs
        # and output via scaffolding.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath, tmp_dir, log_fpath, join_reads)
        # Uncovered regions and read length are only computed for mate pairs.
        uncovered_regions = parse_bed(
            uncovered_fpath) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output, repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # No joining reads: every sufficiently long unique covered region
        # becomes its own entry named <chromosome>_<region index>.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretical Upper Bound Assembly is saved to ' + result_fpath)
    logger.notice(
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n'
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). '
        'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n'
        '\t\tOR\n'
        '\tYou can copy ' + result_fpath + ' to ' + ref_prepared_optimal_assembly + '. '
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference (' + original_ref_fpath + ') and '
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size ' + str(insert_size) + '), '
        'QUAST will reuse this Upper Bound Assembly.\n')
    # The temporary directory is kept in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of the references and combine them into one file.

    For every reference, each sequence is renamed (corrected file name + '_' +
    unique sequence name) and appended to a per-reference corrected file inside
    ``corrected_dirpath``. When all sequences of a reference were accepted, the
    corrected file is appended to the combined reference file.

    A reference is rejected when it yields no sequences or when
    ``correct_seq`` returns a falsy value for one of them (only checked if
    ``qconfig.no_check`` is not set). With ``downloaded_refs`` a rejected
    reference is skipped with a warning and removed from ``ref_fpaths``
    (the list is modified in place); otherwise an error is logged with
    ``exit_with_code=1``.

    :param ref_fpaths: list of reference file paths (may be modified in place).
    :param corrected_dirpath: directory for corrected and combined files.
    :param downloaded_refs: whether rejected references are skipped instead of
        being treated as a fatal error.
    :return: tuple ``(corrected_ref_fpaths, combined_ref_fpath,
        chromosomes_by_refs, ref_fpaths)`` where ``chromosomes_by_refs`` maps
        a reference name to a list of ``(corrected sequence name, length)``.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Validate one sequence and append it to the reference's corrected file.
        seq_fname = ref_name
        seq_fname += ref_fasta_ext
        if total_references > 1:
            # Later sequences of the same reference reuse the file created
            # for the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        contigs_analyzer.ref_labels_by_chromosomes[
            corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)
        corr_seq_fpath = None
        # Remember how many corrected paths existed before this reference so
        # that cleanup removes only what this reference added (nothing at all
        # for an empty file).
        n_corrected_before = len(corrected_ref_fpaths)
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name,
                                    qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references,
                ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' +
                             qutils.name_from_fpath(corr_seq_fpath) + '')
            fastaparser.write_fasta(combined_ref_fpath,
                                    fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning(
                'Skipping ' + ref_fpath + ' because it'
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!'
            )
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            del corrected_ref_fpaths[n_corrected_before:]
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error(
                'Reference file ' + ref_fpath +
                ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                exit_with_code=1)
    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)
    if chromosomes_by_refs:
        logger.main_info(' All references were combined in ' +
                         qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')
    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def split_seq_ext(cls, fname):
    """Split a file name using ``qutils.splitext_for_fasta_file``.

    :param fname: file name to split.
    :return: whatever ``qutils.splitext_for_fasta_file`` returns for ``fname``.
    """
    split_result = qutils.splitext_for_fasta_file(fname)
    return split_result
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate the Optimal Assembly for a reference and return its path.

    :param ref_fpath: reference FASTA used for alignment and region extraction.
    :param original_ref_fpath: original reference path; its directory is
        checked for an already prepared assembly and is mentioned in the
        final notice.
    :param output_dirpath: directory for the result, the log file and a
        temporary ``tmp`` sub-directory.
    :return: path to the generated (or reused) assembly FASTA, or ``None``
        when the platform is ``linux_32``, the Red binary is unavailable, or
        the unique covered regions could not be computed.
    """
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)
    # The insert size is read after the alignment step; presumably the
    # alignment may set it in qconfig -- TODO confirm. Otherwise the default
    # value is used.
    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # Result file name: <reference>.<basename>.is<insert size>.fasta, plus a
    # suffix describing the reads used for joining (long reads or mate pairs).
    ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (
        ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # NOTE(review): unlike the result name, the prepared-assembly name gets
    # no reads-specific suffix here.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(
        os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(
        os.path.dirname(original_ref_fpath), prepared_optimal_assembly_basename)

    # Reuse an existing assembly; the one in the output directory wins over
    # the one stored next to the original reference.
    if os.path.isfile(result_fpath) or os.path.isfile(
            ref_prepared_optimal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(
            result_fpath) else ref_prepared_optimal_assembly
        logger.notice(
            ' Will reuse already generated Optimal Assembly with insert size %d (%s)'
            % (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            ' Sorry, can\'t create Optimal Assembly on this platform, skipping...'
        )
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Optimal Assembly, skipping...')
        return None

    # Start from a clean temporary directory.
    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(
            ' Failed to create Optimal Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        # Joining mode: regions are paired using long reads or mate pairs
        # and output via scaffolding.
        if long_reads:
            join_reads = 'pacbio' if qconfig.pacbio_reads else 'nanopore'
        else:
            join_reads = 'mp'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath, tmp_dir, log_fpath, join_reads)
        # Uncovered regions and read length are only computed for mate pairs.
        uncovered_regions = parse_uncovered_fpath(
            uncovered_fpath, ref_fpath, return_covered_regions=False
        ) if join_reads == 'mp' else defaultdict(list)
        mp_len = calculate_read_len(sam_fpath) if join_reads == 'mp' else None
        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output, repeats_regions[chrom], uncovered_regions[chrom])
    else:
        # No joining reads: every sufficiently long unique covered region
        # becomes its own entry named <chromosome>_<region index>.
        for chrom, seq in reference:
            for idx, region in enumerate(unique_covered_regions[chrom]):
                if region[1] - region[0] >= MIN_CONTIG_LEN:
                    result_fasta.append(
                        (chrom + '_' + str(idx), seq[region[0]:region[1]]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretically optimal Assembly saved to ' + result_fpath)
    logger.notice(
        'You can copy it to ' + ref_prepared_optimal_assembly +
        ' and QUAST will reuse it in further runs against the same reference ('
        + original_ref_fpath + ')')
    # The temporary directory is kept in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate the Ideal Assembly for a reference and return its path.

    The assembly is produced by running the downloaded SPAdes binary on a
    preprocessed copy of the reference with K equal to the insert size, and is
    optionally polished when mate pairs or unpaired reads are configured.

    :param ref_fpath: reference FASTA to process.
    :param original_ref_fpath: original reference path; its directory is
        checked for an already prepared assembly and is mentioned in the
        final notice.
    :param output_dirpath: directory for the result, ``spades.log`` and a
        temporary ``tmp`` sub-directory.
    :return: path to the generated (or reused) assembly FASTA, or ``None``
        when the platform is ``linux_32``, the SPAdes binary is unavailable,
        or the SPAdes run failed.
    """
    logger.print_timestamp()
    logger.main_info("Simulating Ideal Assembly...")

    uncovered_fpath = None
    # NOTE(review): the original condition tested qconfig.reference_sam twice;
    # the duplicate was dropped. Possibly another setting was intended as the
    # third operand -- confirm against qconfig.
    if qconfig.paired_reads or qconfig.reference_sam:
        sam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, join(dirname(output_dirpath), qconfig.reads_stats_dirname),
            using_reads='paired_end')
    insert_size = qconfig.ideal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.ideal_assembly_default_IS
    if insert_size % 2 == 0:
        insert_size += 1
        logger.notice(' Current implementation cannot work with even insert sizes, '
                      'will use the closest odd value (%d)' % insert_size)

    # Result file name: <reference>.<basename>.is<insert size>.fasta, plus
    # suffixes describing the reads used for polishing.
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (ref_basename, qconfig.ideal_assembly_basename, insert_size)
    if qconfig.paired_reads and qconfig.unpaired_reads:
        result_basename = add_suffix(result_basename, single_polished_suffix)
    if qconfig.paired_reads and qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    original_ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_ideal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.ideal_assembly_basename, insert_size)
    ref_prepared_ideal_assembly = os.path.join(os.path.dirname(original_ref_fpath),
                                               prepared_ideal_assembly_basename)

    # Reuse an existing assembly; the one in the output directory wins over
    # the one stored next to the original reference.
    if os.path.isfile(result_fpath) or os.path.isfile(ref_prepared_ideal_assembly):
        already_done_fpath = result_fpath if os.path.isfile(result_fpath) else ref_prepared_ideal_assembly
        logger.notice(' Will reuse already generated Ideal Assembly with insert size %d (%s)' %
                      (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t create Ideal Assembly on this platform, skipping...')
        return None

    base_aux_dir = os.path.join(qconfig.LIBS_LOCATION, 'ideal_assembly')
    configs_dir = os.path.join(base_aux_dir, 'configs')
    binary_fpath = download_external_tool('spades', os.path.join(base_aux_dir, 'bin'), 'spades',
                                          platform_specific=True)
    # Guard against a falsy path: os.path.isfile(None) raises TypeError.
    if not binary_fpath or not os.path.isfile(binary_fpath):
        logger.warning(' Sorry, can\'t create Ideal Assembly, skipping...')
        return None

    # Start from a clean temporary directory.
    log_fpath = os.path.join(output_dirpath, 'spades.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    processed_ref_fpath = preprocess_reference(ref_fpath, tmp_dir, uncovered_fpath)

    dst_configs = os.path.join(tmp_dir, 'configs')
    main_config = os.path.join(dst_configs, 'config.info')
    dir_util._path_created = {}  # see http://stackoverflow.com/questions/9160227/dir-util-copy-tree-fails-after-shutil-rmtree
    dir_util.copy_tree(configs_dir, dst_configs, preserve_times=False)

    prepare_config_spades(main_config, insert_size, processed_ref_fpath, tmp_dir)

    spades_output_fpath = os.path.join(tmp_dir, 'K%d' % insert_size, 'ideal_assembly.fasta')
    logger.info(' ' + 'Running SPAdes with K=' + str(insert_size) + '...')
    # The context manager closes the log file on every path, including errors.
    with open(log_fpath, 'w') as log_file:
        return_code = qutils.call_subprocess(
            [binary_fpath, main_config], stdout=log_file, stderr=log_file, indent=' ')
    if return_code != 0 or not os.path.isfile(spades_output_fpath):
        logger.error(' Failed to create Ideal Assembly, see log for details: ' + log_fpath)
        return None

    if qconfig.mate_pairs or qconfig.unpaired_reads:
        spades_output_fpath = polish_assembly(ref_fpath, spades_output_fpath, output_dirpath, tmp_dir)

    shutil.move(spades_output_fpath, result_fpath)
    logger.info(' ' + 'Ideal Assembly saved to ' + result_fpath)
    logger.notice('You can copy it to ' + ref_prepared_ideal_assembly +
                  ' and QUAST will reuse it in further runs against the same reference (' +
                  original_ref_fpath + ')')

    # The temporary directory is kept in debug mode.
    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
def correct_meta_references(ref_fpaths, corrected_dirpath):
    """Write corrected copies of the references and a combined reference file.

    Each sequence of each reference is renamed (corrected file name + '_' +
    unique sequence name) and appended both to a per-reference corrected file
    in ``corrected_dirpath`` and to the combined reference file. Processing of
    a reference stops at the first sequence for which ``correct_seq`` returns
    a falsy value (only checked if ``qconfig.no_check`` is not set).

    References sharing the same base name are labelled with
    ``qutils.get_label_from_par_dir_and_fname`` instead.

    :param ref_fpaths: list of reference file paths.
    :param corrected_dirpath: directory for corrected and combined files.
    :return: tuple ``(corrected_ref_fpaths, combined_ref_fpath,
        chromosomes_by_refs, ref_fpaths)`` where ``chromosomes_by_refs`` maps
        a reference name to a list of ``(corrected sequence name, length)``.
    """
    corrected_ref_fpaths = []
    chromosomes_by_refs = {}
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Validate one sequence and append it to the corrected and combined files.
        seq_fname = ref_name + ref_fasta_ext
        if total_references <= 1:
            # First sequence of a reference: create its corrected file path.
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        else:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            if not correct_seq(seq, ref_fpath):
                return None, None
        for target_fpath in (corr_seq_fpath, combined_ref_fpath):
            fastaparser.write_fasta(target_fpath, [(corr_seq_name, seq)], 'a')
        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
        return corr_seq_name, corr_seq_fpath

    # Base names occurring more than once need a more specific label.
    base_names = []
    for fpath in ref_fpaths:
        base_name, _ = qutils.splitext_for_fasta_file(os.path.basename(fpath))
        base_names.append(base_name)
    dupl_ref_names = [name for name in base_names if base_names.count(name) > 1]

    for ref_fpath in ref_fpaths:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        if ref_name in dupl_ref_names:
            ref_name = qutils.get_label_from_par_dir_and_fname(ref_fpath)
        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)
        corr_seq_fpath = None
        max_name_len = qutils.MAX_CONTIG_NAME - len(ref_name) - 1
        for seq_number, (raw_name, seq) in enumerate(fastaparser.read_fasta(ref_fpath), 1):
            seq_name = correct_name(raw_name, max_name_len)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(uniq_seq_name, seq, ref_name, ref_fasta_ext,
                                                         seq_number, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')

    logger.main_info(' All references combined in ' + qconfig.combined_ref_name)
    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths