def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Split the contigs of one assembly into per-reference FASTA files.

    The alignments file for ``asm`` is located at
    ``alignments_fpath_template % <file-name label of asm.fpath>``. Each
    non-blank line of that file is split on whitespace: the first token is
    looked up in ``contigs_analyzer.ref_labels_by_chromosomes`` to obtain a
    reference name, and the remaining tokens are treated as the names of
    contigs belonging to that reference. Lines whose first token is not a
    known key are ignored.

    Every matching contig is passed to ``fastaparser.write_fasta`` with mode
    ``'a'`` for ``<corrected_dirpath>/<label>_to_<ref_name>.fasta``, at most
    once per reference. Contigs that were recorded but never matched any
    reference are written to ``<label>_not_aligned_anywhere.fasta``.

    Args:
        asm: Object exposing ``fpath`` and ``label`` attributes.
        assemblies_by_ref: Mapping from reference name to a list; a new
            ``Assembly`` is appended (in place) for each reference that is
            already a key of this mapping.
        corrected_dirpath: Directory in which the output files are created.
        alignments_fpath_template: ``%``-style template that takes the
            file-name label of the assembly and yields the alignments path.

    Returns:
        A ``(assemblies_by_ref, not_aligned_asm)`` tuple, where
        ``not_aligned_asm`` is the ``Assembly`` built for the not-aligned file.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)

    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)

    added_ref_asm = []
    # NOTE(review): `contigs` is only filled while handling a line that names
    # a known reference, so with no such line (or no alignments file) the
    # not-aligned file ends up empty. Preserved as-is; confirm this is intended.
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)

    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                # A blank line splits to []; skip it instead of raising
                # IndexError on values[0].
                if not values:
                    continue
                if values[0] not in contigs_analyzer.ref_labels_by_chromosomes:
                    continue

                ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                # Sets keep the per-contig membership tests below O(1).
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                written_for_ref = aligned_contigs_for_each_ref.setdefault(ref_name, set())

                for cont_name, seq in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names and cont_name not in written_for_ref:
                        # Collect every aligned contig name so the not-aligned
                        # ones can be extracted afterwards.
                        aligned_contig_names.add(cont_name)
                        written_for_ref.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                # Register each per-reference assembly once, and only for
                # references the caller already tracks.
                if ref_asm.name not in added_ref_asm and ref_name in assemblies_by_ref:
                    assemblies_by_ref[ref_name].append(ref_asm)
                    added_ref_asm.append(ref_asm.name)
        # Remove the file only after the `with` block has closed it.
        if qconfig.space_efficient:
            os.remove(alignments_fpath)

    # Extract the contigs that were not aligned to any reference. Iterating
    # the dict (not a set difference) keeps the output order reproducible
    # on interpreters whose dicts preserve insertion order.
    not_aligned_contigs = [(name, contigs[name]) for name in contigs
                           if name not in aligned_contig_names]
    fastaparser.write_fasta(not_aligned_fpath, not_aligned_contigs)

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Split the contigs of one assembly into per-reference FASTA files.

    The alignments file for ``asm`` is located at
    ``alignments_fpath_template % <file-name label of asm.fpath>``. Each
    non-blank line of that file is split on whitespace: the first token is
    looked up in ``contigs_analyzer.ref_labels_by_chromosomes`` to obtain a
    reference name, and the remaining tokens are treated as the names of
    contigs belonging to that reference. Lines whose first token is not a
    known key are ignored.

    Every matching contig is passed to ``fastaparser.write_fasta`` with mode
    ``'a'`` for ``<corrected_dirpath>/<label>_to_<ref_name>.fasta``, at most
    once per reference. Contigs that were recorded but never matched any
    reference are written to ``<label>_not_aligned_anywhere.fasta``.

    Args:
        asm: Object exposing ``fpath`` and ``label`` attributes.
        assemblies_by_ref: Mapping from reference name to a list; a new
            ``Assembly`` is appended (in place) for each reference that is
            already a key of this mapping.
        corrected_dirpath: Directory in which the output files are created.
        alignments_fpath_template: ``%``-style template that takes the
            file-name label of the assembly and yields the alignments path.

    Returns:
        A ``(assemblies_by_ref, not_aligned_asm)`` tuple, where
        ``not_aligned_asm`` is the ``Assembly`` built for the not-aligned file.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)

    not_aligned_fname = corr_assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)

    added_ref_asm = []
    # NOTE(review): `contigs` is only filled while handling a line that names
    # a known reference, so with no such line (or no alignments file) the
    # not-aligned file ends up empty. Preserved as-is; confirm this is intended.
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)

    alignments_fpath = alignments_fpath_template % corr_assembly_label
    if os.path.exists(alignments_fpath):
        with open(alignments_fpath) as f:
            for line in f:
                values = line.split()
                # A blank line splits to []; skip it instead of raising
                # IndexError on values[0].
                if not values:
                    continue
                if values[0] not in contigs_analyzer.ref_labels_by_chromosomes:
                    continue

                ref_name = contigs_analyzer.ref_labels_by_chromosomes[values[0]]
                # Sets keep the per-contig membership tests below O(1).
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, corr_assembly_label + '_to_' + ref_name + '.fasta')
                written_for_ref = aligned_contigs_for_each_ref.setdefault(ref_name, set())

                for cont_name, seq in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names and cont_name not in written_for_ref:
                        # Collect every aligned contig name so the not-aligned
                        # ones can be extracted afterwards.
                        aligned_contig_names.add(cont_name)
                        written_for_ref.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                # Register each per-reference assembly once, and only for
                # references the caller already tracks.
                if ref_asm.name not in added_ref_asm and ref_name in assemblies_by_ref:
                    assemblies_by_ref[ref_name].append(ref_asm)
                    added_ref_asm.append(ref_asm.name)
        # Remove the file only after the `with` block has closed it.
        if qconfig.space_efficient:
            os.remove(alignments_fpath)

    # Extract the contigs that were not aligned to any reference. Iterating
    # the dict (not a set difference) keeps the output order reproducible
    # on interpreters whose dicts preserve insertion order.
    not_aligned_contigs = [(name, contigs[name]) for name in contigs
                           if name not in aligned_contig_names]
    fastaparser.write_fasta(not_aligned_fpath, not_aligned_contigs)

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm