def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    """Append one reference sequence to its corrected FASTA and to the combined reference.

    The record is named "<corrected file name>_<corrected first 20 characters
    of seq_name>".  Unless ``qconfig.no_check`` is set, an upper-cased copy of
    the sequence with the codes M, K, R, Y, W, S, V, B, H and D replaced by N
    is checked first; any remaining non-ACGTN character makes the function log
    a warning and return ``(None, None)`` without writing anything.

    Uses names defined outside this function: ``corrected_ref_fpaths``,
    ``corrected_dirpath``, ``combined_ref_fpath`` and ``chromosomes_by_refs``.

    Returns ``(corr_seq_name, corr_seq_fpath)`` on success.
    """
    seq_fname = ref_name + ref_fasta_ext

    if total_references > 1:
        # Reuse the most recently registered corrected file.
        corr_seq_fpath = corrected_ref_fpaths[-1]
    else:
        wanted_fpath = os.path.join(corrected_dirpath, seq_fname)
        corr_seq_fpath = qutils.unique_corrected_fpath(wanted_fpath)
        corrected_ref_fpaths.append(corr_seq_fpath)

    name_suffix = qutils.correct_name(seq_name[:20])
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + name_suffix

    if not qconfig.no_check:
        dic = {
            'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
            'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
        }
        pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
        normalised = re.sub(pat, lambda hit: dic[hit.group()], seq.upper())
        if re.search(r'[^ACGTN]', normalised):
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.',
                           indent=' ')
            return None, None

    # The normalised copy is only used for the check above; the sequence is
    # written exactly as it was passed in.
    record = [(corr_seq_name, seq)]
    fastaparser.write_fasta(corr_seq_fpath, record, 'a')
    fastaparser.write_fasta(combined_ref_fpath, record, 'a')

    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
    return corr_seq_name, corr_seq_fpath
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM on every sequence of a FASTA file and summarise the predicted genes.

    Each sequence is written to its own file in a fresh temporary directory
    under ``tmp_dir`` and passed to the ``glimmerhmm`` executable found in
    ``tool_dir``; the tool's stdout and stderr are appended to ``err_path``.
    The GFF outputs of the successful runs are merged into
    ``out_fpath + '_genes.gff'``.

    Returns:
        ``(out_gff_path, unique_count, total, cnt)``, where ``cnt[i]`` is the
        number of genes with ``end - start > gene_lengths[i]``, or
        ``(None, None, None, None)`` when no run succeeded.

    The temporary directory is removed on both exit paths unless
    ``qconfig.debug`` is set.
    """
    def run(contig_path, tmp_path):
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        # '/', '.' and ' ' are replaced so the name can be used as a file name.
        ind = re.sub('[/. ]', '_', ind)
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Previously the temporary directory was left behind on this path.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')

    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            if end - start > gene_length:
                cnt[idx] += 1

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)

    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM on every sequence of a FASTA file and summarise the predicted genes.

    Each sequence is written to its own file in a fresh temporary directory
    under ``tmp_dir`` and passed to the ``glimmerhmm`` executable found in
    ``tool_dir``; the tool's stdout and stderr are appended to ``err_path``.
    The GFF outputs of the successful runs are merged into
    ``out_fpath + '_genes.gff'``.

    Returns:
        ``(out_gff_path, unique_count, total, cnt)``, where ``cnt[i]`` is the
        number of genes with ``end - start > gene_lengths[i]``, or
        ``(None, None, None, None)`` after logging an error when no run
        succeeded.

    The temporary directory is removed on both exit paths unless
    ``qconfig.debug`` is set.
    """
    def run(contig_path, tmp_path):
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # '%' binds tighter than '+', so the label has to be interpolated into
        # the first fragment before the optional hint is appended; formatting
        # the hint (which has no placeholder) raised TypeError.
        message = 'Glimmer failed running Glimmer for %s. ' % qutils.label_from_fpath(fasta_fpath)
        if not qconfig.debug:
            message += 'Run with the --debug option to see the command line.'
        logger.error(message)
        # Previously the temporary directory was left behind on this path.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')

    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        genes.append((gene_id, gene_seq))
        for idx, gene_length in enumerate(gene_lengths):
            if end - start > gene_length:
                cnt[idx] += 1

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)

    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, len(unique), total, cnt
def add_genes_to_fasta(genes, fasta_fpath):
    """Write predicted genes to ``fasta_fpath`` through ``write_fasta``.

    ``genes`` yields 5-item records
    ``(contig_id, strand, left_index, right_index, gene_fasta)``.  Each record
    becomes one ``(name, sequence)`` pair; genes are numbered from 1 and the
    length put in the name is ``right_index - left_index``.  The pairs are
    produced lazily.
    """
    name_template = '>gene_%d|GeneMark.hmm|%d_nt|%s|%d|%d|%s'

    def records():
        number = 0
        for contig_id, strand, left_index, right_index, gene_fasta in genes:
            number += 1
            span = right_index - left_index
            name = name_template % (number, span, strand, left_index, right_index, contig_id)
            yield name, gene_fasta

    write_fasta(fasta_fpath, records())
def add_genes_to_fasta(genes, fasta_fpath):
    """Hand ``write_fasta`` a lazily generated FASTA record for every gene.

    A gene is a 5-item record
    ``(contig_id, strand, left_index, right_index, gene_fasta)``.  Its record
    name carries a 1-based ordinal, the value ``right_index - left_index``,
    the strand, both indices and the contig id.
    """
    def named(ordinal, gene):
        contig_id, strand, left_index, right_index, gene_fasta = gene
        fields = (ordinal, right_index - left_index, strand,
                  left_index, right_index, contig_id)
        return '>gene_%d|GeneMark.hmm|%d_nt|%s|%d|%d|%s' % fields, gene_fasta

    def all_records():
        for position, gene in enumerate(genes):
            yield named(position + 1, gene)

    write_fasta(fasta_fpath, all_records())
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Split the contigs of one assembly by the reference they were aligned to.

    The alignments file (``alignments_fpath_template % <assembly label>``) is
    read line by line: the first field is looked up in
    ``contigs_analyzer.ref_labels_by_chromosomes`` to obtain a reference name,
    the remaining fields are contig names.  Every listed contig is appended,
    once per reference, to ``<label>_to_<reference name[:40]>.fasta`` in
    ``corrected_dirpath``, and one ``Assembly`` per such file is appended to
    ``assemblies_by_ref[reference]`` when that key exists.  Contigs seen while
    processing those lines but aligned nowhere go to
    ``<label>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_asm)``; ``assemblies_by_ref`` is
    modified in place.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    # NOTE(review): iterated once per alignment line, so this is presumably a
    # list rather than a one-shot iterator -- confirm in fastaparser.
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    ref_labels = contigs_analyzer.ref_labels_by_chromosomes
    alignments_fpath = alignments_fpath_template % assembly_label

    if os.path.exists(alignments_fpath):
        # 'with' closes the file; it used to be left to the garbage collector.
        with open(alignments_fpath) as alignments_file:
            for line in alignments_file:
                values = line.split()
                # A blank line has no first field to look up.
                if not values or values[0] not in ref_labels:
                    continue
                ref_name = ref_labels[values[0]]
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, assembly_label + '_to_' + ref_name[:40] + '.fasta')
                already_written = aligned_contigs_for_each_ref.setdefault(ref_name, set())

                for (cont_name, seq) in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names and cont_name not in already_written:
                        # Collecting all aligned contigs names in order to
                        # further extract not-aligned
                        aligned_contig_names.add(cont_name)
                        already_written.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                if ref_asm.name not in added_ref_asm:
                    if ref_name in assemblies_by_ref:
                        assemblies_by_ref[ref_name].append(ref_asm)
                        added_ref_asm.append(ref_asm.name)

    # Extraction of not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath,
                            [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template):
    """Split every assembly into per-reference FASTA files plus a not-aligned remainder.

    For each assembly the file ``alignments_fpath_template % asm.name`` is
    read; on every line the first field is a reference name and the remaining
    fields are contig names.  Listed contigs are appended to
    ``<asm.name>_to_<reference name[:40]>.fasta`` in ``corrected_dirpath`` and
    an ``Assembly`` for that file is appended to the reference's list.
    Contigs named on no line are written to
    ``<asm.name>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)``.
    """
    not_aligned_assemblies = []
    # one list of assemblies per reference
    assemblies_by_ref = dict((qutils.name_from_fpath(fpath), []) for fpath in ref_fpaths)

    for asm in assemblies:
        leftover_fpath = os.path.join(corrected_dirpath,
                                      asm.name + '_not_aligned_anywhere.fasta')
        contigs = {}
        aligned_contig_names = set()

        with open(alignments_fpath_template % asm.name) as alignments_tsv_f:
            for row in alignments_tsv_f:
                fields = row.split()
                ref_name = fields[0]
                listed_names = fields[1:]
                per_ref_fpath = os.path.join(
                    corrected_dirpath, asm.name + '_to_' + ref_name[:40] + '.fasta')

                for cont_name, seq in fastaparser.read_fasta(asm.fpath):
                    # Keep the first sequence seen under each name.
                    contigs.setdefault(cont_name, seq)
                    if cont_name not in listed_names:
                        continue
                    # Remember aligned names so the remainder can be found later.
                    aligned_contig_names.add(cont_name)
                    fastaparser.write_fasta(per_ref_fpath, [(cont_name, seq)], 'a')

                assemblies_by_ref[ref_name].append(Assembly(per_ref_fpath, asm.label))

        # Extraction of not aligned contigs
        leftover_names = set(contigs.keys()) - aligned_contig_names
        leftover_records = [(name, contigs[name]) for name in leftover_names]
        fastaparser.write_fasta(leftover_fpath, leftover_records)
        not_aligned_assemblies.append(Assembly(leftover_fpath, asm.label))

    return assemblies_by_ref, not_aligned_assemblies
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Write one reference sequence to its own corrected FASTA and to the combined reference.

    The file name is ``ref_name`` plus ``ref_fasta_ext``; when
    ``total_references > 1`` the corrected first 20 characters of ``seq_name``
    are inserted in between, after an underscore.  The record is named after
    the resulting unique file path, which is also appended to
    ``corrected_ref_fpaths``.

    Uses ``corrected_dirpath``, ``corrected_ref_fpaths`` and
    ``combined_ref_fpath`` from outside this function.

    Returns the record name.
    """
    name_parts = [ref_name]
    if total_references > 1:
        name_parts.append('_' + qutils.correct_name(seq_name[:20]))
    name_parts.append(ref_fasta_ext)
    seq_fname = ''.join(name_parts)

    corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    record = [(corr_seq_name, seq)]
    for destination in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(destination, record, 'a')
    return corr_seq_name
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a cleaned copy of a FASTA file and, for oversized references, per-chromosome files.

    Sequences shorter than ``min_contig`` are dropped unless ``is_reference``
    is true.  Kept sequences are renamed with ``qutils.correct_name``,
    upper-cased, and have the codes M, K, R, Y, W, S, V, B, H and D replaced
    by N.  If any other non-ACGTN character remains, a warning is logged and
    the function returns ``False`` without writing anything.

    For a reference whose total length exceeds ``qconfig.MAX_REFERENCE_LENGTH``
    every chromosome not exceeding that limit is also written to
    ``splitted_ref/chr_<n><ext>`` next to ``corrected_fpath`` and its path is
    appended to ``qconfig.splitted_ref``.

    Returns ``True`` on success, ``False`` if the file was skipped.
    """
    # Both patterns are loop-invariant, so they are compiled once.
    # correcting alternatives (gage can't work with alternatives)
    ambiguity_codes = re.compile(r'[MKRYWSVBHD]')
    non_acgtn = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if len(seq) < min_contig and not is_reference:
            continue
        corr_name = qutils.correct_name(first_line)

        # seq to uppercase, because later only uppercase letters are looked at
        corr_seq = ambiguity_codes.sub('N', seq.upper())

        # make sure that only A, C, G, T or N are in the sequence
        if non_acgtn.search(corr_seq):
            logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                           indent=' ')
            return False

        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_LENGTH:
            _, fasta_ext = os.path.splitext(corrected_fpath)
            splitted_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'splitted_ref')
            # os.makedirs raises when the directory is left over from an
            # earlier run, so only create it when it is missing.
            if not os.path.isdir(splitted_ref_dirpath):
                os.makedirs(splitted_ref_dirpath)

            for i, (chr_name, chr_seq) in enumerate(modified_fasta_entries):
                if len(chr_seq) > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because it length is greater than " +
                                   str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                splitted_ref_fpath = os.path.join(splitted_ref_dirpath, "chr_" + str(i + 1)) + fasta_ext
                qconfig.splitted_ref.append(splitted_ref_fpath)
                fastaparser.write_fasta(splitted_ref_fpath, [(chr_name, chr_seq)])

            if len(qconfig.splitted_ref) == 0:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False
    return True
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Store a reference sequence under a unique corrected path and in the combined file.

    With more than one reference the file name is
    ``ref_name + '_' + <corrected seq_name[:20]> + ref_fasta_ext``; otherwise
    it is ``ref_name + ref_fasta_ext``.  The unique path chosen for it is
    appended to ``corrected_ref_fpaths`` and the sequence is appended, under
    the name derived from that path, both to that file and to
    ``combined_ref_fpath``.

    ``corrected_dirpath``, ``corrected_ref_fpaths`` and ``combined_ref_fpath``
    come from outside this function.

    Returns the name given to the sequence.
    """
    infix = ''
    if total_references > 1:
        infix = '_' + qutils.correct_name(seq_name[:20])
    seq_fname = ref_name + infix + ref_fasta_ext

    target_fpath = os.path.join(corrected_dirpath, seq_fname)
    corr_seq_fpath = qutils.unique_corrected_fpath(target_fpath)
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    named_seq = (corr_seq_name, seq)
    fastaparser.write_fasta(corr_seq_fpath, [named_seq], 'a')
    fastaparser.write_fasta(combined_ref_fpath, [named_seq], 'a')
    return corr_seq_name
def parallel_partition_contigs(asm, assemblies_by_ref, corrected_dirpath, alignments_fpath_template):
    """Partition one assembly's contigs into per-reference FASTA files.

    Each line of ``alignments_fpath_template % <assembly label>`` holds a
    chromosome name followed by contig names.  The chromosome name is mapped
    to a reference name through ``contigs_analyzer.ref_labels_by_chromosomes``
    (lines with an unknown chromosome are ignored).  Listed contigs are
    appended, at most once per reference, to
    ``<label>_to_<reference name[:40]>.fasta`` in ``corrected_dirpath``; an
    ``Assembly`` for each such file is appended once to
    ``assemblies_by_ref[reference]`` if that key exists.  Contigs seen while
    processing those lines but never listed are written to
    ``<label>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_asm)``; the dict is updated in
    place.
    """
    assembly_label = qutils.label_from_fpath(asm.fpath)
    logger.info(' ' + 'processing ' + assembly_label)
    added_ref_asm = []
    not_aligned_fname = assembly_label + '_not_aligned_anywhere.fasta'
    not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
    contigs = {}
    aligned_contig_names = set()
    aligned_contigs_for_each_ref = {}
    # NOTE(review): looped over once per alignment line, so this presumably
    # returns a list, not a one-shot iterator -- confirm in fastaparser.
    contigs_seq = fastaparser.read_fasta_one_time(asm.fpath)
    chromosome_to_ref = contigs_analyzer.ref_labels_by_chromosomes
    alignments_fpath = alignments_fpath_template % assembly_label

    if os.path.exists(alignments_fpath):
        # The file used to be opened without ever being closed explicitly.
        with open(alignments_fpath) as alignments_file:
            for line in alignments_file:
                values = line.split()
                if not values:
                    # Blank line: there is no chromosome name to look up.
                    continue
                if values[0] not in chromosome_to_ref:
                    continue

                ref_name = chromosome_to_ref[values[0]]
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, assembly_label + '_to_' + ref_name[:40] + '.fasta')
                if ref_name not in aligned_contigs_for_each_ref:
                    aligned_contigs_for_each_ref[ref_name] = set()
                written_for_ref = aligned_contigs_for_each_ref[ref_name]

                for (cont_name, seq) in contigs_seq:
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names and cont_name not in written_for_ref:
                        # Collecting all aligned contigs names in order to
                        # further extract not-aligned
                        aligned_contig_names.add(cont_name)
                        written_for_ref.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, assembly_label)
                if ref_asm.name not in added_ref_asm:
                    if ref_name in assemblies_by_ref:
                        assemblies_by_ref[ref_name].append(ref_asm)
                        added_ref_asm.append(ref_asm.name)

    # Extraction of not aligned contigs
    all_contigs_names = set(contigs.keys())
    not_aligned_contigs_names = all_contigs_names - aligned_contig_names
    fastaparser.write_fasta(not_aligned_fpath,
                            [(name, contigs[name]) for name in not_aligned_contigs_names])

    not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
    return assemblies_by_ref, not_aligned_asm
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    """Register one reference sequence and append it to the corrected FASTA files.

    With more than one reference the last path in ``corrected_ref_fpaths`` is
    reused; otherwise a unique path for ``ref_name + ref_fasta_ext`` inside
    ``corrected_dirpath`` is created and appended to that list.  The record
    name is the name derived from that path, an underscore, and the corrected
    first 20 characters of ``seq_name``.

    Unless ``qconfig.no_check`` is set the sequence is validated first; on
    failure a warning naming ``ref_fpath`` is logged and ``(None, None)`` is
    returned.  On success the record is appended to the corrected file and to
    ``combined_ref_fpath``, and ``contigs_analyzer.ref_labels_by_chromosomes``
    and ``chromosomes_by_refs[ref_name]`` are updated.

    Returns ``(corr_seq_name, corr_seq_fpath)``.
    """
    def only_acgtn(raw):
        # Upper-case, turn the listed ambiguity codes into N, then look for
        # anything that is still not one of A, C, G, T, N.
        dic = {
            'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
            'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
        }
        pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
        replaced = re.sub(pat, lambda found: dic[found.group()], raw.upper())
        return re.compile(r'[^ACGTN]').search(replaced) is None

    seq_fname = ref_name + ref_fasta_ext
    reuse_last_path = total_references > 1
    if reuse_last_path:
        corr_seq_fpath = corrected_ref_fpaths[-1]
    else:
        corr_seq_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, seq_fname))
        corrected_ref_fpaths.append(corr_seq_fpath)

    corr_seq_name = '_'.join([qutils.name_from_fpath(corr_seq_fpath),
                              qutils.correct_name(seq_name[:20])])

    if not qconfig.no_check and not only_acgtn(seq):
        logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.',
                       indent=' ')
        return None, None

    # Note that the unmodified sequence is what gets written.
    for out_fpath in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(out_fpath, [(corr_seq_name, seq)], 'a')

    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
    return corr_seq_name, corr_seq_fpath
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template):
    """Split every assembly into per-reference FASTA files plus a not-aligned remainder.

    For each assembly the file ``alignments_fpath_template % asm.name`` is
    read; on every non-blank line the first field is a reference name and the
    remaining fields are contig names.  Listed contigs are appended to
    ``<asm.name>_to_<reference name[:40]>.fasta`` in ``corrected_dirpath`` and
    an ``Assembly`` for that file is appended to the reference's list.
    Contigs named on no line are written to
    ``<asm.name>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)``.

    Raises ``KeyError`` if a line names a reference that is not derived from
    ``ref_fpaths``.
    """
    not_aligned_assemblies = []
    # array of assemblies for each reference
    assemblies_by_ref = dict((qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths)

    for asm in assemblies:
        not_aligned_fname = asm.name + '_not_aligned_anywhere.fasta'
        not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
        contigs = {}
        aligned_contig_names = set()

        # 'with' closes the alignments file; it used to be left open.
        with open(alignments_fpath_template % asm.name) as alignments_file:
            for line in alignments_file:
                values = line.split()
                if not values:
                    # A blank line carries no reference name.
                    continue
                ref_name = values[0]
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, asm.name + '_to_' + ref_name[:40] + '.fasta')

                for (cont_name, seq) in fastaparser.read_fasta(asm.fpath):
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names:
                        # Collecting all aligned contigs names in order to
                        # further extract not-aligned
                        aligned_contig_names.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, asm.label)
                assemblies_by_ref[ref_name].append(ref_asm)

        # Extraction of not aligned contigs
        all_contigs_names = set(contigs.keys())
        not_aligned_contigs_names = all_contigs_names - aligned_contig_names
        fastaparser.write_fasta(not_aligned_fpath,
                                [(name, contigs[name]) for name in not_aligned_contigs_names])

        not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
        not_aligned_assemblies.append(not_aligned_asm)

    return assemblies_by_ref, not_aligned_assemblies
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Choose the corrected path for one assembly and optionally break its scaffolds.

    When ``qconfig.scaffolds`` is set, every sequence of ``contigs_fpath`` is
    cut at each run of ``N`` that is at least ``qconfig.Ns_break_threshold``
    long.  The pieces are named ``<first word of the name>_<k>`` and, if at
    least one cut was made, written to ``<corrected name>_broken<ext>``.

    Returns:
        ``(corr_fpaths, broken_scaffolds, logs)`` where ``corr_fpaths`` is
        ``(contigs_fpath, corr_fpath)``, ``broken_scaffolds`` is ``None`` or a
        pair holding the broken-scaffolds path twice, and ``logs`` is the list
        of messages produced here.
    """
    broken_scaffolds = None

    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    index_str = qutils.index_to_str(file_counter, force=(len(labels) > 1))

    logs = [' ' + index_str + '%s ==> %s' % (contigs_fpath, label)]

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(' ' + index_str + ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0
        # Counted explicitly so that an empty input gives 0 scaffolds; the
        # enumerate index left it looking like one scaffold.
        scaffolds_total = 0

        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_total += 1
            scaffold_id = name.split()[0]
            pieces = 1
            piece_start = 0
            search_from = 0
            while search_from < len(seq):
                start = seq.find('N', search_from)
                if start == -1:
                    break
                end = start + 1
                while end != len(seq) and seq[end] == 'N':
                    end += 1

                # seq[end] is either past the end or not an N, so skip it.
                search_from = end + 1
                if end - start >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (scaffold_id + "_" + str(pieces), seq[piece_start:start]))
                    pieces += 1
                    piece_start = end

            broken_scaffolds_fasta.append(
                (scaffold_id + "_" + str(pieces), seq[piece_start:]))
            contigs_counter += pieces

        if scaffolds_total != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(" " + index_str +
                        " %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total, label, contigs_counter, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(" " + index_str +
                        " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    """Produce corrected copies of all assemblies and, optionally, their broken scaffolds.

    Every input file gets a unique corrected path built from its label, which
    is recorded in ``qconfig.assembly_labels_by_fpath``.  When
    ``qconfig.scaffolds`` is set, each sequence is also cut at every run of
    ``N`` that is at least ``qconfig.Ns_break_threshold`` long, and the pieces
    are written to ``<corrected name>_broken<ext>`` under the label
    ``<label> broken``.  Files are passed to ``_handle_fasta``; only those for
    which it returns a true value are included in the result.

    Returns the list of corrected paths, the broken version of an assembly
    preceding the assembly itself.
    """
    ## removing from contigs' names special characters because:
    ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla" (it interprets as a contig's name only the first word of caption and gets ambiguous contigs names)
    corrected_contigs_fpaths = []

    for file_index, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[file_index]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info(' %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds splitted version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info(" breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []
            contigs_counter = 0
            # The file index, the scaffold index and the position inside a
            # sequence all used to share the name 'i', so the scaffold count
            # in the message below was really a character offset.
            scaffolds_total = 0

            for name, seq in fastaparser.read_fasta(contigs_fpath):
                scaffolds_total += 1
                scaffold_id = name.split()[0]
                cur_contig_number = 1
                cur_contig_start = 0
                search_from = 0
                while search_from < len(seq):
                    start = seq.find("N", search_from)
                    if start == -1:
                        break
                    end = start + 1
                    while end != len(seq) and seq[end] == 'N':
                        end += 1

                    # seq[end] is either past the end or not an N, so skip it.
                    search_from = end + 1
                    if end - start >= qconfig.Ns_break_threshold:
                        broken_scaffolds_fasta.append(
                            (scaffold_id + "_" + str(cur_contig_number), seq[cur_contig_start:start]))
                        cur_contig_number += 1
                        cur_contig_start = end

                broken_scaffolds_fasta.append(
                    (scaffold_id + "_" + str(cur_contig_number), seq[cur_contig_start:]))
                contigs_counter += cur_contig_number

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info(" %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))
            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Pick the corrected path for one assembly and, if requested, split its scaffolds.

    With ``qconfig.scaffolds`` set, each sequence read from ``contigs_fpath``
    is cut wherever a run of ``N`` reaches ``qconfig.Ns_break_threshold``
    characters.  Pieces are named ``<first word of the name>_<k>``.  The
    pieces are written to ``<corrected name>_broken<ext>`` only when at least
    one cut was made; otherwise a warning is added to the log.

    Returns:
        ``(corr_fpaths, broken_scaffolds, logs)``: ``corr_fpaths`` is
        ``(contigs_fpath, corr_fpath)``; ``broken_scaffolds`` is ``None`` or a
        pair holding the broken-scaffolds path twice; ``logs`` collects the
        messages produced here.
    """
    broken_scaffolds = None

    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    prefix = qutils.index_to_str(file_counter, force=(len(labels) > 1))

    logs = []
    logs.append(' ' + prefix + '%s ==> %s' % (contigs_fpath, label))

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(' ' + prefix + ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0
        # An explicit count keeps an empty input at 0 scaffolds; relying on
        # the enumerate index reported one scaffold broken into 0 contigs.
        scaffold_count = 0

        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffold_count += 1
            base_name = name.split()[0]
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
            scan_pos = 0
            seq_len = len(seq)
            while scan_pos < seq_len:
                gap_start = seq.find('N', scan_pos)
                if gap_start == -1:
                    break
                gap_end = gap_start + 1
                while gap_end != seq_len and seq[gap_end] == 'N':
                    gap_end += 1

                # seq[gap_end] is past the end or not an N, so it can be skipped.
                scan_pos = gap_end + 1
                if gap_end - gap_start >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (base_name + "_" + str(total_contigs_for_the_scaf),
                         seq[cur_contig_start:gap_start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = gap_end

            broken_scaffolds_fasta.append(
                (base_name + "_" + str(total_contigs_for_the_scaf), seq[cur_contig_start:]))
            contigs_counter += total_contigs_for_the_scaf

        if scaffold_count != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(" " + prefix +
                        " %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffold_count, label, contigs_counter, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(" " + prefix +
                        " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a cleaned copy of a FASTA file and split an oversized reference into parts.

    Sequences shorter than ``min_contig`` are dropped unless ``is_reference``
    is true.  Names go through ``qutils.correct_name``.  Unless
    ``qconfig.no_check`` is set, sequences are upper-cased, the codes M, K, R,
    Y, W, S, V, B, H and D become N, and any other non-ACGTN character makes
    the function log a warning and return ``False`` before anything is
    written.

    For a reference longer in total than ``qconfig.MAX_REFERENCE_FILE_LENGTH``
    the chromosomes are additionally distributed over
    ``split_ref/part_<n><ext>`` files next to ``corrected_fpath`` and the part
    paths are stored in ``qconfig.splitted_ref``.  Chromosomes longer than
    ``qconfig.MAX_REFERENCE_LENGTH`` are skipped with a warning.

    Returns ``True`` on success and ``False`` when the file is skipped.
    """
    modified_fasta_entries = []
    for header, raw_seq in fastaparser.read_fasta(original_fpath):
        if not (len(raw_seq) >= min_contig or is_reference):
            continue
        corr_name = qutils.correct_name(header)
        if qconfig.no_check:
            corr_seq = raw_seq
        else:
            # seq to uppercase, because later only uppercase letters are looked at
            # correcting alternatives (gage can't work with alternatives)
            dic = {
                'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
                'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
            }
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda hit: dic[hit.group()], raw_seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if re.search(r'[^ACGTN]', corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                               indent=' ')
                return False
        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if not is_reference:
        return True
    ref_len = sum(len(entry[1]) for entry in modified_fasta_entries)
    if not ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    _, fasta_ext = os.path.splitext(corrected_fpath)
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)

    def part_fpath(part_num):
        return os.path.join(split_ref_dirpath, "part_%d" % part_num) + fasta_ext

    max_len = min(ref_len / qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    cur_part_len = 0
    cur_part_num = 1
    cur_part_fpath = part_fpath(cur_part_num)

    for chr_name, chr_seq in modified_fasta_entries:
        cur_chr_len = len(chr_seq)
        if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        cur_part_len += cur_chr_len
        # Start a new part when this chromosome overflows the current one,
        # unless the current part holds nothing but this chromosome.
        if cur_part_len > max_len and cur_part_len != cur_chr_len:
            qconfig.splitted_ref.append(cur_part_fpath)
            cur_part_len = cur_chr_len
            cur_part_num += 1
            cur_part_fpath = part_fpath(cur_part_num)
        fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')

    if cur_part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)

    if len(qconfig.splitted_ref) == 0:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Clean a FASTA file into ``corrected_fpath``; split a too-long reference into parts.

    Entries shorter than ``min_contig`` are kept only when ``is_reference`` is
    true.  Every kept name is passed through ``qutils.correct_name``.  When
    ``qconfig.no_check`` is false the sequence is upper-cased, the codes M, K,
    R, Y, W, S, V, B, H and D are replaced by N, and the whole file is
    rejected (warning logged, ``False`` returned, nothing written) if any
    other non-ACGTN character is found.

    If ``is_reference`` is true and the total length is above
    ``qconfig.MAX_REFERENCE_FILE_LENGTH``, ``qconfig.splitted_ref`` is reset
    and filled with the paths of ``split_ref/part_<n><ext>`` files, created
    next to ``corrected_fpath``, over which the chromosomes are spread.
    Chromosomes above ``qconfig.MAX_REFERENCE_LENGTH`` are left out with a
    warning; if none remains the function returns ``False``.

    Returns ``True`` otherwise.
    """
    def checked(sequence):
        # Returns the normalised sequence, or None if it has to be rejected.
        # seq to uppercase, because later only uppercase letters are looked at
        upper = sequence.upper()
        # correcting alternatives (gage can't work with alternatives)
        dic = {
            'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
            'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
        }
        pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
        normalised = re.sub(pat, lambda found: dic[found.group()], upper)
        # make sure that only A, C, G, T or N are in the sequence
        if re.compile(r'[^ACGTN]').search(normalised):
            return None
        return normalised

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        keep = (len(seq) >= min_contig) or is_reference
        if not keep:
            continue
        corr_name = qutils.correct_name(first_line)
        corr_seq = seq
        if not qconfig.no_check:
            corr_seq = checked(seq)
            if corr_seq is None:
                logger.warning(
                    'Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                    indent=' ')
                return False
        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    needs_split = False
    if is_reference:
        ref_len = sum(len(chr_seq) for (_name, chr_seq) in modified_fasta_entries)
        needs_split = ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH
    if not needs_split:
        return True

    # important for MetaQUAST which runs QUAST multiple times
    qconfig.splitted_ref = []
    fasta_ext = os.path.splitext(corrected_fpath)[1]
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)

    max_len = min(ref_len / qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    cur_part_len = 0
    cur_part_num = 1
    cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext

    for entry in modified_fasta_entries:
        chr_name, chr_seq = entry
        cur_chr_len = len(chr_seq)
        if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        cur_part_len += cur_chr_len
        part_is_full = cur_part_len > max_len and cur_part_len != cur_chr_len
        if part_is_full:
            # Close the current part and open the next one with this chromosome.
            qconfig.splitted_ref.append(cur_part_fpath)
            cur_part_len = cur_chr_len
            cur_part_num += 1
            cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext
        fastaparser.write_fasta(cur_part_fpath, [entry], mode='a')

    if cur_part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)

    if len(qconfig.splitted_ref) == 0:
        logger.warning(
            "Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True