def _correct_refrences(ref_fpaths, corrected_dirpath):
    """Write every sequence of every reference file to its own FASTA file.

    Each sequence read from the files in ``ref_fpaths`` is appended both to
    an individual file inside ``corrected_dirpath`` and to the combined
    reference file (``COMBINED_REF_FNAME`` inside the same directory).

    Returns a tuple ``(corrected_ref_fpaths, common_ref_fasta_ext,
    combined_ref_fpath)``; ``common_ref_fasta_ext`` is the FASTA extension of
    the last processed reference file ('' when ``ref_fpaths`` is empty).
    """
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    corrected_ref_fpaths = []
    common_ref_fasta_ext = ''

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        # Files holding several sequences get a per-sequence suffix built
        # from the first 20 characters of the sequence name.
        fname_parts = [ref_name]
        if total_references > 1:
            fname_parts.append('_' + qutils.correct_name(seq_name[:20]))
        fname_parts.append(ref_fasta_ext)

        target_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, ''.join(fname_parts)))
        target_name = qutils.name_from_fpath(target_fpath)
        corrected_ref_fpaths.append(target_fpath)

        # Same record goes to the individual file and to the combined one.
        fastaparser.write_fasta(target_fpath, [(target_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(target_name, seq)], 'a')
        return target_name

    for ref_fpath in ref_fpaths:
        # First pass over the file only counts its sequences.
        total_references = sum(1 for _ in fastaparser.read_fasta(ref_fpath))
        has_many_seqs = total_references > 1
        if has_many_seqs:
            logger.info(' ' + ref_fpath + ':')

        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(
            os.path.basename(ref_fpath))
        common_ref_fasta_ext = ref_fasta_ext

        # Second pass writes the sequences out.
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            corr_seq_name = correct_seq(seq_name, seq, ref_name,
                                        ref_fasta_ext, total_references)
            if has_many_seqs:
                logger.info(' ' + corr_seq_name + '\n')
            else:
                logger.info(' ' + ref_fpath + ' ==> ' + corr_seq_name + '')

    logger.info(' All references combined in ' + COMBINED_REF_FNAME)
    return corrected_ref_fpaths, common_ref_fasta_ext, combined_ref_fpath
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath):
    """Run ``gmhmmp`` separately for each GC-content bin of the input FASTA.

    Sequences are grouped into temporary FASTA files named after their GC
    value (clamped to [30, 70] and rounded down to a multiple of 5).  Each
    group is processed by ``gmhmm_p`` with the matching heuristic model
    ``heuristic_mod/heu_11_<gc>.mod`` from ``tool_dirpath``.

    Tool output is appended to ``err_fpath``.  The temporary directory
    (created inside ``tmp_dirpath``) is removed unless ``qconfig.debug`` is
    set.

    Returns the list of genes collected via ``parse_gmhmm_out`` from every
    successful run.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    heu_dirpath = os.path.join(tool_dirpath, 'heuristic_mod')
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)

    for ind, seq in read_fasta(fasta_fpath):
        gc = min(70, max(30, gc_content(seq)))
        gc = gc - gc % 5  # rounds to a divisible by 5
        current_fname = str(gc) + '.fasta'
        current_fpath = os.path.join(tmp_dirpath, current_fname)
        # Append: several sequences may fall into the same GC bin.
        with open(current_fpath, 'a') as current_file:
            current_file.write('>' + ind + '\n' + seq + '\n')

    genes = []
    # The builtin next() works on Python 2.6+ and 3.x, unlike the
    # Python-2-only generator method ``.next()``.
    _, _, fnames = next(os.walk(tmp_dirpath))
    for fname in fnames:
        sub_fasta_fpath = os.path.join(tmp_dirpath, fname)
        out_fpath = sub_fasta_fpath + '.gmhmm'
        # The file name without extension is the GC bin of the group.
        gc_str, ext = os.path.splitext(fname)
        heu_fpath = os.path.join(heu_dirpath, 'heu_11_' + gc_str + '.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(tool_exec_fpath, sub_fasta_fpath, heu_fpath,
                         out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    return genes
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath,
       arcs=False, similar=False, coverage_hist=None):
    """Build the alignment plot for the given assemblies against a reference.

    Reference chromosomes (named by the first word of each FASTA header) are
    laid out on a "virtual genome" in order of decreasing length, separated
    by a gap of 10% of the summed chromosome length.  The contig report of
    every assembly is parsed against that layout and the result is handed to
    ``draw_alignment_plot``.

    Returns the value of ``draw_alignment_plot``, or None as soon as one
    contig report cannot be parsed (``parse_nucmer_contig_report`` returned
    None).
    """
    chromosome_lengths = dict()
    genome_length = 0
    for header, sequence in fastaparser.read_fasta(ref_fpath):
        chromosome = header.split()[0]
        seq_length = len(sequence)
        genome_length += seq_length
        chromosome_lengths[chromosome] = seq_length

    virtual_genome_shift = int(0.1 * genome_length)
    sorted_ref_names = sorted(chromosome_lengths, key=chromosome_lengths.get,
                              reverse=True)
    sorted_ref_lengths = sorted(chromosome_lengths.values(), reverse=True)

    # Start offsets of the chromosomes on the virtual genome; the final
    # element lies one gap past the end of the last chromosome.
    cumulative_ref_lengths = [0]
    for ref_length in sorted_ref_lengths:
        next_offset = cumulative_ref_lengths[-1] + virtual_genome_shift + ref_length
        cumulative_ref_lengths.append(next_offset)
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = contig_report_fpath_pattern % qutils.name_from_fpath(contigs_fpath)
        blocks = parse_nucmer_contig_report(report_fpath, sorted_ref_names,
                                            cumulative_ref_lengths)
        if blocks is None:
            return None
        lists_of_aligned_blocks.append(blocks)

    return draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names,
        sorted_ref_lengths, virtual_genome_shift, output_dirpath,
        lists_of_aligned_blocks, arcs, similar, coverage_hist)
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run ``glimmerhmm`` on every contig of a FASTA file and summarise genes.

    Each contig is written to its own file in a temporary directory (created
    inside ``tmp_dir``) and processed separately; tool output is appended to
    ``err_path``.  GFF files of successful runs are merged into
    ``out_fpath + '_genes.gff'``.  When ``OUTPUT_FASTA`` is set, the unique
    gene sequences are also written to ``out_fpath + '_genes.fasta'``.

    Returns ``(out_gff_path, number_of_unique_genes, total_genes, cnt)``,
    where ``cnt[i]`` counts unique genes with ``end - start >
    gene_lengths[i]``.  Returns ``(None, None, None, None)`` when no run
    succeeded.
    """
    def run(contig_path, tmp_path):
        # stdout and stderr of the tool both go to the shared error log.
        with open(err_path, 'a') as err_file:
            return_code = qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file, stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')
            return return_code

    def remove_tmp():
        # Temporary files are kept only in debug mode.
        if not qconfig.debug:
            shutil.rmtree(base_dir)

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        # Contig names become file names, so replace path-unfriendly chars.
        ind = re.sub('[/. ]', '_', ind)
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Previously the temporary directory was left behind on this path.
        remove_tmp()
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        if gene_seq not in unique:
            unique.add(gene_seq)
            genes.append((gene_id, gene_seq))
            for idx, gene_length in enumerate(gene_lengths):
                # A bool is added as 0/1.
                cnt[idx] += end - start > gene_length

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)

    remove_tmp()

    return out_gff_path, len(unique), total, cnt
def _correct_refrences(ref_fpaths, corrected_dirpath):
    """Write every sequence of every reference file to its own FASTA file.

    Each sequence read from the files in ``ref_fpaths`` is appended both to
    an individual file inside ``corrected_dirpath`` and to the combined
    reference file (``COMBINED_REF_FNAME`` inside the same directory).

    Returns a tuple ``(corrected_ref_fpaths, common_ref_fasta_ext,
    combined_ref_fpath)``; ``common_ref_fasta_ext`` is the FASTA extension of
    the last processed reference file ('' when ``ref_fpaths`` is empty).
    """
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    corrected_ref_fpaths = []
    common_ref_fasta_ext = ''

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
        # Files holding several sequences get a per-sequence suffix built
        # from the first 20 characters of the sequence name.
        fname_parts = [ref_name]
        if total_references > 1:
            fname_parts.append('_' + qutils.correct_name(seq_name[:20]))
        fname_parts.append(ref_fasta_ext)

        target_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, ''.join(fname_parts)))
        target_name = qutils.name_from_fpath(target_fpath)
        corrected_ref_fpaths.append(target_fpath)

        # Same record goes to the individual file and to the combined one.
        fastaparser.write_fasta(target_fpath, [(target_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(target_name, seq)], 'a')
        return target_name

    for ref_fpath in ref_fpaths:
        # First pass over the file only counts its sequences.
        total_references = sum(1 for _ in fastaparser.read_fasta(ref_fpath))
        has_many_seqs = total_references > 1
        if has_many_seqs:
            logger.info(' ' + ref_fpath + ':')

        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(
            os.path.basename(ref_fpath))
        common_ref_fasta_ext = ref_fasta_ext

        # Second pass writes the sequences out.
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            corr_seq_name = correct_seq(seq_name, seq, ref_name,
                                        ref_fasta_ext, total_references)
            if has_many_seqs:
                logger.info(' ' + corr_seq_name + '\n')
            else:
                logger.info(' ' + ref_fpath + ' ==> ' + corr_seq_name + '')

    logger.info(' All references combined in ' + COMBINED_REF_FNAME)
    return corrected_ref_fpaths, common_ref_fasta_ext, combined_ref_fpath
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run ``glimmerhmm`` on every contig of a FASTA file and summarise genes.

    Each contig is written to its own file in a temporary directory (created
    inside ``tmp_dir``) and processed separately; tool output is appended to
    ``err_path``.  GFF files of successful runs are merged into
    ``out_fpath + '_genes.gff'``.  When ``OUTPUT_FASTA`` is set, the unique
    gene sequences are also written to ``out_fpath + '_genes.fasta'``.

    Returns ``(out_gff_path, number_of_unique_genes, total_genes, cnt)``,
    where ``cnt[i]`` counts unique genes with ``end - start >
    gene_lengths[i]``.  When no run succeeded, an error is logged and
    ``(None, None, None, None)`` is returned.
    """
    def run(contig_path, tmp_path):
        # stdout and stderr of the tool both go to the shared error log.
        with open(err_path, 'a') as err_file:
            return_code = qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file, stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')
            return return_code

    def remove_tmp():
        # Temporary files are kept only in debug mode.
        if not qconfig.debug:
            shutil.rmtree(base_dir)

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')

    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for ind, seq in read_fasta(fasta_fpath):
        contig_path = os.path.join(base_dir, ind + '.fasta')
        gff_path = os.path.join(base_dir, ind + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Format the label into the first part only.  The original applied
        # '%' to the optional hint (which has no placeholder) because '%'
        # binds tighter than '+', raising TypeError instead of logging.
        message = ('Glimmer failed running Glimmer for %s. '
                   % qutils.label_from_fpath(fasta_fpath))
        if not qconfig.debug:
            message += 'Run with the --debug option to see the command line.'
        logger.error(message)
        # Previously the temporary directory was left behind on this path.
        remove_tmp()
        return None, None, None, None

    out_gff_path = merge_gffs(gffs, out_fpath + '_genes.gff')
    unique, total = set(), 0
    genes = []
    cnt = [0] * len(gene_lengths)
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        gene_seq = contigs[contig][start:end + 1]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        if gene_seq not in unique:
            unique.add(gene_seq)
            genes.append((gene_id, gene_seq))
            for idx, gene_length in enumerate(gene_lengths):
                # A bool is added as 0/1.
                cnt[idx] += end - start > gene_length

    if OUTPUT_FASTA:
        out_fasta_path = out_fpath + '_genes.fasta'
        write_fasta(out_fasta_path, genes)

    remove_tmp()

    return out_gff_path, len(unique), total, cnt
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath,
       cov_fpath=None, arcs=False, similar=False, coverage_hist=None):
    """Build the alignment plot (and optionally the HTML viewer data).

    Reference chromosomes (named by the first word of each FASTA header) are
    laid out on a "virtual genome" in order of decreasing length, separated
    by a fixed gap of 100.  The contig report of every assembly is parsed
    against that layout, every aligned block is labelled with the assembly
    name, and the result is handed to ``draw_alignment_plot``.  When that
    call yields assemblies and ``qconfig.create_contig_alignment_html`` is
    set, ``js_data_gen`` is called as well.

    Returns the plot path from ``draw_alignment_plot``, or None as soon as
    one contig report cannot be parsed.
    """
    make_output_dir(output_dirpath)

    chr_names = []
    reference_chromosomes = dict()
    for header, sequence in fastaparser.read_fasta(ref_fpath):
        chromosome = header.split()[0]
        chr_names.append(chromosome)
        reference_chromosomes[chromosome] = len(sequence)

    virtual_genome_shift = 100
    sorted_ref_names = sorted(reference_chromosomes,
                              key=reference_chromosomes.get, reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Start offsets of the chromosomes on the virtual genome; the final
    # element lies one gap past the end of the last chromosome.
    cumulative_ref_lengths = [0]
    for ref_length in sorted_ref_lengths:
        next_offset = cumulative_ref_lengths[-1] + virtual_genome_shift + ref_length
        cumulative_ref_lengths.append(next_offset)
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = (contig_report_fpath_pattern
                        % qutils.label_from_fpath_for_fname(contigs_fpath))
        blocks = parse_nucmer_contig_report(report_fpath, sorted_ref_names,
                                            cumulative_ref_lengths)
        if blocks is None:
            return None
        for block in blocks:
            block.label = qutils.name_from_fpath(contigs_fpath)
        lists_of_aligned_blocks.append(blocks)

    plot_fpath, assemblies = draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names,
        sorted_ref_lengths, virtual_genome_shift, output_dirpath,
        lists_of_aligned_blocks, arcs, similar, coverage_hist)

    if assemblies and qconfig.create_contig_alignment_html:
        js_data_gen(assemblies, contigs_fpaths, chr_names,
                    reference_chromosomes, output_dirpath, cov_fpath,
                    ref_fpath, virtual_genome_size)

    return plot_fpath
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template):
    """Split the contigs of every assembly by the reference they align to.

    For each assembly the alignments TSV (``alignments_fpath_template %
    asm.name``) is read; every line holds a reference name followed by the
    names of the contigs aligned to it.  The listed contigs are appended to
    ``<asm.name>_to_<ref_name[:40]>.fasta`` in ``corrected_dirpath``, and the
    contigs listed for no reference are written to
    ``<asm.name>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)``: a dict mapping
    reference name to a list of ``Assembly`` objects, and a list with one
    ``Assembly`` of unaligned contigs per input assembly.
    """
    not_aligned_assemblies = []
    # array of assemblies for each reference
    assemblies_by_ref = dict(
        (qutils.name_from_fpath(fpath), []) for fpath in ref_fpaths)

    for asm in assemblies:
        not_aligned_fpath = os.path.join(
            corrected_dirpath, asm.name + '_not_aligned_anywhere.fasta')
        seq_by_contig = {}
        aligned_names = set()

        with open(alignments_fpath_template % asm.name) as alignments_tsv_f:
            for line in alignments_tsv_f:
                fields = line.split()
                ref_name = fields[0]
                ref_contigs_names = fields[1:]
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath,
                    asm.name + '_to_' + ref_name[:40] + '.fasta')

                for cont_name, seq in fastaparser.read_fasta(asm.fpath):
                    if cont_name not in seq_by_contig:
                        seq_by_contig[cont_name] = seq
                    if cont_name not in ref_contigs_names:
                        continue
                    # Remember aligned names so that the unaligned ones can
                    # be extracted afterwards.
                    aligned_names.add(cont_name)
                    fastaparser.write_fasta(ref_contigs_fpath,
                                            [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, asm.label)
                assemblies_by_ref[ref_name].append(ref_asm)

        # Extract the contigs that were not aligned to any reference.
        leftover_names = set(seq_by_contig.keys()) - aligned_names
        leftover_records = [(name, seq_by_contig[name]) for name in leftover_names]
        fastaparser.write_fasta(not_aligned_fpath, leftover_records)

        not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
        not_aligned_assemblies.append(not_aligned_asm)

    return assemblies_by_ref, not_aligned_assemblies
def hasScaffolds(self, assembler):
    """Tell whether the given assembler's scaffolds contain any 'N'.

    The answer is memoised in ``self._has_scaffolds_cache``.  Only the
    "spades" assembler is inspected (its scaffolds file is taken from
    ``self._info[assembler]['scaffolds']``); every other assembler yields
    False.
    """
    cached = self._has_scaffolds_cache.get(assembler)
    if cached is not None:
        return cached

    found = False
    if assembler == "spades":
        scaffolds_path = self._info[assembler]['scaffolds']
        # Stops reading at the first sequence that contains an 'N'.
        found = any('N' in sequence
                    for _, sequence in read_fasta(scaffolds_path))

    self._has_scaffolds_cache[assembler] = found
    return found
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a cleaned-up copy of a FASTA file; split oversized references.

    Sequences shorter than ``min_contig`` are dropped unless
    ``is_reference`` is set.  Names go through ``qutils.correct_name``,
    sequences are upper-cased and the ambiguity codes M, K, R, Y, W, S, V,
    B, H, D are replaced by 'N'.  The result is written to
    ``corrected_fpath``.

    For a reference longer than ``qconfig.MAX_REFERENCE_LENGTH`` every
    chromosome not exceeding that limit is also written to its own file in
    a ``splitted_ref`` directory next to ``corrected_fpath`` and its path is
    appended to ``qconfig.splitted_ref``.

    Returns False when a sequence contains characters other than A, C, G,
    T, N after correction (nothing is written in that case), or when a
    reference had to be split and ``qconfig.splitted_ref`` is empty
    afterwards; True otherwise.
    """
    # correcting alternatives (gage can't work with alternatives);
    # built once instead of once per sequence
    dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
           'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
    alternatives_re = re.compile("(%s)" % "|".join(map(re.escape, dic.keys())))
    non_acgtn_re = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if (len(seq) >= min_contig) or is_reference:
            corr_name = qutils.correct_name(first_line)

            # seq to uppercase, because we later looking only uppercase letters
            corr_seq = seq.upper()
            corr_seq = alternatives_re.sub(lambda m: dic[m.group()], corr_seq)

            # make sure that only A, C, G, T or N are in the sequence
            if non_acgtn_re.search(corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.', indent=' ')
                return False

            modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_LENGTH:
            _, fasta_ext = os.path.splitext(corrected_fpath)
            splitted_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'splitted_ref')
            # os.makedirs raises when the directory is already there, e.g.
            # on a re-run into the same output directory.
            if not os.path.isdir(splitted_ref_dirpath):
                os.makedirs(splitted_ref_dirpath)

            for i, (chr_name, chr_seq) in enumerate(modified_fasta_entries):
                if len(chr_seq) > qconfig.MAX_REFERENCE_LENGTH:
                    logger.warning("Skipping chromosome " + chr_name + " because it length is greater than " +
                                   str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
                    continue

                # Files are numbered by position in the reference, so
                # skipped chromosomes leave gaps in the numbering.
                splitted_ref_fpath = os.path.join(splitted_ref_dirpath, "chr_" + str(i + 1)) + fasta_ext
                qconfig.splitted_ref.append(splitted_ref_fpath)
                fastaparser.write_fasta(splitted_ref_fpath, [(chr_name, chr_seq)])

            if len(qconfig.splitted_ref) == 0:
                logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
                return False
    return True
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template):
    """Split the contigs of every assembly by the reference they align to.

    For each assembly the alignments TSV (``alignments_fpath_template %
    asm.name``) is read; every line holds a reference name followed by the
    names of the contigs aligned to it.  The listed contigs are appended to
    ``<asm.name>_to_<ref_name[:40]>.fasta`` in ``corrected_dirpath``, and the
    contigs listed for no reference are written to
    ``<asm.name>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)``: a dict mapping
    reference name to a list of ``Assembly`` objects, and a list with one
    ``Assembly`` of unaligned contigs per input assembly.
    """
    not_aligned_assemblies = []
    # array of assemblies for each reference
    assemblies_by_ref = dict([(qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths])

    for asm in assemblies:
        not_aligned_fname = asm.name + '_not_aligned_anywhere.fasta'
        not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)
        contigs = {}
        aligned_contig_names = set()

        # Context manager: the alignments file was previously opened inline
        # in the ``for`` statement and never closed explicitly.
        with open(alignments_fpath_template % asm.name) as alignments_tsv_f:
            for line in alignments_tsv_f:
                values = line.split()
                ref_name = values[0]
                # A set gives constant-time membership tests in the loop.
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, asm.name + '_to_' + ref_name[:40] + '.fasta')

                for (cont_name, seq) in fastaparser.read_fasta(asm.fpath):
                    if cont_name not in contigs:
                        contigs[cont_name] = seq

                    if cont_name in ref_contigs_names:
                        # Collecting all aligned contigs names in order to
                        # further extract not-aligned
                        aligned_contig_names.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, asm.label)
                assemblies_by_ref[ref_name].append(ref_asm)

        # Extraction of not aligned contigs
        all_contigs_names = set(contigs.keys())
        not_aligned_contigs_names = all_contigs_names - aligned_contig_names
        fastaparser.write_fasta(
            not_aligned_fpath,
            [(name, contigs[name]) for name in not_aligned_contigs_names])

        not_aligned_asm = Assembly(not_aligned_fpath, asm.label)
        not_aligned_assemblies.append(not_aligned_asm)

    return assemblies_by_ref, not_aligned_assemblies
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath,
       arcs=False, similar=False, coverage_hist=None):
    """Build the alignment plot for the given assemblies against a reference.

    Reference chromosomes (named by the first word of each FASTA header) are
    laid out on a "virtual genome" in order of decreasing length, separated
    by a gap of 10% of the summed chromosome length.  The contig report of
    every assembly is parsed against that layout and the result is handed to
    ``draw_alignment_plot``.

    Returns the value of ``draw_alignment_plot``, or None as soon as one
    contig report cannot be parsed (``parse_nucmer_contig_report`` returned
    None).
    """
    chromosome_lengths = dict()
    genome_length = 0
    for header, sequence in fastaparser.read_fasta(ref_fpath):
        chromosome = header.split()[0]
        seq_length = len(sequence)
        genome_length += seq_length
        chromosome_lengths[chromosome] = seq_length

    virtual_genome_shift = int(0.1 * genome_length)
    sorted_ref_names = sorted(chromosome_lengths, key=chromosome_lengths.get,
                              reverse=True)
    sorted_ref_lengths = sorted(chromosome_lengths.values(), reverse=True)

    # Start offsets of the chromosomes on the virtual genome; the final
    # element lies one gap past the end of the last chromosome.
    cumulative_ref_lengths = [0]
    for ref_length in sorted_ref_lengths:
        next_offset = cumulative_ref_lengths[-1] + virtual_genome_shift + ref_length
        cumulative_ref_lengths.append(next_offset)
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = contig_report_fpath_pattern % qutils.name_from_fpath(contigs_fpath)
        blocks = parse_nucmer_contig_report(report_fpath, sorted_ref_names,
                                            cumulative_ref_lengths)
        if blocks is None:
            return None
        lists_of_aligned_blocks.append(blocks)

    return draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names,
        sorted_ref_lengths, virtual_genome_shift, output_dirpath,
        lists_of_aligned_blocks, arcs, similar, coverage_hist)
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath,
       cov_fpath=None, arcs=False, similar=False, coverage_hist=None):
    """Build the alignment plot (and optionally the HTML viewer data).

    Reference chromosomes (named by the first word of each FASTA header) are
    laid out on a "virtual genome" in order of decreasing length, separated
    by a fixed gap of 100.  The contig report of every assembly is parsed
    against that layout, every aligned block is labelled with the assembly
    name, and the result is handed to ``draw_alignment_plot``.  When that
    call yields assemblies and ``qconfig.create_contig_alignment_html`` is
    set, ``js_data_gen`` is called as well.

    Returns the plot path from ``draw_alignment_plot``, or None as soon as
    one contig report cannot be parsed.
    """
    make_output_dir(output_dirpath)

    chr_names = []
    reference_chromosomes = dict()
    for header, sequence in fastaparser.read_fasta(ref_fpath):
        chromosome = header.split()[0]
        chr_names.append(chromosome)
        reference_chromosomes[chromosome] = len(sequence)

    virtual_genome_shift = 100
    sorted_ref_names = sorted(reference_chromosomes,
                              key=reference_chromosomes.get, reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Start offsets of the chromosomes on the virtual genome; the final
    # element lies one gap past the end of the last chromosome.
    cumulative_ref_lengths = [0]
    for ref_length in sorted_ref_lengths:
        next_offset = cumulative_ref_lengths[-1] + virtual_genome_shift + ref_length
        cumulative_ref_lengths.append(next_offset)
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = (contig_report_fpath_pattern
                        % qutils.label_from_fpath_for_fname(contigs_fpath))
        blocks = parse_nucmer_contig_report(report_fpath, sorted_ref_names,
                                            cumulative_ref_lengths)
        if blocks is None:
            return None
        for block in blocks:
            block.label = qutils.name_from_fpath(contigs_fpath)
        lists_of_aligned_blocks.append(blocks)

    plot_fpath, assemblies = draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names,
        sorted_ref_lengths, virtual_genome_shift, output_dirpath,
        lists_of_aligned_blocks, arcs, similar, coverage_hist)

    if assemblies and qconfig.create_contig_alignment_html:
        js_data_gen(assemblies, contigs_fpaths, chr_names,
                    reference_chromosomes, output_dirpath, cov_fpath,
                    ref_fpath, virtual_genome_size)

    return plot_fpath
REF_MARGINS = 300 REF_FNAME = "ref.fa" if len(sys.argv) != 4: print "Usage:", sys.argv[0], "reference pos1 pos2" sys.exit(0) pos1 = int(sys.argv[2]) pos2 = int(sys.argv[3]) if pos1 > pos2: pos = pos1 pos1 = pos2 pos2 = pos reference = fastaparser.read_fasta( sys.argv[1])[0][1] # Returns list of FASTA entries (in tuples: name, seq) if len(reference) < pos2: pos2 = len(reference) ref_file = open(REF_FNAME, 'w') ref_file.write(">reference\n") ref_file.write(reference[max(0, pos1 - 1 - REF_MARGINS):min(len(reference), pos2 + REF_MARGINS)] + "\n") ref_file.close() misassembled_site = reference[pos1 - 1:pos2] kmers = set() i = pos1 - 1 while i + KMER_SIZE <= pos2:
def _correct_references(ref_fpaths, corrected_dirpath):
    """Copy reference files into ``corrected_dirpath`` with renamed sequences.

    Every reference file gets one corrected file; each of its sequences is
    appended there and to the combined reference (``COMBINED_REF_FNAME``)
    under the name ``<corrected file name>_<first 20 chars of the original
    name, corrected>``.  Unless ``qconfig.no_check`` is set, a file is
    abandoned at the first sequence that still contains non-ACGTN characters
    once ambiguity codes are mapped to 'N' (the sequences themselves are
    written unmodified).

    Reference names shared by several input files are replaced by
    ``get_label_from_par_dir_and_fname``.

    Returns ``(corrected_ref_fpaths, combined_ref_fpath,
    chromosomes_by_refs, ref_fpaths)`` where ``chromosomes_by_refs`` maps a
    reference name to a list of ``(sequence name, length)`` pairs.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    chromosomes_by_refs = {}

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        seq_fname = ref_name + ref_fasta_ext
        if total_references > 1:
            # Later sequences reuse the file created for the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        file_label = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name = file_label + '_' + qutils.correct_name(seq_name[:20])

        if not qconfig.no_check:
            # Validation only: the checked copy is not what gets written.
            dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
                   'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            checked_seq = re.sub(pat, lambda m: dic[m.group()], seq.upper())
            if re.compile(r'[^ACGTN]').search(checked_seq):
                logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.', indent=' ')
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = \
            qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))
        return corr_seq_name, corr_seq_fpath

    # Find reference names that occur for more than one input file.
    name_counts = {}
    for fpath in ref_fpaths:
        base_name, _ = qutils.splitext_for_fasta_file(os.path.basename(fpath))
        name_counts[base_name] = name_counts.get(base_name, 0) + 1
    dupl_ref_names = set(name for name, count in name_counts.items() if count > 1)

    for ref_fpath in ref_fpaths:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(
            os.path.basename(ref_fpath))
        if ref_name in dupl_ref_names:
            ref_name = get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []
        corr_seq_fpath = None
        total_references = 0
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            corr_seq_name, corr_seq_fpath = correct_seq(
                seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath) + '')

    logger.main_info(' All references combined in ' + COMBINED_REF_FNAME)
    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)") print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)") sys.exit() BREAK_SCAFFOLDS = False if len(sys.argv) == 4: BREAK_SCAFFOLDS = True N_NUMBER = None counter = 0 if BREAK_SCAFFOLDS: N_NUMBER = int(sys.argv[2]) sizes_of_Ns_regions = dict() new_fasta = [] for id, (name, seq) in enumerate(fastaparser.read_fasta(sys.argv[1])): i = 0 cur_contig_number = 1 cur_contig_start = 0 while (i < len(seq)) and (seq.find("N", i) != -1): start = seq.find("N", i) end = start + 1 while (end != len(seq)) and (seq[end] == 'N'): end += 1 i = end + 1 if BREAK_SCAFFOLDS and (end - start) >= N_NUMBER: new_fasta.append((name.split()[0] + "_" + str(cur_contig_number), seq[cur_contig_start:start])) cur_contig_number += 1 cur_contig_start = end
import sys
import os

sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '../'))
import libs
from libs import fastaparser

# Crop every entry of a FASTA file to the 1-based inclusive range
# START..END (END == -1 means "up to the end of the entry") and print the
# result to stdout; any fourth argument requests the reverse complement.
arg_count = len(sys.argv)
if arg_count <= 3 or arg_count >= 6:
    print("Returns [reverse-complement] sequence from START to END position from each entry of input fasta")
    print("Usage: " + sys.argv[0] + " <input fasta> <START> <END, -1 for the end> [any string -- optional parameter for reverse-complement]")
    sys.exit()

inp = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])
reverse = arg_count == 5

for entry_name, entry_seq in fastaparser.read_fasta(inp):
    entry_len = len(entry_seq)
    # Both bounds are clamped to the length of the entry.
    cur_start = min(start, entry_len)
    cur_end = entry_len if end == -1 else min(end, entry_len)
    print(">" + entry_name + "_cropped_" + str(cur_start) + "_" + str(cur_end))

    fragment = entry_seq[cur_start - 1:cur_end]
    if reverse:
        fragment = fastaparser.rev_comp(fragment)
    print(fragment)
# MAIN
if len(sys.argv) != 3:
    print("Usage: " + sys.argv[0] + " <input fasta> <contig id or file with list of contig ids>")
    sys.exit()

# The second argument is either a file with one contig id per line or a
# single contig id given directly.
if os.path.isfile(sys.argv[2]):
    # Context manager: the ids file used to be opened inline and was never
    # closed explicitly.
    with open(sys.argv[2]) as ids_file:
        list_of_ids = [line.strip() for line in ids_file]
else:
    list_of_ids = [sys.argv[2]]

origin_fasta = fastaparser.read_fasta(sys.argv[1])
dict_of_all_contigs = dict()
selected_contigs = []

# Index all sequences by their corrected name.
for (name, seq) in origin_fasta:
    corr_name = get_corr_name(name)
    dict_of_all_contigs[corr_name] = seq

for name in list_of_ids:
    corr_name = get_corr_name(name)
    if corr_name in dict_of_all_contigs:
        selected_contigs.append((name, dict_of_all_contigs[corr_name]))
    else:
        # Same text as the former Python-2-only ``print >> sys.stderr``.
        sys.stderr.write(" ".join(["Contig", str(name), "(cor name:",
                                   str(corr_name), ") not found!"]) + "\n")

for (name, seq) in selected_contigs:
    print('>' + name)
# Number of reference bases kept on each side of the requested site when
# writing the excerpt to REF_FNAME.
REF_MARGINS = 300
REF_FNAME = "ref.fa"

if len(sys.argv) != 4:
    # Single-argument print() behaves the same on Python 2 and 3.
    print("Usage: " + sys.argv[0] + " reference pos1 pos2")
    sys.exit(0)

pos1 = int(sys.argv[2])
pos2 = int(sys.argv[3])
if pos1 > pos2:
    # Make sure pos1 <= pos2.
    pos1, pos2 = pos2, pos1

reference = fastaparser.read_fasta(sys.argv[1])[0][1]  # Returns list of FASTA entries (in tuples: name, seq)
if len(reference) < pos2:
    pos2 = len(reference)

# Context manager: the file is closed even if a write fails.
with open(REF_FNAME, 'w') as ref_file:
    ref_file.write(">reference\n")
    ref_file.write(reference[max(0, pos1 - 1 - REF_MARGINS):min(len(reference), pos2 + REF_MARGINS)] + "\n")

# Positions are 1-based and inclusive on the command line.
misassembled_site = reference[pos1 - 1:pos2]

# Collect every k-mer that lies completely inside the site.
kmers = set()
i = pos1 - 1
while i + KMER_SIZE <= pos2:
    kmers.add(reference[i:i + KMER_SIZE])
    i += 1
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Choose the corrected path of one assembly and optionally break scaffolds.

    The corrected path is derived from ``labels[file_counter]`` and the
    FASTA extension of ``contigs_fpath``.  When ``qconfig.scaffolds`` is
    set, every sequence is additionally cut at each run of 'N' at least
    ``qconfig.Ns_break_threshold`` long, and the pieces are written to
    ``<corrected name>_broken<ext>`` in ``corrected_dirpath`` if at least one
    cut was made.

    Returns ``(corr_fpaths, broken_scaffolds, logs)``: the pair
    ``(contigs_fpath, corr_fpath)``, a pair holding the broken-scaffolds
    path twice (or None when nothing was broken or breaking is disabled),
    and a list of log messages.
    """
    broken_scaffolds = None

    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    index_str = qutils.index_to_str(file_counter, force=(len(labels) > 1))
    logs = [' ' + index_str + '%s ==> %s' % (contigs_fpath, label)]

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(' ' + index_str + ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0

        # Explicit count instead of the last enumerate() index, which would
        # make an empty input look like one scaffold.
        scaffolds_total = 0
        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_total += 1
            contig_prefix = name.split()[0] + "_"
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
            search_from = 0
            while search_from < len(seq):
                start = seq.find('N', search_from)
                if start == -1:
                    break
                # Extend to the end of this run of Ns.
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                # seq[end] is not an 'N', so the search resumes after it.
                search_from = end + 1
                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (contig_prefix + str(total_contigs_for_the_scaf),
                         seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

            # The piece after the last cut (the whole sequence if no cut).
            broken_scaffolds_fasta.append(
                (contig_prefix + str(total_contigs_for_the_scaf),
                 seq[cur_contig_start:]))
            contigs_counter += total_contigs_for_the_scaf

        if scaffolds_total != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(" " + index_str +
                        " %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total, label, contigs_counter, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(" " + index_str +
                        " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a cleaned-up copy of a FASTA file; split oversized references.

    Sequences shorter than ``min_contig`` are dropped unless
    ``is_reference`` is set.  Names go through ``qutils.correct_name``.
    Unless ``qconfig.no_check`` is set, sequences are upper-cased, the
    ambiguity codes M, K, R, Y, W, S, V, B, H, D become 'N', and any other
    non-ACGTN character aborts the function with False before anything is
    written.  The result is written to ``corrected_fpath``.

    A reference longer than ``qconfig.MAX_REFERENCE_FILE_LENGTH`` is also
    written in parts to a freshly created ``split_ref`` directory next to
    ``corrected_fpath``; the part paths replace ``qconfig.splitted_ref``.
    Chromosomes longer than ``qconfig.MAX_REFERENCE_LENGTH`` are skipped.

    Returns False on invalid characters or when a reference had to be split
    and no part was produced; True otherwise.
    """
    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if len(seq) < min_contig and not is_reference:
            continue
        corr_name = qutils.correct_name(first_line)
        if qconfig.no_check:
            corr_seq = seq
        else:
            # correcting alternatives (gage can't work with alternatives);
            # upper-case first, because only uppercase letters are checked
            dic = {'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
                   'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N'}
            pat = "(%s)" % "|".join(map(re.escape, dic.keys()))
            corr_seq = re.sub(pat, lambda m: dic[m.group()], seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if re.compile(r'[^ACGTN]').search(corr_seq):
                logger.warning('Skipping ' + original_fpath + ' because it contains non-ACGTN characters.', indent=' ')
                return False
        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if not is_reference:
        return True

    ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in modified_fasta_entries)
    if ref_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    _, fasta_ext = os.path.splitext(corrected_fpath)
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    # Start from an empty directory: parts are written in append mode.
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)

    max_len = min(ref_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    cur_part_len = 0
    cur_part_num = 1
    cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext

    for (chr_name, chr_seq) in modified_fasta_entries:
        cur_chr_len = len(chr_seq)
        if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        cur_part_len += cur_chr_len
        # Open a new part when this chromosome overflows the current one,
        # unless the current part is still empty.
        if cur_part_len > max_len and cur_part_len != cur_chr_len:
            qconfig.splitted_ref.append(cur_part_fpath)
            cur_part_len = cur_chr_len
            cur_part_num += 1
            cur_part_fpath = os.path.join(split_ref_dirpath, "part_%d" % cur_part_num) + fasta_ext
        fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')

    # Register the last, still open part.
    if cur_part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)

    if len(qconfig.splitted_ref) == 0:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Prepare one assembly for correction and optionally break its scaffolds.

    When ``qconfig.scaffolds`` is set, every sequence of ``contigs_fpath`` is
    cut at each run of ``N`` that is at least ``qconfig.Ns_break_threshold``
    long.  If that produced more pieces than there were scaffolds, the pieces
    are written to ``<corrected name>_broken<ext>`` in ``corrected_dirpath``.

    Args:
        file_counter: index of this assembly; also the index into ``labels``.
        contigs_fpath: path of the assembly FASTA file.
        corrected_dirpath: directory that receives the output files.
        labels: labels of all assemblies.

    Returns:
        A tuple ``(corr_fpaths, broken_scaffolds, logs)`` where ``corr_fpaths``
        is ``(contigs_fpath, corr_fpath)``, ``broken_scaffolds`` is either
        ``None`` or ``(broken_fpath, broken_fpath)``, and ``logs`` is the list
        of messages collected for the caller to emit.
    """
    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    index_str = qutils.index_to_str(file_counter, force=(len(labels) > 1))
    logs = [' ' + index_str + '%s ==> %s' % (contigs_fpath, label)]

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(' ' + index_str + ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0
        # Counted explicitly so that an empty input yields 0 scaffolds instead
        # of being reported as "1 scaffolds ... broken into 0 contigs".
        scaffolds_total = 0

        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_total += 1
            contig_prefix = name.split()[0] + "_"
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0

            start = seq.find('N')
            while start != -1:
                # extend [start, end) over the whole run of Ns
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (contig_prefix + str(total_contigs_for_the_scaf), seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

                # seq[end] is known not to be 'N', so resume right after it
                start = seq.find('N', end + 1)

            broken_scaffolds_fasta.append(
                (contig_prefix + str(total_contigs_for_the_scaf), seq[cur_contig_start:]))
            contigs_counter += total_contigs_for_the_scaf

        if contigs_counter != scaffolds_total:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(" " + index_str +
                        " %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total, label, contigs_counter, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(" " + index_str +
                        " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
def _split_large_reference(entries, corrected_fpath, ref_len):
    """Distribute reference entries over ``split_ref/part_<n>`` files.

    The part paths are collected in ``qconfig.splitted_ref``.  Chromosomes
    longer than ``qconfig.MAX_REFERENCE_LENGTH`` are skipped with a warning.
    Returns False when no part could be produced, True otherwise.
    """
    # important for MetaQUAST which runs QUAST multiple times
    qconfig.splitted_ref = []
    _, fasta_ext = os.path.splitext(corrected_fpath)
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)

    def part_fpath(part_num):
        return os.path.join(split_ref_dirpath, "part_%d" % part_num) + fasta_ext

    max_len = min(ref_len / qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    cur_part_len = 0
    cur_part_num = 1
    cur_part_fpath = part_fpath(cur_part_num)

    for chr_name, chr_seq in entries:
        cur_chr_len = len(chr_seq)
        if cur_chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        cur_part_len += cur_chr_len
        # Close the current part when it would overflow, unless this
        # chromosome is the first one in it.
        if cur_part_len > max_len and cur_part_len != cur_chr_len:
            qconfig.splitted_ref.append(cur_part_fpath)
            cur_part_len = cur_chr_len
            cur_part_num += 1
            cur_part_fpath = part_fpath(cur_part_num)
        fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')

    if cur_part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)
    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True


def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Sanitise a FASTA file into ``corrected_fpath``.

    Sequences shorter than ``min_contig`` are dropped (never for references).
    Unless ``qconfig.no_check`` is set, sequences are upper-cased, the
    ambiguity codes M/K/R/Y/W/S/V/B/H/D become ``N`` and any remaining
    character outside ACGTN makes the function give up before writing.
    A reference longer than ``qconfig.MAX_REFERENCE_FILE_LENGTH`` is also
    split into parts registered in ``qconfig.splitted_ref``.

    Returns:
        True on success; False if the input holds non-ACGTN characters or if
        the reference had to be split and no chromosome fit into a part.
    """
    # NOTE(review): a definition with the same name and logic appears earlier
    # in this file; if both really share one module, this later one is the
    # binding callers get.

    # Patterns are built once per call rather than once per sequence.
    ambiguous = re.compile(r'[MKRYWSVBHD]')
    forbidden = re.compile(r'[^ACGTN]')

    modified_fasta_entries = []
    for first_line, seq in fastaparser.read_fasta(original_fpath):
        if not ((len(seq) >= min_contig) or is_reference):
            continue

        corr_name = qutils.correct_name(first_line)
        corr_seq = seq
        if not qconfig.no_check:
            # seq to uppercase, because later only uppercase letters are examined;
            # alternatives are masked (gage can't work with alternatives)
            corr_seq = ambiguous.sub('N', seq.upper())

            # make sure that only A, C, G, T or N are in the sequence
            if forbidden.search(corr_seq):
                logger.warning(
                    'Skipping ' + original_fpath + ' because it contains non-ACGTN characters.',
                    indent=' ')
                return False

        modified_fasta_entries.append((corr_name, corr_seq))

    fastaparser.write_fasta(corrected_fpath, modified_fasta_entries)

    if is_reference:
        ref_len = sum(len(chr_seq) for _, chr_seq in modified_fasta_entries)
        if ref_len > qconfig.MAX_REFERENCE_FILE_LENGTH:
            return _split_large_reference(modified_fasta_entries, corrected_fpath, ref_len)
    return True
def _correct_references(ref_fpaths, corrected_dirpath):
    """Copy every reference into ``corrected_dirpath`` and build a combined reference.

    Each sequence is renamed to ``<corrected file name>_<corrected first 20
    chars of its original name>`` and appended both to the per-reference file
    and to the combined file ``COMBINED_REF_FNAME``.  References whose base
    names collide get their label from ``get_label_from_par_dir_and_fname``.

    Unless ``qconfig.no_check`` is set, a reference is validated as a whole
    *before* anything is written: if any sequence (case-insensitively) holds a
    character other than ACGTN or the ambiguity codes MKRYWSVBHD, the reference
    is skipped with a warning.  Previously the output path was registered, and
    earlier records were already written, before a bad record was detected,
    so a "skipped" reference could still leak into the results.

    Args:
        ref_fpaths: paths of the input reference FASTA files.
        corrected_dirpath: directory that receives the output files.

    Returns:
        A tuple ``(corrected_ref_fpaths, combined_ref_fpath,
        chromosomes_by_refs, ref_fpaths)``; ``chromosomes_by_refs`` maps a
        reference name to a list of ``(sequence name, sequence length)`` and
        holds an empty list for skipped references; ``ref_fpaths`` is returned
        unchanged.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    chromosomes_by_refs = {}

    # Anything outside ACGTN plus the ambiguity codes that would be masked as N.
    invalid_chars = re.compile(r'[^ACGTNMKRYWSVBHD]')

    def has_invalid_chars(fpath):
        for _, seq in fastaparser.read_fasta(fpath):
            if invalid_chars.search(seq.upper()):
                return True
        return False

    # Count base names once so that duplicates can be detected in O(1).
    name_counts = {}
    for ref_fpath in ref_fpaths:
        ref_name, _ = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        name_counts[ref_name] = name_counts.get(ref_name, 0) + 1

    for ref_fpath in ref_fpaths:
        ref_fname = os.path.basename(ref_fpath)
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        if name_counts[ref_name] > 1:
            ref_name = get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []

        if not qconfig.no_check and has_invalid_chars(ref_fpath):
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.', indent=' ')
            continue

        corr_seq_fpath = None
        ref_label = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            if corr_seq_fpath is None:
                # The output file is chosen (and registered) on the first sequence only.
                corr_seq_fpath = qutils.unique_corrected_fpath(
                    os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
                corrected_ref_fpaths.append(corr_seq_fpath)
                ref_label = qutils.name_from_fpath(corr_seq_fpath)

            corr_seq_name = ref_label + '_' + qutils.correct_name(seq_name[:20])

            # The sequence is written as read; the check above only decides
            # whether the reference is kept.
            # NOTE(review): confirm that case/ambiguity normalisation is meant
            # to happen in a later correction step rather than here.
            fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
            fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

            contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = ref_label
            chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + ref_label)

    logger.main_info(' All references combined in ' + COMBINED_REF_FNAME)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def _break_scaffold(name, seq, min_gap_len):
    """Cut ``seq`` at every run of ``N`` that is at least ``min_gap_len`` long.

    Returns a list of ``(contig name, contig sequence)`` pairs named
    ``<first word of name>_<1-based piece number>``; the N-runs used as break
    points are not included in any piece.
    """
    prefix = name.split()[0] + "_"
    pieces = []
    piece_start = 0

    gap_start = seq.find("N")
    while gap_start != -1:
        gap_end = gap_start + 1
        while (gap_end != len(seq)) and (seq[gap_end] == 'N'):
            gap_end += 1

        if (gap_end - gap_start) >= min_gap_len:
            pieces.append((prefix + str(len(pieces) + 1), seq[piece_start:gap_start]))
            piece_start = gap_end

        # seq[gap_end] is known not to be 'N', so resume right after it
        gap_start = seq.find("N", gap_end + 1)

    pieces.append((prefix + str(len(pieces) + 1), seq[piece_start:]))
    return pieces


def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    """Correct every assembly and, with ``--scaffolds``, its broken version too.

    Special characters are removed from contigs' names because:
      1) Some embedded tools can fail on some strings with "...", "+", "-", etc
      2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla"
         (it interprets as a contig's name only the first word of caption and
         gets ambiguous contigs names)

    Args:
        contigs_fpaths: paths of the assembly FASTA files.
        corrected_dirpath: directory that receives the corrected files.
        reporting: passed through to ``_handle_fasta``.
        labels: assembly labels, parallel to ``contigs_fpaths``.

    Returns:
        Paths of the files accepted by ``_handle_fasta``; for each assembly
        the broken-scaffolds file (if any) precedes the corrected assembly.
    """
    corrected_contigs_fpaths = []

    for file_index, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[file_index]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info(' %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds splitted version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info(" breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []

            # Kept separate from the file index and from the scan position:
            # sharing one variable made the log below print a sequence offset
            # instead of the number of scaffolds.
            scaffolds_counter = 0
            for name, seq in fastaparser.read_fasta(contigs_fpath):
                scaffolds_counter += 1
                broken_scaffolds_fasta.extend(_break_scaffold(name, seq, qconfig.Ns_break_threshold))
            contigs_counter = len(broken_scaffolds_fasta)

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info(" %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_counter,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))
            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
return qutils.correct_name(name) # return re.sub(r'\W', '', re.sub(r'\s', '_', name)) # MAIN if len(sys.argv) != 3: print("Usage: " + sys.argv[0] + " <input fasta> <contig id or file with list of contig ids>") sys.exit() if os.path.isfile(sys.argv[2]): list_of_ids = [] for line in open(sys.argv[2]): list_of_ids.append(line.strip()) else: list_of_ids = [sys.argv[2]] origin_fasta = fastaparser.read_fasta(sys.argv[1]) dict_of_all_contigs = dict() selected_contigs = [] for (name, seq) in origin_fasta: corr_name = get_corr_name(name) dict_of_all_contigs[corr_name] = seq for name in list_of_ids: corr_name = get_corr_name(name) if corr_name in dict_of_all_contigs: selected_contigs.append((name, dict_of_all_contigs[corr_name])) else: print >> sys.stderr, "Contig", name, "(cor name:", corr_name, ") not found!" for (name, seq) in selected_contigs: print '>' + name