def mauve_pw_align(ref, query, dirs): """Set up and perform a pairwise alignment with Mauve.""" # set outputs mauve_outfile = dirs['mauve']+ref.name+"_"+query.name+".mauve" segfile = dirs['aln_segs']+ref.name+"_"+query.name+"_segs.txt" # check for existing alignment if path.exists(segfile): print "already done" else: # prep segments file open(segfile, 'w').write('') # purge any pre-existing sslist files sslist_files = from_dir(dirs['seqfiles'], re.compile(r'.*\.sslist.*')) for sslist in sslist_files: try: os.remove(dirs['seqfiles']+sslist) except Exception: raise # do Mauve alignment file_list = [ref.gbk, query.gbk] align_mauve(file_list, mauve_outfile) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile+".backbone", 0) print "\nSegment results:", len(coords), '->', # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode) print len(chop_array), 'segments <', max_size, 'bp' # make detailed pairwise alignments of the segments print "Aligning segments ..." ref_rec = load_genbank(ref.gbk) query_rec = load_genbank(query.gbk) id = iter_align(chop_array, ref_rec, query_rec, dirs['aln_segs'], segfile) print "Results:", id, "% id. overall" except IOError: print "\nERROR: Mauve alignment failed"
def mauve_pw_align(ref, query, r_root_dir, g_root_dir, dirs, run, max_size, chop_mode, mauve_exec, mtype): """Set up and perform a pairwise alignment with Mauve.""" aln_dir = r_root_dir + run + dirs['aln_segs'] mauve_dir = r_root_dir + run + dirs['mauve'] # set outputs mauve_outfile = mauve_dir + ref.name + "_" + query.name + ".mauve" segfile = aln_dir + ref.name + "_" + query.name + "_segs.txt" # check for existing alignment if path.exists(segfile): print "already done" else: # prep segments file open(segfile, 'w').write('') # purge any pre-existing sslist files sslist_files = from_dir(g_root_dir, re.compile(r'.*\.sslist.*')) for sslist in sslist_files: try: os.remove(g_root_dir + sslist) except Exception: raise # do Mauve alignment file_list = [ref.gbk, query.gbk] align_mauve(file_list, mauve_outfile, mauve_exec) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype) print "\nSegment results:", len(coords), '->', # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode, mtype) print len(chop_array), 'segments <', max_size, 'bp' # make detailed pairwise alignments of the segments print "Aligning segments ..." ref_rec = load_genbank(ref.gbk) query_rec = load_genbank(query.gbk) id = iter_align(chop_array, ref_rec, query_rec, aln_dir, segfile) print "Results:", id, "% id. overall" except IOError: print "\nERROR: Mauve alignment failed" raise
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode): """Build a scaffold of contigs based on the reference. This takes contigs that gave positive hits when blasted with reference segments. The contigs were aligned against the complete reference in a previous step for mapping purposes. Now the output of that step is re-used determine their position. A caveat is that if there are natural local rearrangements in the sequence relative to the reference, they may not be resolved appropriately. The problem is somewhat moderated by the fact that this function takes the best (usually the largest) hit region as "anchor" to position the contig within the scaffold. But if the rearranged region takes up a significant portion of the contig length, the anchoring will probably not be called correctly. Visual inspection of the finalized maps should help diagnose any such problems. The order can be fixed manually using the Mauve Contig Mover, which is part of Mauve 2. Note that not all hit contigs are "real" hits, so filtering should be applied before scaffolding to generate constructs. Model-based filtering produces a list of contigs that will be passed to the scaffolder. If filtering manually by looking at the maps, there are two options available: either select exclusively OR exclude a subset of contigs for the scaffolding process. This is done by listing their ID number in the genome dictionaries in the config file then resuming the pipeline from this step. """ # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/" mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/" scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/" print " ", ref_n # log logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] ctgs_dir = ctgs_root + g_name + "/" print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root + g_name + "/" ensure_dir([mauve_dir, scaffolds_dir]) scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas" scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk" # list genbank files in matches directory dir_contents = listdir(ctgs_dir) anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'), ('end', 'i4'), ('orient', 'i2')]) # identify contigs we want to select subset = [] for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.gbk$') match = pattern.match(item) if match: ctg_num = match.group(1) if mode == "exclude": try: if int(ctg_num) in genome[mode]: msg = "(" + ctg_num + ")" print msg, run_ref.log(msg) else: subset.append(ctg_num) except KeyError: msg = "WARNING: no ignored segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) elif mode == "select": try: if int(ctg_num) in genome[mode]: msg = ctg_num print msg, run_ref.log(msg) subset.append(ctg_num) else: msg = "(" + ctg_num + ")" print msg, run_ref.log(msg) except KeyError: msg = "WARNING: no selected segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) # at this point we should have a subset of contigs selected for ctg_num in subset: logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs mauve_file = mauve_dir + ctg_num + ".mauve" bb_file = mauve_file + ".backbone" try: # parse Mauve output coords = mauver_load2_k0(bb_file, prox_D, mtype) # determine which segment to use as anchor anchor_seg = get_anchor_loc(coords) anchors_array = np.insert( anchors_array, 0, (ctg_num, anchor_seg['start'], anchor_seg['end'], anchor_seg['orient'])) except IOError: msg = "\tERROR: Mauve alignment not found\n\t" print msg run_ref.log(msg) except Exception: msg = "\tERROR: Iteration failure\n\t" print msg run_ref.log(msg) # abort if there is no valid contig to proceed with try: assert len(anchors_array) > 1 # always 1 left from stub except AssertionError: msg = "\tWARNING: Contig list empty\n\t" print msg run_ref.log(msg) else: # order contigs by anchor location anchors_array = np.sort(anchors_array, order='start') # load contig records from the genbank files in the matches directory ctg_list = [] for ctg_anchor in anchors_array: ctg_num = ctg_anchor['ctg'] if ctg_num > 0: contig_gbk = ctgs_dir + g_name + "_" + str( ctg_num) + ".gbk" record = load_genbank(contig_gbk) if ctg_anchor['orient'] == -1: # flip record record = record.reverse_complement(id=True, name=True, annotations=True, description=True) ctg_list.append(record) else: # workaround for having 0 value leftover from stub pass # having it might come in handy in later dev # output scaffold files write_fasta(scaff_fas, ctg_list) scaff_record = SeqRecord('', id='temp') scaff_bumper = SeqRecord(separator, id='join') for record in ctg_list: feat_start = len(scaff_record.seq) scaff_record += record feat_stop = len(scaff_record.seq) scaff_record += scaff_bumper feat_loc = FeatureLocation(feat_start, feat_stop) pattern = re.compile(r'.*_(\d*)$') match = pattern.match(record.id) try: ctg_num = match.group(1) except Exception: ctg_num = 'N' feature = SeqFeature(location=feat_loc, type='contig', qualifiers={'id': ctg_num}) scaff_record.features.append(feature) scaff_record.description = g_name + " scaffold from " + ref_n try: scaff_record.id = g_name write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper except ValueError: scaff_record.id = g_name[:10] write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper print ""
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode): """Build a scaffold of contigs based on the reference. This takes contigs that gave positive hits when blasted with reference segments. The contigs were aligned against the complete reference in a previous step for mapping purposes. Now the output of that step is re-used determine their position. A caveat is that if there are natural local rearrangements in the sequence relative to the reference, they may not be resolved appropriately. The problem is somewhat moderated by the fact that this function takes the best (usually the largest) hit region as "anchor" to position the contig within the scaffold. But if the rearranged region takes up a significant portion of the contig length, the anchoring will probably not be called correctly. Visual inspection of the finalized maps should help diagnose any such problems. The order can be fixed manually using the Mauve Contig Mover, which is part of Mauve 2. Note that not all hit contigs are "real" hits, so filtering should be applied before scaffolding to generate constructs. Model-based filtering produces a list of contigs that will be passed to the scaffolder. If filtering manually by looking at the maps, there are two options available: either select exclusively OR exclude a subset of contigs for the scaffolding process. This is done by listing their ID number in the genome dictionaries in the config file then resuming the pipeline from this step. """ # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/" mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/" scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/" print " ", ref_n # log logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] ctgs_dir = ctgs_root+g_name+"/" print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root+g_name+"/" ensure_dir([mauve_dir, scaffolds_dir]) scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas" scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk" # list genbank files in matches directory dir_contents = listdir(ctgs_dir) anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'), ('end', 'i4'), ('orient', 'i2')]) # identify contigs we want to select subset = [] for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.gbk$') match = pattern.match(item) if match: ctg_num = match.group(1) if mode == "exclude": try: if int(ctg_num) in genome[mode]: msg = "("+ctg_num+")" print msg, run_ref.log(msg) else: subset.append(ctg_num) except KeyError: msg = "WARNING: no ignored segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) elif mode == "select": try: if int(ctg_num) in genome[mode]: msg = ctg_num print msg, run_ref.log(msg) subset.append(ctg_num) else: msg = "("+ctg_num+")" print msg, run_ref.log(msg) except KeyError: msg = "WARNING: no selected segments list, including all" print msg msg = ctg_num print msg, subset.append(ctg_num) run_ref.log(msg) # at this point we should have a subset of contigs selected for ctg_num in subset: logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs mauve_file = mauve_dir+ctg_num+".mauve" bb_file = mauve_file+".backbone" try: # parse Mauve output coords = mauver_load2_k0(bb_file, prox_D, mtype) # determine which segment to use as anchor anchor_seg = get_anchor_loc(coords) anchors_array = np.insert(anchors_array, 0, (ctg_num, anchor_seg['start'], anchor_seg['end'], anchor_seg['orient'])) except IOError: msg = "\tERROR: Mauve alignment not found\n\t" print msg run_ref.log(msg) except Exception: msg = "\tERROR: Iteration failure\n\t" print msg run_ref.log(msg) # abort if there is no valid contig to proceed with try: assert len(anchors_array) > 1 # always 1 left from stub except AssertionError: msg = "\tWARNING: Contig list empty\n\t" print msg run_ref.log(msg) else: # order contigs by anchor location anchors_array = np.sort(anchors_array, order='start') # load contig records from the genbank files in the matches directory ctg_list = [] for ctg_anchor in anchors_array: ctg_num = ctg_anchor['ctg'] if ctg_num > 0: contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk" record = load_genbank(contig_gbk) if ctg_anchor['orient'] == -1: # flip record record = record.reverse_complement(id=True, name=True, annotations=True, description=True) ctg_list.append(record) else: # workaround for having 0 value leftover from stub pass # having it might come in handy in later dev # output scaffold files write_fasta(scaff_fas, ctg_list) scaff_record = SeqRecord('', id='temp') scaff_bumper = SeqRecord(separator, id='join') for record in ctg_list: feat_start = len(scaff_record.seq) scaff_record += record feat_stop = len(scaff_record.seq) scaff_record += scaff_bumper feat_loc = FeatureLocation(feat_start, feat_stop) pattern = re.compile(r'.*_(\d*)$') match = pattern.match(record.id) try: ctg_num = match.group(1) except Exception: ctg_num = 'N' feature = SeqFeature(location=feat_loc, type='contig', qualifiers={'id': ctg_num}) scaff_record.features.append(feature) scaff_record.description = g_name+" scaffold from "+ref_n try: scaff_record.id = g_name write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper except ValueError: scaff_record.id = g_name[:10] write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper print ""
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, mauve_exec, max_size, chop_mode, mtype): """Align contigs pairwise to the reference contig.""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" ref_ctg_file = run_ref.file mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/" segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/contigs/" q_ctgs_root = run_root + run_dirs['match_out_dir'] + ref_n + "/" ensure_dir([segments_root]) print " ", ref_n # log logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs and outputs g_name = genome['name'] ctgs_fas_dir = q_ctgs_root + g_name + "/" mauve_dir = mauve_root + g_name + "/" aln_segs_root = segments_root + g_name + "/" ensure_dir([mauve_dir]) print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # list genbank files in matches directory dir_contents = listdir(ctgs_fas_dir) for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.fas$') match = pattern.match(item) if match: ctg_num = match.group(1) print ctg_num, logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs and outputs q_contig = ctgs_fas_dir + item file_list = (ref_ctg_file, q_contig) mauve_outfile = mauve_dir + ctg_num + ".mauve" aln_segs_dir = aln_segs_root + ctg_num + "/" ensure_dir([aln_segs_dir]) segfile = aln_segs_dir + ctg_num + "_" + ref_n + "_segs.txt" open(segfile, 'w').write('') # do Mauve alignment try: open(ref_ctg_file, 'r') open(q_contig, 'r') except IOError: msg = "\nERROR: File missing, cannot align\n\t\t\t" run_ref.log(msg) print msg else: align_mauve(file_list, mauve_outfile, mauve_exec) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype) # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode, mtype) # make detailed pairwise alignments of the segments ref_rec = load_genbank(ref_ctg_file) query_rec = load_fasta(q_contig) iter_align(chop_array, ref_rec, query_rec, aln_segs_dir, segfile) except IOError: msg = "\nERROR: Mauve alignment failed\n\t\t\t" run_ref.log(msg) print msg except Exception: msg = "\nERROR: Iteration failed\n\t\t\t" run_ref.log(msg) print msg print ""
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, max_size, chop_mode, mtype, mauve_exec): """Align constructs pairwise to the reference contig.""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" ref_ctg_file = run_ref.file mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/constructs/" segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/constructs/" scaff_root = run_root + run_dirs['scaffolds_dir'] + ref_n + "/" ensure_dir([segments_root]) print " ", ref_n # log logstring = "".join( ["\n\n# Align scaffold constructs to reference @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] scaff_gbk = scaff_root + g_name + "_" + ref_n + "_scaffold.gbk" file_list = (ref_ctg_file, scaff_gbk) print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root + g_name + "/" aln_segs_dir = segments_root + g_name + "/" ensure_dir([mauve_dir, aln_segs_dir]) mauve_outfile = mauve_dir + g_name + "_" + ref_n + ".mauve" segfile = aln_segs_dir + g_name + "_" + ref_n + "_segs.txt" # abort if the reference file is not found try: open(ref_ctg_file, 'r') except IOError: msg = "ERROR: Reference file not found" print msg run_ref.log(msg) raise # abort if there is no scaffold construct try: open(scaff_gbk, 'r') except IOError: msg = "WARNING: No scaffold construct to align" print msg run_ref.log(msg) else: # prep segments file open(segfile, 'w').write('') # purge any pre-existing sslist file sslist_file = scaff_gbk + ".sslist" if os.path.isfile(sslist_file): try: os.remove(sslist_file) except Exception: raise # do Mauve alignment align_mauve(file_list, mauve_outfile, mauve_exec) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype) print len(coords), '->', logstring = "".join(["\t", str(len(coords))]) run_ref.log(logstring) # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode, mtype) print len(chop_array), 'segments <', max_size, 'bp', logstring = "".join(["\t", str(len(chop_array))]) run_ref.log(logstring) # make detailed pairwise alignments of the segments ref_rec = load_genbank(ref_ctg_file) query_rec = load_genbank(scaff_gbk) id = iter_align(chop_array, ref_rec, query_rec, aln_segs_dir, segfile) print "@", id, "% id. overall" logstring = "".join(["\t", str(id)]) run_ref.log(logstring) except IOError: msg = "\nERROR: Mauve alignment failed" run_ref.log(msg) print msg
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, mauve_exec, max_size, chop_mode, mtype): """Align contigs pairwise to the reference contig.""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ref_ctg_file = run_ref.file mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/" segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/" q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/" ensure_dir([segments_root]) print " ", ref_n # log logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs and outputs g_name = genome['name'] ctgs_fas_dir = q_ctgs_root+g_name+"/" mauve_dir = mauve_root+g_name+"/" aln_segs_root = segments_root+g_name+"/" ensure_dir([mauve_dir]) print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # list genbank files in matches directory dir_contents = listdir(ctgs_fas_dir) for item in dir_contents: pattern = re.compile(r'.*_(\d*)\.fas$') match = pattern.match(item) if match: ctg_num = match.group(1) print ctg_num, logstring = "".join(["\t", ctg_num]) run_ref.log(logstring) # set inputs and outputs q_contig = ctgs_fas_dir+item file_list = (ref_ctg_file, q_contig) mauve_outfile = mauve_dir+ctg_num+".mauve" aln_segs_dir = aln_segs_root+ctg_num+"/" ensure_dir([aln_segs_dir]) segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt" open(segfile, 'w').write('') # do Mauve alignment try: open(ref_ctg_file, 'r') open(q_contig, 'r') except IOError: msg = "\nERROR: File missing, cannot align\n\t\t\t" run_ref.log(msg) print msg else: align_mauve(file_list, mauve_outfile, mauve_exec) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype) # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode, mtype) # make detailed pairwise alignments of the segments ref_rec = load_genbank(ref_ctg_file) query_rec = load_fasta(q_contig) iter_align(chop_array, ref_rec, query_rec, aln_segs_dir, segfile) except IOError: msg = "\nERROR: Mauve alignment failed\n\t\t\t" run_ref.log(msg) print msg except Exception: msg = "\nERROR: Iteration failed\n\t\t\t" run_ref.log(msg) print msg print ""
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, max_size, chop_mode, mtype, mauve_exec): """Align constructs pairwise to the reference contig.""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ref_ctg_file = run_ref.file mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/constructs/" segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/constructs/" scaff_root = run_root+run_dirs['scaffolds_dir']+ref_n+"/" ensure_dir([segments_root]) print " ", ref_n # log logstring = "".join(["\n\n# Align scaffold constructs to reference @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] scaff_gbk = scaff_root+g_name+"_"+ref_n+"_scaffold.gbk" file_list = (ref_ctg_file, scaff_gbk) print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root+g_name+"/" aln_segs_dir = segments_root+g_name+"/" ensure_dir([mauve_dir, aln_segs_dir]) mauve_outfile = mauve_dir+g_name+"_"+ref_n+".mauve" segfile = aln_segs_dir+g_name+"_"+ref_n+"_segs.txt" # abort if the reference file is not found try: open(ref_ctg_file, 'r') except IOError: msg = "ERROR: Reference file not found" print msg run_ref.log(msg) raise # abort if there is no scaffold construct try: open(scaff_gbk, 'r') except IOError: msg = "WARNING: No scaffold construct to align" print msg run_ref.log(msg) else: # prep segments file open(segfile, 'w').write('') # purge any pre-existing sslist file sslist_file = scaff_gbk+".sslist" if os.path.isfile(sslist_file): try: os.remove(sslist_file) except Exception: raise # do Mauve alignment align_mauve(file_list, mauve_outfile, mauve_exec) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype) print len(coords), '->', logstring = "".join(["\t", str(len(coords))]) run_ref.log(logstring) # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode, mtype) print len(chop_array), 'segments <', max_size, 'bp', logstring = "".join(["\t", str(len(chop_array))]) run_ref.log(logstring) # make detailed pairwise alignments of the segments ref_rec = load_genbank(ref_ctg_file) query_rec = load_genbank(scaff_gbk) id = iter_align(chop_array, ref_rec, query_rec, aln_segs_dir, segfile) print "@", id, "% id. overall" logstring = "".join(["\t", str(id)]) run_ref.log(logstring) except IOError: msg = "\nERROR: Mauve alignment failed" run_ref.log(msg) print msg