def mauve_pw_align(ref, query, dirs): """Set up and perform a pairwise alignment with Mauve.""" # set outputs mauve_outfile = dirs['mauve']+ref.name+"_"+query.name+".mauve" segfile = dirs['aln_segs']+ref.name+"_"+query.name+"_segs.txt" # check for existing alignment if path.exists(segfile): print "already done" else: # prep segments file open(segfile, 'w').write('') # purge any pre-existing sslist files sslist_files = from_dir(dirs['seqfiles'], re.compile(r'.*\.sslist.*')) for sslist in sslist_files: try: os.remove(dirs['seqfiles']+sslist) except Exception: raise # do Mauve alignment file_list = [ref.gbk, query.gbk] align_mauve(file_list, mauve_outfile) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile+".backbone", 0) print "\nSegment results:", len(coords), '->', # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode) print len(chop_array), 'segments <', max_size, 'bp' # make detailed pairwise alignments of the segments print "Aligning segments ..." ref_rec = load_genbank(ref.gbk) query_rec = load_genbank(query.gbk) id = iter_align(chop_array, ref_rec, query_rec, dirs['aln_segs'], segfile) print "Results:", id, "% id. overall" except IOError: print "\nERROR: Mauve alignment failed"
def annot_ref(ref_name, ctg_fas, prot_db_name, fixed_dirs, project_id, blast_prefs): """Annotate reference contig (predict ORFs and assign function).""" # locate the COG database prot_db = fixed_dirs['ref_dbs_dir']+prot_db_name # set inputs and outputs g_gbk_ctgs_root = fixed_dirs['gbk_contigs_dir']+ref_name+"/" ctg_cds_root = fixed_dirs['ctg_cds_dir']+ref_name+"/" ctg_prot_root = fixed_dirs['ctg_prot_dir']+ref_name+"/" ctg_blast_root = fixed_dirs['ctg_blast_dir']+ref_name+"/" annot_trn_root = fixed_dirs['annot_trn_dir'] ensure_dir([g_gbk_ctgs_root, ctg_cds_root, ctg_prot_root, ctg_blast_root, annot_trn_root]) trn_file = annot_trn_root+ref_name+"_annot.trn" g_ctg_gbk = g_gbk_ctgs_root+ref_name+"_1.gbk" annot_gbk = ctg_cds_root+ref_name+"_1_cds.gbk" annot_aa = ctg_prot_root+ref_name+"_1_aa.fas" blast_out = ctg_blast_root+ref_name+"_1.xml" if path.exists(blast_out) and os.stat(blast_out)[6]==0: os.remove(blast_out) if not path.exists(g_ctg_gbk): l_tag_base = ref_name+"_1" record = annot_ctg(ctg_fas, ctg_fas, annot_gbk, annot_aa, trn_file, prot_db, blast_out, l_tag_base, blast_prefs) record.description = ref_name+"_re-annotated" record.name = ref_name+"_1" record.dbxrefs = ["Project: "+project_id+"/"+ref_name +"-like backbones"] record.seq.alphabet = generic_dna write_genbank(g_ctg_gbk, record) else: record = load_genbank(g_ctg_gbk) return record
def ContigDraw(cName, in_file, out_file): """Draw sequence map of a single contig to file.""" # load contig record seq_record = load_genbank(in_file) ctg_length = len(seq_record.seq) features = seq_record.features feat_cnt = len(features) # calculate main canvas dimensions print "\tcalculating canvas dimensions" if ctg_length < 25000: hCan = 32*cm else: hCan = hmar*2 + pNsize + ctg_length*u vCan = dBL + vmar + feat_cnt*ck_vsp transX = hmar + pNsize transY = dBL + vmar/2 + feat_cnt*ck_vsp # set up main canvas canvas_main = Canvasser(hCan, vCan, transX, transY, out_file) print "\tdrawing contig baselines and features" # draw contig baseline and features BaseDraw(canvas_main, cName, ctg_length, features) # draw scale SeqScale(canvas_main, scX, incrT, incrN, dip, dop ) # write to file and finalize the figure canvas_main.showPage() canvas_main.save() print "OK"
def gbk2fas(gbk_file):
    """Convert a GenBank file to FastA format.

    Returns the path of the FastA file written alongside the input
    (same basename, '.fas' extension).
    """
    record = load_genbank(gbk_file)
    # derive the output name; guard against inputs without a '.gbk'
    # suffix, where str.find returns -1 and would clip the last character
    suffix_idx = gbk_file.find('.gbk')
    if suffix_idx < 0:
        fas_file = gbk_file+'.fas'
    else:
        fas_file = gbk_file[:suffix_idx]+'.fas'
    write_fasta(fas_file, record)
    return fas_file
def get_segs_from_feats(self, feat_type):
    """Define segments from features of the given type.

    Appends one segment dict per matching feature to self.segs, named
    '<feat_type>_<n>' with 1-based sequential numbering.
    """
    feats = [feat for feat in load_genbank(self.gbk).features
             if feat.type == feat_type]
    # enumerate replaces the former hand-rolled counter (see old TODO)
    for counter, feat in enumerate(feats, start=1):
        # feature positions may be fuzzy Biopython position objects;
        # str() then int() normalizes them to plain integers
        a = int(str(feat.location.start))
        b = int(str(feat.location.end))
        feat_id = feat_type+'_'+str(counter)
        seg = {'coords': (a, b),
               'strand': feat.strand,
               'name': feat_id,
               'note': str(a)+'_'+str(b)}
        self.segs.append(seg)
def mauve_pw_align(ref, query, r_root_dir, g_root_dir, dirs, run, max_size, chop_mode, mauve_exec, mtype):
    """Set up and perform a pairwise alignment with Mauve.

    Skips all work when the per-pair segments file already exists.
    Re-raises IOError after logging when the Mauve output cannot be read.
    """
    aln_dir = r_root_dir + run + dirs['aln_segs']
    mauve_dir = r_root_dir + run + dirs['mauve']
    # set outputs
    mauve_outfile = mauve_dir + ref.name + "_" + query.name + ".mauve"
    segfile = aln_dir + ref.name + "_" + query.name + "_segs.txt"
    # check for existing alignment
    if path.exists(segfile):
        print "already done"
    else:
        # prep segments file (truncate to empty)
        open(segfile, 'w').write('')
        # purge any pre-existing sslist files left by earlier Mauve runs
        sslist_files = from_dir(g_root_dir, re.compile(r'.*\.sslist.*'))
        for sslist in sslist_files:
            try:
                os.remove(g_root_dir + sslist)
            except Exception:
                # surface removal failures instead of silently continuing
                raise
        # do Mauve alignment
        file_list = [ref.gbk, query.gbk]
        align_mauve(file_list, mauve_outfile, mauve_exec)
        try:
            # parse Mauve output (without initial clumping)
            coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype)
            print "\nSegment results:", len(coords), '->',
            # chop segments that are too long
            chop_array = chop_rows(coords, max_size, chop_mode, mtype)
            print len(chop_array), 'segments <', max_size, 'bp'
            # make detailed pairwise alignments of the segments
            print "Aligning segments ..."
            ref_rec = load_genbank(ref.gbk)
            query_rec = load_genbank(query.gbk)
            id = iter_align(chop_array, ref_rec, query_rec, aln_dir, segfile)
            print "Results:", id, "% id. overall"
        except IOError:
            print "\nERROR: Mauve alignment failed"
            raise
def map_cst_aln(run_ref, ref_gbk, genome, scaff_gbk, segs_root, maps_root, segtype, min_size, fct_flags, fct_colors, idpt): """Generate map of construct aligned to reference.""" # set inputs and outputs g_name = genome['name'] ref_ctg_n = run_ref.name seg_file = segs_root + g_name + "/" + g_name + "_" + ref_ctg_n + "_segs.txt" map_file = maps_root + g_name + "_vs_" + ref_ctg_n + ".pdf" # start mapping try: open(scaff_gbk) except IOError: print "WARNING: No scaffold construct to map" else: try: # load segments TODO: add idp-based clumping segdata = np.loadtxt(seg_file, skiprows=1, dtype=segtype) except IOError: msg = "\nERROR: could not load segments data" run_ref.log(msg) print msg except StopIteration: msg = "\nERROR: could not make map" run_ref.log(msg) print msg else: # offset coordinates where desired try: g_offset = genome['offset'] if g_offset[0] != 0 or g_offset[1] != 0: q_len = len(load_genbank(scaff_gbk).seq) segdata = offset_q2r_coords(segdata, q_len, g_offset, segtype) # determine whether to flip the query sequence (negative offset) if g_offset[1] < 0: q_invert = True else: q_invert = False except KeyError: g_offset = (0, 0) q_invert = False # generate graphical map pairwise_draw(ref_ctg_n, g_name, ref_gbk, scaff_gbk, segdata, map_file, q_invert, g_offset, 'dual', 'dual', 'm', 'fct', 'fct', min_size, fct_flags, fct_colors, idpt)
def get_segs_from_feats(self, feat_type):
    """Collect segment definitions from every feature of the requested
    type in the record at self.gbk, appending them to self.segs."""
    record = load_genbank(self.gbk)
    matching = [f for f in record.features if f.type == feat_type]
    index = 0
    for feature in matching:
        index += 1
        # normalize possibly-fuzzy positions to plain integers
        left = int(str(feature.location.start))
        right = int(str(feature.location.end))
        self.segs.append({
            'coords': (left, right),
            'strand': feature.strand,
            'name': feat_type + '_' + str(index),
            'note': str(left) + '_' + str(right)
        })
def map_cst_aln(run_ref, ref_gbk, genome, scaff_gbk, segs_root, maps_root, segtype, min_size, fct_flags, fct_colors, idpt): """Generate map of construct aligned to reference.""" # set inputs and outputs g_name = genome['name'] ref_ctg_n = run_ref.name seg_file = segs_root+g_name+"/"+g_name+"_"+ref_ctg_n+"_segs.txt" map_file = maps_root+g_name+"_vs_"+ref_ctg_n+".pdf" # start mapping try: open(scaff_gbk) except IOError: print "WARNING: No scaffold construct to map" else: try: # load segments TODO: add idp-based clumping segdata = np.loadtxt(seg_file, skiprows=1, dtype=segtype) except IOError: msg = "\nERROR: could not load segments data" run_ref.log(msg) print msg except StopIteration: msg = "\nERROR: could not make map" run_ref.log(msg) print msg else: # offset coordinates where desired try: g_offset = genome['offset'] if g_offset[0] != 0 or g_offset[1] != 0: q_len = len(load_genbank(scaff_gbk).seq) segdata = offset_q2r_coords(segdata, q_len, g_offset, segtype) # determine whether to flip the query sequence (negative offset) if g_offset[1] < 0: q_invert = True else: q_invert = False except KeyError: g_offset = (0,0) q_invert = False # generate graphical map pairwise_draw(ref_ctg_n, g_name, ref_gbk, scaff_gbk, segdata, map_file, q_invert, g_offset, 'dual', 'dual', 'm', 'fct', 'fct', min_size, fct_flags, fct_colors, idpt)
def contig_draw(cName, in_file, out_file, annot_mode, key, fct_flags, fct_colors): """Draw sequence map of a single contig to file.""" # load contig record seq_record = load_genbank(in_file) ctg_len = len(seq_record.seq) feats = seq_record.features cds = [ feature for feature in feats if feature.type == 'CDS' or feature.type == 'cds' ] if annot_mode == 'all': annot_cds = [len(cds)] else: try: annot_cds = [ 1 for feature in cds if feature.qualifiers.get(key)[0] != 'no match' ] except TypeError: annot_cds = [] annot_cnt = sum(annot_cds) # calculate main canvas dimensions if ctg_len * u < 2000: seq_len = 2000 else: seq_len = ctg_len * u hCan = hmar * 2 + pNsize + seq_len vCan = dBL + vmar * 4 + (annot_cnt / 2) * ck_vsp transX = hmar + pNsize transY = dBL + vmar * 2 + (annot_cnt / 2) * ck_vsp ctg_Y = vmar # set up main canvas canvas = canvasser(hCan, vCan, transX, transY, out_file) # draw contig baseline and features base_draw(canvas, cName, ctg_len, feats, key, -doLdn, ctg_Y, 0, 'single', annot_cnt, None, None, seq_len, annot_mode, fct_flags, fct_colors) # draw scale seq_scale(canvas, (ctg_len * u) - pNsize, incrT, incrN, dip, dop) # write to file and finalize the figure canvas.showPage() canvas.save()
def contig_draw(cName, in_file, out_file, annot_mode, key, fct_flags, fct_colors):
    """Draw sequence map of a single contig to file."""
    # load the contig record and pull out its features
    record = load_genbank(in_file)
    contig_length = len(record.seq)
    features = record.features
    coding = [f for f in features if f.type == 'CDS' or f.type == 'cds']
    # count how many features will carry annotation labels
    if annot_mode == 'all':
        annotated = [len(coding)]
    else:
        try:
            # a CDS lacking the qualifier makes .get() return None, so the
            # indexing raises TypeError and we fall back to no annotations
            annotated = [1 for f in coding
                         if f.qualifiers.get(key)[0] != 'no match']
        except TypeError:
            annotated = []
    annot_cnt = sum(annotated)
    # canvas dimensions (enforce a minimum drawable sequence length)
    seq_len = 2000 if contig_length*u < 2000 else contig_length*u
    hCan = hmar*2 + pNsize + seq_len
    vCan = dBL + vmar*4 + (annot_cnt/2)*ck_vsp
    transX = hmar + pNsize
    transY = dBL + vmar*2 + (annot_cnt/2)*ck_vsp
    ctg_Y = vmar
    # set up main canvas
    canvas = canvasser(hCan, vCan, transX, transY, out_file)
    # baseline plus features, then the scale bar
    base_draw(canvas, cName, contig_length, features, key, -doLdn, ctg_Y, 0,
              'single', annot_cnt, None, None, seq_len, annot_mode,
              fct_flags, fct_colors)
    seq_scale(canvas, (contig_length*u)-pNsize, incrT, incrN, dip, dop)
    # flush the page and write the figure out
    canvas.showPage()
    canvas.save()
def map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size, fct_flags, fct_colors, idpt): """Generate map of reference contig with segment details. This provides a comparison of the original reference and the re-annotated version. """ # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ori_file = run_ref.file ref_maps_root = run_root+run_dirs['ref_map_dir'] ensure_dir([ref_maps_root]) gbk_file = run_root+run_dirs['ref_gbk_dir']+ref_n+"_re-annot.gbk" map_file = ref_maps_root+ref_n+"_ref.pdf" # start mapping try: # make mock segment, full-length with 100% id record = load_genbank(gbk_file) length = len(record.seq) segdata = [[1, length, 1, length, 100]] # deactivate offsetting g_offset = (0,0) q_invert = False # generate graphical map pairwise_draw(ref_n+"_ra", ref_n+"_ori", gbk_file, ori_file, segdata, map_file, q_invert, g_offset, 'dual', 'dual', 'm', 'fct', 'product', min_size, fct_flags, fct_colors, idpt) except IOError: msg = "\nERROR: could not load segments data" run_ref.log(msg) print msg except StopIteration: msg = "\nERROR: could not make map" run_ref.log(msg) print msg
def map_ref_segs(run_ref, run_id, r_root_dir, run_dirs, min_size, fct_flags, fct_colors, idpt): """Generate map of reference contig with segment details. This provides a comparison of the original reference and the re-annotated version. """ # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" ori_file = run_ref.file ref_maps_root = run_root + run_dirs['ref_map_dir'] ensure_dir([ref_maps_root]) gbk_file = run_root + run_dirs['ref_gbk_dir'] + ref_n + "_re-annot.gbk" map_file = ref_maps_root + ref_n + "_ref.pdf" # start mapping try: # make mock segment, full-length with 100% id record = load_genbank(gbk_file) length = len(record.seq) segdata = [[1, length, 1, length, 100]] # deactivate offsetting g_offset = (0, 0) q_invert = False # generate graphical map pairwise_draw(ref_n + "_ra", ref_n + "_ori", gbk_file, ori_file, segdata, map_file, q_invert, g_offset, 'dual', 'dual', 'm', 'fct', 'product', min_size, fct_flags, fct_colors, idpt) except IOError: msg = "\nERROR: could not load segments data" run_ref.log(msg) print msg except StopIteration: msg = "\nERROR: could not make map" run_ref.log(msg) print msg
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id, timestamp, prot_db_name, project_id): """Re-annotate contig and extract reference segments using coordinates.""" # set inputs and outputs run_root = r_root_dir+run_id+"/" ref_name = ref['name'] in_file = fixed_dirs['ori_g_dir']+ref['file'] seg_out_root = run_root+run_dirs['ref_seg_dir']+ref_name+"/" gen_fas_root = fixed_dirs['fas_contigs_dir']+ref_name+"/" if ref_annot_flag: ref_gbk = run_root+run_dirs['ref_gbk_dir']+ref_name+"_re-annot.gbk" else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix ref_gbk = in_file ref_fas = run_root+run_dirs['ref_fas_dir']+ref_name+".fas" genome_fas = gen_fas_root+ref_name+"_1.fas" report_root = run_root+run_dirs['reports']+ref_name+"/" ref_log = report_root+run_id+"_"+ref_name+"_log.txt" ensure_dir([seg_out_root, report_root, gen_fas_root]) print " ", ref_name, "...", # initialize run_ref object run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'], ref['capture'], ref_fas, ref_gbk, seg_out_root, ref_log) # initialize reference log cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"] open(ref_log, 'w').write(" ".join(cl_header)) # open record and ensure we have a fasta in the right place if not path.exists(ref_fas): if run_ref.input == 'fas': copyfile(in_file, ref_fas) elif run_ref.input == 'gbk': record = load_genbank(in_file) record.id = ref_name write_fasta(ref_fas, record) else: msg = "ERROR: Input not recognized for "+ref_name run_ref.log(msg) raise Exception(msg) # make a BLAST DB make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs) copyfile(ref_fas, genome_fas) # re-annotate ref contig if ref_annot_flag: record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs, project_id) else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix record = load_genbank(in_file) # load or generate segment definitions if run_ref.seg_mode == 'chop': run_ref.get_segs_from_chop(len(record.seq), 
ref['chop_size']) elif run_ref.seg_mode == 'list': run_ref.get_segs_from_list(ref['segs']) elif run_ref.seg_mode == 'feats': run_ref.get_segs_from_feats(ref['feat_type']) # extract segment sequences rec_annot = run_ref.extract_segs_seqs(record, seg_out_root) # write re-annotated reference sequence to file write_genbank(ref_gbk, rec_annot) # report results logstring = " ".join([str(len(run_ref.segs)), "segments"]) print logstring run_ref.log(logstring) return run_ref
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds): """Unpack genome files. Here, unpacking means extracting data and producing specific files to standardize how the information is made available to downstream analysis. Depending on the input file format, different unpacking methods are invoked. In all cases, this ensures that for each genome, there is a multifasta file of the contigs all together as well as a separate Genbank file for each contig. Supported input file formats are the following: - mfas: Basic whole genome sequence in multifasta file of contigs. This can be used to process a finished genome in a single Fasta file as well. - cgbk: All contigs concatenated in a single GenBank file (Genoscope, French WGS). This can be used to process a finished genome in a single GanBank file as well. # TODO: provide support for other possible input formats Unpacking 'cgbk' genomes involves an initial step to detect occurrences of the sequence separator and collect the start and stop coordinates of each contig. Each pair of coordinates can then be used to extract the contig sequence and create a SeqRecord for that contig, which SeqIO normally does when it unpacks multifasta files. 
""" # set up inputs infile = genome['file'] #TODO: make GUI input loader (upstream) inpath = fixed_dirs['ori_g_dir']+infile g_name = genome['name'] print " ", g_name, "...", # prep output destinations mfas_dir = fixed_dirs['mfas_contigs_dir'] fas_dir = fixed_dirs['fas_contigs_dir']+g_name+"/" ensure_dir([mfas_dir, fas_dir]) mfas_file = mfas_dir+g_name+"_contigs.fas" records = [] # select unpacking method if genome['input'] is 'fas': try: path.exists(inpath) is True except ValueError: raise Exception("Bad input file path") genome_recs = load_multifasta(inpath) # generate GenBank files counter = 0 for rec in genome_recs: counter +=1 ctg_num = str(counter) new_id = g_name+"_"+ctg_num # workaround for long ids new_seq = rec.seq new_seq.alphabet = generic_dna new_rec = SeqRecord(seq=new_seq, id=new_id) records.append(new_rec) # for multifasta output fas_file = fas_dir+new_id+".fas" write_fasta(fas_file, new_rec) elif genome['input'] is 'gbk': # load in genome data genome_rec = load_genbank(inpath) g_string = genome_rec.seq # find split coordinates coord_pairs = multisplit_finder(g_string, separator) # split record counter = 0 for (start, stop) in coord_pairs: counter +=1 ctg_num = str(counter) new_record = genome_rec[start:stop] new_record.id = g_name+"_"+ctg_num records.append(new_record) # for multifasta output fas_file = fas_dir+g_name+"_"+ctg_num+".fas" write_fasta(fas_file, new_record) else: xmsg = "Input file format "+genome['input']+" unspecified/unsupported" raise Exception(xmsg) print counter, "contigs" # write master file write_fasta(mfas_file, records) # pass records to stats logger ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with
    reference segments. The contigs were aligned against the complete
    reference in a previous step for mapping purposes. Now the output of
    that step is re-used determine their position. A caveat is that if
    there are natural local rearrangements in the sequence relative to
    the reference, they may not be resolved appropriately. The problem is
    somewhat moderated by the fact that this function takes the best
    (usually the largest) hit region as "anchor" to position the contig
    within the scaffold. But if the rearranged region takes up a
    significant portion of the contig length, the anchoring will probably
    not be called correctly. Visual inspection of the finalized maps
    should help diagnose any such problems. The order can be fixed
    manually using the Mauve Contig Mover, which is part of Mauve 2. Note
    that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs.

    Model-based filtering produces a list of contigs that will be passed
    to the scaffolder. If filtering manually by looking at the maps,
    there are two options available: either select exclusively OR exclude
    a subset of contigs for the scaffolding process. This is done by
    listing their ID number in the genome dictionaries in the config file
    then resuming the pipeline from this step.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ctgs_root = run_root+run_dirs['run_gbk_ctgs_dir']+ref_n+"/"
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    scaffolds_dir = run_root+run_dirs['scaffolds_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp,
                         "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root+g_name+"/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root+g_name+"/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.fas"
        scaff_gbk = scaffolds_dir+g_name+"_"+ref_n+"_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        # stub row; real anchors are inserted in front of it below
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'),
                                           ('end', 'i4'), ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                if mode == "exclude":
                    # KeyError below means the genome dict has no
                    # 'exclude' list, so all contigs are kept
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                elif mode == "select":
                    # KeyError below means no 'select' list: keep all
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "("+ctg_num+")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir+ctg_num+".mauve"
            bb_file = mauve_file+".backbone"
            try:
                # parse Mauve output
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(anchors_array, 0,
                                          (ctg_num, anchor_seg['start'],
                                           anchor_seg['end'],
                                           anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)
        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1 # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches
            # directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir+g_name+"_"+str(ctg_num)+".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1: # flip record
                        record = record.reverse_complement(id=True,
                                                           name=True,
                                                           annotations=True,
                                                           description=True)
                    ctg_list.append(record)
                else: # workaround for having 0 value leftover from stub
                    pass # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try:
                    ctg_num = match.group(1)
                except Exception:
                    ctg_num = 'N'
                feature = SeqFeature(location=feat_loc, type='contig',
                                     qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name+" scaffold from "+ref_n
            # GenBank LOCUS names are length-limited; retry with a
            # truncated id on ValueError
            # NOTE(review): [:-100] assumes the separator bumper is
            # 100 bp long — confirm against the separator definition
            try:
                scaff_record.id = g_name
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
            except ValueError:
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100]) # rm last bumper
        # NOTE(review): terminates the comma-separated progress line for
        # this genome — placement reconstructed from mangled layout
        print ""
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig."""
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/contigs/"
    q_ctgs_root = run_root + run_dirs['match_out_dir'] + ref_n + "/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root + g_name + "/"
        mauve_dir = mauve_root + g_name + "/"
        aln_segs_root = segments_root + g_name + "/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            # only process files named like '<prefix>_<number>.fas'
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir + item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir + ctg_num + ".mauve"
                aln_segs_dir = aln_segs_root + ctg_num + "/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir + ctg_num + "_" + ref_n + "_segs.txt"
                # start from an empty segments file
                open(segfile, 'w').write('')
                # do Mauve alignment (existence probes for both inputs)
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile + ".backbone",
                                                 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode,
                                               mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec,
                                   aln_segs_dir, segfile)
                    except IOError:
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
        # NOTE(review): terminates the comma-separated progress line for
        # this genome — placement reconstructed from mangled layout
        print ""
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, max_size, chop_mode, mtype, mauve_exec): """Align constructs pairwise to the reference contig.""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir + run_id + "/" ref_ctg_file = run_ref.file mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/constructs/" segments_root = run_root + run_dirs['aln_seg_dir'] + ref_n + "/constructs/" scaff_root = run_root + run_dirs['scaffolds_dir'] + ref_n + "/" ensure_dir([segments_root]) print " ", ref_n # log logstring = "".join( ["\n\n# Align scaffold constructs to reference @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] scaff_gbk = scaff_root + g_name + "_" + ref_n + "_scaffold.gbk" file_list = (ref_ctg_file, scaff_gbk) print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root + g_name + "/" aln_segs_dir = segments_root + g_name + "/" ensure_dir([mauve_dir, aln_segs_dir]) mauve_outfile = mauve_dir + g_name + "_" + ref_n + ".mauve" segfile = aln_segs_dir + g_name + "_" + ref_n + "_segs.txt" # abort if the reference file is not found try: open(ref_ctg_file, 'r') except IOError: msg = "ERROR: Reference file not found" print msg run_ref.log(msg) raise # abort if there is no scaffold construct try: open(scaff_gbk, 'r') except IOError: msg = "WARNING: No scaffold construct to align" print msg run_ref.log(msg) else: # prep segments file open(segfile, 'w').write('') # purge any pre-existing sslist file sslist_file = scaff_gbk + ".sslist" if os.path.isfile(sslist_file): try: os.remove(sslist_file) except Exception: raise # do Mauve alignment align_mauve(file_list, mauve_outfile, mauve_exec) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile + ".backbone", 0, mtype) print len(coords), '->', logstring = "".join(["\t", str(len(coords))]) 
run_ref.log(logstring) # chop segments that are too long chop_array = chop_rows(coords, max_size, chop_mode, mtype) print len(chop_array), 'segments <', max_size, 'bp', logstring = "".join(["\t", str(len(chop_array))]) run_ref.log(logstring) # make detailed pairwise alignments of the segments ref_rec = load_genbank(ref_ctg_file) query_rec = load_genbank(scaff_gbk) id = iter_align(chop_array, ref_rec, query_rec, aln_segs_dir, segfile) print "@", id, "% id. overall" logstring = "".join(["\t", str(id)]) run_ref.log(logstring) except IOError: msg = "\nERROR: Mauve alignment failed" run_ref.log(msg) print msg
def build_scaffolds(run_ref, r_root_dir, run_dirs, prox_D, separator, genomes, run_id, timestamp, mtype, mode):
    """Build a scaffold of contigs based on the reference.

    This takes contigs that gave positive hits when blasted with reference
    segments. The contigs were aligned against the complete reference in a
    previous step for mapping purposes. Now the output of that step is
    re-used determine their position. A caveat is that if there are natural
    local rearrangements in the sequence relative to the reference, they
    may not be resolved appropriately. The problem is somewhat moderated by
    the fact that this function takes the best (usually the largest) hit
    region as "anchor" to position the contig within the scaffold. But if
    the rearranged region takes up a significant portion of the contig
    length, the anchoring will probably not be called correctly. Visual
    inspection of the finalized maps should help diagnose any such
    problems. The order can be fixed manually using the Mauve Contig Mover,
    which is part of Mauve 2.

    Note that not all hit contigs are "real" hits, so filtering should be
    applied before scaffolding to generate constructs. Model-based
    filtering produces a list of contigs that will be passed to the
    scaffolder. If filtering manually by looking at the maps, there are two
    options available: either select exclusively OR exclude a subset of
    contigs for the scaffolding process. This is done by listing their ID
    number in the genome dictionaries in the config file then resuming the
    pipeline from this step.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir + run_id + "/"
    ctgs_root = run_root + run_dirs['run_gbk_ctgs_dir'] + ref_n + "/"
    mauve_root = run_root + run_dirs['mauve_out_dir'] + ref_n + "/contigs/"
    scaffolds_dir = run_root + run_dirs['scaffolds_dir'] + ref_n + "/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Build scaffold constructs @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs
        g_name = genome['name']
        ctgs_dir = ctgs_root + g_name + "/"
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # set outputs
        mauve_dir = mauve_root + g_name + "/"
        ensure_dir([mauve_dir, scaffolds_dir])
        scaff_fas = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.fas"
        scaff_gbk = scaffolds_dir + g_name + "_" + ref_n + "_scaffold.gbk"
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_dir)
        # stub row; real anchors are inserted at index 0 as they are found
        anchors_array = np.zeros(1, dtype=[('ctg', 'i4'), ('start', 'i4'), ('end', 'i4'), ('orient', 'i2')])
        # identify contigs we want to select
        subset = []
        for item in dir_contents:
            # contig files are named ..._<number>.gbk
            pattern = re.compile(r'.*_(\d*)\.gbk$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                if mode == "exclude":
                    # keep all contigs EXCEPT those listed in genome['exclude']
                    try:
                        if int(ctg_num) in genome[mode]:
                            # excluded contigs are echoed in parentheses
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                        else:
                            subset.append(ctg_num)
                    except KeyError:
                        # no exclusion list in config: include everything
                        msg = "WARNING: no ignored segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
                elif mode == "select":
                    # keep ONLY contigs listed in genome['select']
                    try:
                        if int(ctg_num) in genome[mode]:
                            msg = ctg_num
                            print msg,
                            run_ref.log(msg)
                            subset.append(ctg_num)
                        else:
                            msg = "(" + ctg_num + ")"
                            print msg,
                            run_ref.log(msg)
                    except KeyError:
                        # no selection list in config: include everything
                        msg = "WARNING: no selected segments list, including all"
                        print msg
                        msg = ctg_num
                        print msg,
                        subset.append(ctg_num)
                        run_ref.log(msg)
        # at this point we should have a subset of contigs selected
        for ctg_num in subset:
            logstring = "".join(["\t", ctg_num])
            run_ref.log(logstring)
            # set inputs
            mauve_file = mauve_dir + ctg_num + ".mauve"
            bb_file = mauve_file + ".backbone"
            try:
                # parse Mauve output (clumping distance prox_D)
                coords = mauver_load2_k0(bb_file, prox_D, mtype)
                # determine which segment to use as anchor
                anchor_seg = get_anchor_loc(coords)
                anchors_array = np.insert(
                    anchors_array, 0,
                    (ctg_num, anchor_seg['start'], anchor_seg['end'], anchor_seg['orient']))
            except IOError:
                msg = "\tERROR: Mauve alignment not found\n\t"
                print msg
                run_ref.log(msg)
            except Exception:
                # broad catch: any parse/anchor failure skips this contig
                msg = "\tERROR: Iteration failure\n\t"
                print msg
                run_ref.log(msg)
        # abort if there is no valid contig to proceed with
        try:
            assert len(anchors_array) > 1  # always 1 left from stub
        except AssertionError:
            msg = "\tWARNING: Contig list empty\n\t"
            print msg
            run_ref.log(msg)
        else:
            # order contigs by anchor location on the reference
            anchors_array = np.sort(anchors_array, order='start')
            # load contig records from the genbank files in the matches directory
            ctg_list = []
            for ctg_anchor in anchors_array:
                ctg_num = ctg_anchor['ctg']
                if ctg_num > 0:
                    contig_gbk = ctgs_dir + g_name + "_" + str(ctg_num) + ".gbk"
                    record = load_genbank(contig_gbk)
                    if ctg_anchor['orient'] == -1:
                        # flip record (keep id/name/annotations/description)
                        record = record.reverse_complement(id=True, name=True, annotations=True, description=True)
                    ctg_list.append(record)
                else:
                    # workaround for having 0 value leftover from stub
                    pass  # having it might come in handy in later dev
            # output scaffold files
            write_fasta(scaff_fas, ctg_list)
            # concatenate contigs, separated by "bumper" spacer records
            scaff_record = SeqRecord('', id='temp')
            scaff_bumper = SeqRecord(separator, id='join')
            for record in ctg_list:
                feat_start = len(scaff_record.seq)
                scaff_record += record
                feat_stop = len(scaff_record.seq)
                scaff_record += scaff_bumper
                # annotate the contig's span within the scaffold
                feat_loc = FeatureLocation(feat_start, feat_stop)
                pattern = re.compile(r'.*_(\d*)$')
                match = pattern.match(record.id)
                try:
                    ctg_num = match.group(1)
                except Exception:
                    # id did not end in _<number>; fall back to placeholder
                    ctg_num = 'N'
                feature = SeqFeature(location=feat_loc, type='contig', qualifiers={'id': ctg_num})
                scaff_record.features.append(feature)
            scaff_record.description = g_name + " scaffold from " + ref_n
            try:
                scaff_record.id = g_name
                # NOTE: [:-100] trims the trailing bumper — assumes the
                # separator is 100 bp long; TODO confirm against config
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
            except ValueError:
                # GenBank LOCUS names are length-limited; truncate the id
                scaff_record.id = g_name[:10]
                write_genbank(scaff_gbk, scaff_record[:-100])  # rm last bumper
    print ""
def align_cstrct2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, max_size, chop_mode, mtype, mauve_exec): """Align constructs pairwise to the reference contig.""" # set inputs and outputs ref_n = run_ref.name run_root = r_root_dir+run_id+"/" ref_ctg_file = run_ref.file mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/constructs/" segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/constructs/" scaff_root = run_root+run_dirs['scaffolds_dir']+ref_n+"/" ensure_dir([segments_root]) print " ", ref_n # log logstring = "".join(["\n\n# Align scaffold constructs to reference @", timestamp, "\n"]) run_ref.log(logstring) # cycle through genomes for genome in genomes: # set inputs g_name = genome['name'] scaff_gbk = scaff_root+g_name+"_"+ref_n+"_scaffold.gbk" file_list = (ref_ctg_file, scaff_gbk) print "\t", g_name, "...", # log logstring = "".join(["\n", g_name]) run_ref.log(logstring) # set outputs mauve_dir = mauve_root+g_name+"/" aln_segs_dir = segments_root+g_name+"/" ensure_dir([mauve_dir, aln_segs_dir]) mauve_outfile = mauve_dir+g_name+"_"+ref_n+".mauve" segfile = aln_segs_dir+g_name+"_"+ref_n+"_segs.txt" # abort if the reference file is not found try: open(ref_ctg_file, 'r') except IOError: msg = "ERROR: Reference file not found" print msg run_ref.log(msg) raise # abort if there is no scaffold construct try: open(scaff_gbk, 'r') except IOError: msg = "WARNING: No scaffold construct to align" print msg run_ref.log(msg) else: # prep segments file open(segfile, 'w').write('') # purge any pre-existing sslist file sslist_file = scaff_gbk+".sslist" if os.path.isfile(sslist_file): try: os.remove(sslist_file) except Exception: raise # do Mauve alignment align_mauve(file_list, mauve_outfile, mauve_exec) try: # parse Mauve output (without initial clumping) coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype) print len(coords), '->', logstring = "".join(["\t", str(len(coords))]) run_ref.log(logstring) # chop segments that are too long 
chop_array = chop_rows(coords, max_size, chop_mode, mtype) print len(chop_array), 'segments <', max_size, 'bp', logstring = "".join(["\t", str(len(chop_array))]) run_ref.log(logstring) # make detailed pairwise alignments of the segments ref_rec = load_genbank(ref_ctg_file) query_rec = load_genbank(scaff_gbk) id = iter_align(chop_array, ref_rec, query_rec, aln_segs_dir, segfile) print "@", id, "% id. overall" logstring = "".join(["\t", str(id)]) run_ref.log(logstring) except IOError: msg = "\nERROR: Mauve alignment failed" run_ref.log(msg) print msg
def align_ctg2ref(run_ref, run_id, timestamp, r_root_dir, run_dirs, genomes, mauve_exec, max_size, chop_mode, mtype):
    """Align contigs pairwise to the reference contig.

    For each genome, every matching contig FASTA is aligned to the
    reference with Mauve; the backbone output is parsed, over-long
    segments are chopped, and detailed per-segment alignments are written
    to the segments directory.
    """
    # set inputs and outputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    ref_ctg_file = run_ref.file
    mauve_root = run_root+run_dirs['mauve_out_dir']+ref_n+"/contigs/"
    segments_root = run_root+run_dirs['aln_seg_dir']+ref_n+"/contigs/"
    q_ctgs_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    ensure_dir([segments_root])
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Align contigs to ref @", timestamp, "\n"])
    run_ref.log(logstring)
    # cycle through genomes
    for genome in genomes:
        # set inputs and outputs
        g_name = genome['name']
        ctgs_fas_dir = q_ctgs_root+g_name+"/"
        mauve_dir = mauve_root+g_name+"/"
        aln_segs_root = segments_root+g_name+"/"
        ensure_dir([mauve_dir])
        print "\t", g_name, "...",
        # log
        logstring = "".join(["\n", g_name])
        run_ref.log(logstring)
        # list genbank files in matches directory
        dir_contents = listdir(ctgs_fas_dir)
        for item in dir_contents:
            # contig FASTAs are named ..._<number>.fas
            pattern = re.compile(r'.*_(\d*)\.fas$')
            match = pattern.match(item)
            if match:
                ctg_num = match.group(1)
                print ctg_num,
                logstring = "".join(["\t", ctg_num])
                run_ref.log(logstring)
                # set inputs and outputs
                q_contig = ctgs_fas_dir+item
                file_list = (ref_ctg_file, q_contig)
                mauve_outfile = mauve_dir+ctg_num+".mauve"
                aln_segs_dir = aln_segs_root+ctg_num+"/"
                ensure_dir([aln_segs_dir])
                segfile = aln_segs_dir+ctg_num+"_"+ref_n+"_segs.txt"
                # truncate/prepare the segments file
                open(segfile, 'w').write('')
                # do Mauve alignment
                # (open() is used purely as an existence probe here)
                try:
                    open(ref_ctg_file, 'r')
                    open(q_contig, 'r')
                except IOError:
                    msg = "\nERROR: File missing, cannot align\n\t\t\t"
                    run_ref.log(msg)
                    print msg
                else:
                    align_mauve(file_list, mauve_outfile, mauve_exec)
                    try:
                        # parse Mauve output (without initial clumping)
                        coords = mauver_load2_k0(mauve_outfile+".backbone", 0, mtype)
                        # chop segments that are too long
                        chop_array = chop_rows(coords, max_size, chop_mode, mtype)
                        # make detailed pairwise alignments of the segments
                        ref_rec = load_genbank(ref_ctg_file)
                        query_rec = load_fasta(q_contig)
                        iter_align(chop_array, ref_rec, query_rec, aln_segs_dir, segfile)
                    except IOError:
                        # Mauve produced no .backbone file
                        msg = "\nERROR: Mauve alignment failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
                    except Exception:
                        # broad catch: any parse/align failure skips this contig
                        msg = "\nERROR: Iteration failed\n\t\t\t"
                        run_ref.log(msg)
                        print msg
    print ""
def process_ref(ref, ref_annot_flag, r_root_dir, fixed_dirs, run_dirs, run_id, timestamp, prot_db_name, project_id): """Re-annotate contig and extract reference segments using coordinates.""" # set inputs and outputs run_root = r_root_dir + run_id + "/" ref_name = ref['name'] in_file = fixed_dirs['ori_g_dir'] + ref['file'] seg_out_root = run_root + run_dirs['ref_seg_dir'] + ref_name + "/" gen_fas_root = fixed_dirs['fas_contigs_dir'] + ref_name + "/" if ref_annot_flag: ref_gbk = run_root + run_dirs[ 'ref_gbk_dir'] + ref_name + "_re-annot.gbk" else: ## bypass re-annotated ONLY IF ORIGINAL INPUT IS GBK #todo: fix ref_gbk = in_file ref_fas = run_root + run_dirs['ref_fas_dir'] + ref_name + ".fas" genome_fas = gen_fas_root + ref_name + "_1.fas" report_root = run_root + run_dirs['reports'] + ref_name + "/" ref_log = report_root + run_id + "_" + ref_name + "_log.txt" ensure_dir([seg_out_root, report_root, gen_fas_root]) print " ", ref_name, "...", # initialize run_ref object run_ref = Reference(ref_name, in_file, ref['input'], ref['seg_mode'], ref['capture'], ref_fas, ref_gbk, seg_out_root, ref_log) # initialize reference log cl_header = ["# Console log:", run_id, "/", ref_name, timestamp, "\n\n"] open(ref_log, 'w').write(" ".join(cl_header)) # open record and ensure we have a fasta in the right place if not path.exists(ref_fas): if run_ref.input == 'fas': copyfile(in_file, ref_fas) elif run_ref.input == 'gbk': record = load_genbank(in_file) record.id = ref_name write_fasta(ref_fas, record) else: msg = "ERROR: Input not recognized for " + ref_name run_ref.log(msg) raise Exception(msg) # make a BLAST DB make_ref_DB(ref, run_id, fixed_dirs, r_root_dir, run_dirs) copyfile(ref_fas, genome_fas) # re-annotate ref contig if ref_annot_flag: record = annot_ref(ref_name, ref_fas, prot_db_name, fixed_dirs, project_id) else: ## bypass re-annotation ONLY IF ORIGINAL INPUT IS GBK #todo: fix record = load_genbank(in_file) # load or generate segment definitions if run_ref.seg_mode == 
'chop': run_ref.get_segs_from_chop(len(record.seq), ref['chop_size']) elif run_ref.seg_mode == 'list': run_ref.get_segs_from_list(ref['segs']) elif run_ref.seg_mode == 'feats': run_ref.get_segs_from_feats(ref['feat_type']) # extract segment sequences rec_annot = run_ref.extract_segs_seqs(record, seg_out_root) # write re-annotated reference sequence to file write_genbank(ref_gbk, rec_annot) # report results logstring = " ".join([str(len(run_ref.segs)), "segments"]) print logstring run_ref.log(logstring) return run_ref
def pairwise_draw(ref, query, segs, map_file, mode1, mode2, annot_mode, key1, key2, idpt, fct_flags, fct_colors, min_size):
    """Draw pairwise alignment map with similarity shading.

    Renders the reference and query baselines with their features, then
    shades aligned segment pairs by percent identity. Canvas geometry
    relies on module-level layout constants (u, hmar, vmar, pNsize, dBL,
    ck_vsp, incrT, incrN, dip, dop, doLup, doLdn) defined elsewhere in
    this module — presumably points/scaling units; verify there.
    """
    # load ref record
    ref_record = load_genbank(ref.gbk)
    ref_feat = ref_record.features
    ref_cds = [feature for feature in ref_feat
               if feature.type == 'CDS' or feature.type == 'cds']
    if annot_mode != 'all':
        # count only CDS with a meaningful annotation under key1
        try:
            ref_annot_cds = [1 for cds in ref_cds
                             if cds.qualifiers.get(key1)[0] != 'hypothetical protein' and \
                                cds.qualifiers.get(key1)[0] != 'no match']
        except TypeError:
            # qualifiers.get(key1) was None for some CDS: treat as none annotated
            ref_annot_cds = []
        ref_annot_cnt = sum(ref_annot_cds)
    else:
        ref_annot_cnt = len(ref_cds)
    # load query record
    query_record = load_genbank(query.gbk)
    if query.invert:
        query_record = query_record.reverse_complement()
    q_feat = query_record.features
    query_cds = [feature for feature in q_feat
                 if feature.type == 'CDS' or feature.type == 'cds']
    if annot_mode != 'all':
        # same annotation-count logic as for the reference, keyed on key2
        try:
            query_annot_cds = [1 for cds in query_cds
                               if cds.qualifiers.get(key2)[0] != 'hypothetical protein' and \
                                  cds.qualifiers.get(key2)[0] != 'no match']
        except TypeError:
            query_annot_cds = []
        query_annot_cnt = sum(query_annot_cds)
    else:
        query_annot_cnt = len(query_cds)
    # calculate main canvas dimensions - horizontal
    # width tracks the longer of (nudged) reference and query
    if ref.len+ref.nudge > query.len:
        ctg_len = ref.len+ref.nudge
    else:
        ctg_len = query.len
    # enforce a minimum drawing width of 2000 canvas units
    if ctg_len*u < 2000:
        seq_len = 2000
    else:
        seq_len = ctg_len*u
    hCan = hmar*2 + pNsize + seq_len
    # calculate main canvas dimensions - vertical
    if mode1 == 'single' and mode2 == 'n':
        annot_cnt = ref_annot_cnt
        annot_len = annot_cnt/2  # NOTE: integer division in Python 2
    else:
        annot_cnt = max(ref_annot_cnt, query_annot_cnt)
        annot_len = annot_cnt
    vCan = dBL + vmar*6 + annot_len*ck_vsp
    transX = hmar + pNsize
    transY = dBL + vmar*1.8 + annot_len*ck_vsp
    ref_Y = vmar*2.8
    query_Y = vmar
    # set up main canvas
    m_canvas = canvasser(hCan, vCan, transX, transY, map_file)
    # draw scale
    seq_scale(m_canvas, (ctg_len*u)-pNsize, 0, incrT, incrN, dip, dop)
    # draw shading legend
    heatkey(m_canvas, -pNsize, -pNsize/2, idpt)
    # draw ref baseline and features
    base_draw(m_canvas, ref, ref_feat, key1, doLup, ref_Y, 0, mode1,
              annot_cnt, seq_len, annot_mode, 'top', fct_flags, fct_colors)
    # draw query baseline and features
    base_draw(m_canvas, query, q_feat, key2, -doLdn, query_Y, seq_len/2, mode2,
              annot_cnt, seq_len, annot_mode, 'low', fct_flags, fct_colors)
    # draw pairwise similarity shading
    try:
        for xa, xb, xc, xd, idp in segs:
            # evaluate color shading category from percent identity
            sh_color = HexColor(simcolor(idp, idpt))
            # check for split (a segment whose coordinates run backwards
            # must be split before shading)
            if abs(xa) > abs(xb) or abs(xc) > abs(xd):
                new_segpairs = shade_split(xa, xb, xc, xd, ref, query)
                for xa1, xb1, xc1, xd1 in new_segpairs:
                    # draw shading
                    shadowfax(m_canvas, xa1, xb1, xc1, xd1, ref_Y, query_Y,
                              sh_color, min_size)
            else:
                # draw shading
                shadowfax(m_canvas, xa, xb, xc, xd, ref_Y, query_Y,
                          sh_color, min_size)
    except TypeError:
        # malformed segment data is a programming error: propagate
        raise
    # write to file and finalize the figure
    m_canvas.showPage()
    m_canvas.save()
def unpack_genomes(genome, separator, fixed_dirs, ctg_thresholds): """Unpack genome files. Here, unpacking means extracting data and producing specific files to standardize how the information is made available to downstream analysis. Depending on the input file format, different unpacking methods are invoked. In all cases, this ensures that for each genome, there is a multifasta file of the contigs all together as well as a separate Genbank file for each contig. Supported input file formats are the following: - mfas: Basic whole genome sequence in multifasta file of contigs. This can be used to process a finished genome in a single Fasta file as well. - cgbk: All contigs concatenated in a single GenBank file (Genoscope, French WGS). This can be used to process a finished genome in a single GanBank file as well. # TODO: provide support for other possible input formats Unpacking 'cgbk' genomes involves an initial step to detect occurrences of the sequence separator and collect the start and stop coordinates of each contig. Each pair of coordinates can then be used to extract the contig sequence and create a SeqRecord for that contig, which SeqIO normally does when it unpacks multifasta files. 
""" # set up inputs infile = genome['file'] #TODO: make GUI input loader (upstream) inpath = fixed_dirs['ori_g_dir'] + infile g_name = genome['name'] print " ", g_name, "...", # prep output destinations mfas_dir = fixed_dirs['mfas_contigs_dir'] fas_dir = fixed_dirs['fas_contigs_dir'] + g_name + "/" ensure_dir([mfas_dir, fas_dir]) mfas_file = mfas_dir + g_name + "_contigs.fas" records = [] # select unpacking method if genome['input'] is 'fas': try: path.exists(inpath) is True except ValueError: raise Exception("Bad input file path") genome_recs = load_multifasta(inpath) # generate GenBank files counter = 0 for rec in genome_recs: counter += 1 ctg_num = str(counter) new_id = g_name + "_" + ctg_num # workaround for long ids new_seq = rec.seq new_seq.alphabet = generic_dna new_rec = SeqRecord(seq=new_seq, id=new_id) records.append(new_rec) # for multifasta output fas_file = fas_dir + new_id + ".fas" write_fasta(fas_file, new_rec) elif genome['input'] is 'gbk': # load in genome data genome_rec = load_genbank(inpath) g_string = genome_rec.seq # find split coordinates coord_pairs = multisplit_finder(g_string, separator) # split record counter = 0 for (start, stop) in coord_pairs: counter += 1 ctg_num = str(counter) new_record = genome_rec[start:stop] new_record.id = g_name + "_" + ctg_num records.append(new_record) # for multifasta output fas_file = fas_dir + g_name + "_" + ctg_num + ".fas" write_fasta(fas_file, new_record) else: xmsg = "Input file format " + genome[ 'input'] + " unspecified/unsupported" raise Exception(xmsg) print counter, "contigs" # write master file write_fasta(mfas_file, records) # pass records to stats logger ctg_stats(g_name, fixed_dirs, ctg_thresholds, records)
def multi_draw(g_pairs, segdata_list, mapfile, idpt, fct_flags, fct_colors, min_size):
    """Draw multiple alignment map with similarity shading.

    Stacks the first reference and every query genome as baselines, then
    shades aligned segment pairs between consecutive rows by percent
    identity. Geometry relies on module-level layout constants (u, hmar,
    vmar, pNsize, dBL, incrT, incrN, dip, dop, doLup) defined elsewhere
    in this module — presumably points/scaling units; verify there.
    """
    print "Lookin\' good!"
    # compile info: first reference plus each pair's query, with lengths
    lengths = [g_pairs[0][0].len+g_pairs[0][0].nudge]
    g_to_draw = [g_pairs[0][0]]
    for ref, query in g_pairs:
        lengths.append(query.len+query.nudge)
        g_to_draw.append(query)
    max_len = max(lengths)
    # calculate main canvas dimensions - horizontal
    # enforce a minimum drawing width of 2000 canvas units
    if max_len*u < 2000:
        seq_len = 2000
    else:
        seq_len = max_len*u
    hCan = hmar*4 + pNsize + seq_len
    # calculate main canvas dimensions - vertical (one row per pair)
    vCan = dBL*len(g_pairs) + vmar*4
    transX = hmar + pNsize
    transY = dBL*len(g_pairs)
    init_Y = vmar*2
    # set up main canvas
    m_canvas = canvasser(hCan, vCan, transX, transY, mapfile)
    # draw scale (max_len*u)-pNsize, hmar
    seq_scale(m_canvas, 2*hCan/3, -vmar*2, incrT, incrN, dip, dop)
    # draw shading legend
    heatkey(m_canvas, hCan-hmar*5, init_Y+vmar, idpt)
    # draw ref baseline and features, one stacked row per genome
    counter = 0
    for genome in g_to_draw:
        g_record = load_genbank(genome.gbk)
        g_feat = g_record.features
        g_cds = [feature for feature in g_feat
                 if feature.type == 'CDS' or feature.type == 'cds']
        ref_Y = init_Y-dBL*counter
        base_draw(m_canvas, genome, g_cds, '', doLup, ref_Y, 0, 'n', 0,
                  seq_len, 'n', 'n', fct_flags, fct_colors)
        counter +=1
    counter = 0
    for ref, query in g_pairs:
        # each pair shades between its row and the row below it
        ref_Y = init_Y-dBL*counter
        query_Y = init_Y-dBL*(counter+1)
        # draw pairwise similarity shading
        try:
            # TODO: adapt Y
            for xa, xb, xc, xd, idp in segdata_list[counter]:
                # evaluate color shading category from percent identity
                sh_color = HexColor(simcolor(idp, idpt))
                # check for split (coordinates running backwards must be
                # split before shading)
                if abs(xa) > abs(xb) or abs(xc) > abs(xd):
                    new_segpairs = shade_split(xa, xb, xc, xd, ref, query)
                    for xa1, xb1, xc1, xd1 in new_segpairs:
                        # draw shading
                        shadowfax(m_canvas, xa1, xb1, xc1, xd1, ref_Y,
                                  query_Y, sh_color, min_size)
                else:
                    # draw shading
                    shadowfax(m_canvas, xa, xb, xc, xd, ref_Y, query_Y,
                              sh_color, min_size)
            counter +=1
        except TypeError:
            # NOTE(review): silently skips a pair with malformed segment
            # data AND leaves counter unincremented — subsequent pairs
            # reuse the same row; presumably intentional best-effort,
            # verify with maintainers
            pass
    # write to file and finalize the figure
    m_canvas.showPage()
    m_canvas.save()