def GetExec(self, optList, frame):
    """Respond to the "muscle" command by building its command line.

    Stores ``frame`` on the instance, records the output file and output
    type, configures a ``MuscleCommandline`` from the widgets found in
    ``frame.paramBoxes`` and ``self.boxList``, and returns the command
    rendered as a string.

    Args:
        optList: Not read by this method.
        frame: Object providing ``paramBoxes`` (mapping of widget name to a
            control exposing ``GetValue()``), ``abet`` and ``options``.

    Returns:
        str: ``str(cline)`` for the configured command line.
    """
    self.frame = frame
    plugin_exe = r"C:/Program Files (x86)/py27/Lib/site-packages/Muscle.exe"
    seq_file = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\my_seq.fasta"
    self.outfile = r".\plugins\muscle.txt"
    self.outtype = "fasta"
    cline = MuscleCommandline(plugin_exe, out=self.outfile)
    boxes = self.frame.paramBoxes

    # NOTE(review): the original indentation was lost. This nesting assumes
    # the trailing ``else`` pairs with the '1ProfileCheck' membership test,
    # so a plain input file is used whenever profile mode is not selected.
    if '1ProfileCheck' in boxes:
        if boxes['1ProfileCheck'].GetValue():
            cline.profile = True
            cline.in1 = seq_file
            cline.in2 = seq_file
        else:
            cline.input = seq_file
        if '1DiagCheck' in boxes:
            if boxes['1DiagCheck'].GetValue():
                cline.diags = True
                # Read each spin control's value (the controls themselves
                # cannot be converted with int()) and store it in the
                # option that matches the control's name.
                if "DiagLenSpin" in boxes:
                    cline.diaglength = int(boxes["DiagLenSpin"].GetValue())
                if "DiagMargSpin" in boxes:
                    cline.diagmargin = int(boxes["DiagMargSpin"].GetValue())
                # NOTE(review): the gap penalty is only applied when no
                # "DiagBreakSpin" control exists, as in the original
                # if/elif chain - confirm this is intended.
                if "DiagBreakSpin" in boxes:
                    cline.diagbreak = int(boxes["DiagBreakSpin"].GetValue())
                elif "GapPenSpin" in boxes:
                    cline.gapopen = float(boxes["GapPenSpin"].GetValue())
    else:
        cline.input = seq_file

    if self.frame.abet == "AA":
        cline.seqtype = "protein"
    elif self.frame.abet in ("DNA", "RNA"):
        cline.seqtype = "nucleo"
    else:
        cline.seqtype = "auto"

    if self.frame.options:
        # (option name, converter, index into self.boxList), applied in order.
        advanced = (
            ("objscore", str, 9),
            ("weight1", str, 13),
            ("weight2", str, 15),
            ("anchorspacing", int, 17),
            ("center", float, 19),
            ("hydro", int, 21),
            ("hydrofactor", float, 23),
            ("maxhours", float, 25),
            ("maxiters", int, 27),
            ("maxtrees", int, 29),
            ("minbestcolscore", float, 31),
            ("minsmoothscore", float, 33),
            ("smoothscoreceil", float, 35),
            ("smoothwindow", int, 37),
            ("sueff", float, 39),
        )
        for option, convert, index in advanced:
            setattr(cline, option, convert(self.boxList[index].GetValue()))

    return str(cline)
def GetExec(self, optList, frame):
    """Respond to the "muscle" command by building its command line.

    Stores ``frame`` on the instance, records the output file and output
    type, configures a ``MuscleCommandline`` from the widgets found in
    ``frame.paramBoxes`` and ``self.boxList``, and returns the command
    rendered as a string.

    Args:
        optList: Not read by this method.
        frame: Object providing ``paramBoxes`` (mapping of widget name to a
            control exposing ``GetValue()``), ``abet`` and ``options``.

    Returns:
        str: ``str(cline)`` for the configured command line.
    """
    self.frame = frame
    plugin_exe = r"C:/Program Files (x86)/py27/Lib/site-packages/Muscle.exe"
    seq_file = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\my_seq.fasta"
    self.outfile = r".\plugins\muscle.txt"
    self.outtype = "fasta"
    cline = MuscleCommandline(plugin_exe, out=self.outfile)
    boxes = self.frame.paramBoxes

    # NOTE(review): the original indentation was lost. This nesting assumes
    # the trailing ``else`` pairs with the '1ProfileCheck' membership test,
    # so a plain input file is used whenever profile mode is not selected.
    if '1ProfileCheck' in boxes:
        if boxes['1ProfileCheck'].GetValue():
            cline.profile = True
            cline.in1 = seq_file
            cline.in2 = seq_file
        else:
            cline.input = seq_file
        if '1DiagCheck' in boxes:
            if boxes['1DiagCheck'].GetValue():
                cline.diags = True
                # Read each spin control's value (the controls themselves
                # cannot be converted with int()) and store it in the
                # option that matches the control's name.
                if "DiagLenSpin" in boxes:
                    cline.diaglength = int(boxes["DiagLenSpin"].GetValue())
                if "DiagMargSpin" in boxes:
                    cline.diagmargin = int(boxes["DiagMargSpin"].GetValue())
                # NOTE(review): the gap penalty is only applied when no
                # "DiagBreakSpin" control exists, as in the original
                # if/elif chain - confirm this is intended.
                if "DiagBreakSpin" in boxes:
                    cline.diagbreak = int(boxes["DiagBreakSpin"].GetValue())
                elif "GapPenSpin" in boxes:
                    cline.gapopen = float(boxes["GapPenSpin"].GetValue())
    else:
        cline.input = seq_file

    if self.frame.abet == "AA":
        cline.seqtype = "protein"
    elif self.frame.abet in ("DNA", "RNA"):
        cline.seqtype = "nucleo"
    else:
        cline.seqtype = "auto"

    if self.frame.options:
        # (option name, converter, index into self.boxList), applied in order.
        advanced = (
            ("objscore", str, 9),
            ("weight1", str, 13),
            ("weight2", str, 15),
            ("anchorspacing", int, 17),
            ("center", float, 19),
            ("hydro", int, 21),
            ("hydrofactor", float, 23),
            ("maxhours", float, 25),
            ("maxiters", int, 27),
            ("maxtrees", int, 29),
            ("minbestcolscore", float, 31),
            ("minsmoothscore", float, 33),
            ("smoothscoreceil", float, 35),
            ("smoothwindow", int, 37),
            ("sueff", float, 39),
        )
        for option, convert, index in advanced:
            setattr(cline, option, convert(self.boxList[index].GetValue()))

    return str(cline)
def quickAlign(refseq, testseq, maxiters=None, diags=None, gapopen=None):
    """Align a reference and a test sequence with MUSCLE.

    Gap characters ("-") are stripped from both inputs, which are then fed
    to MUSCLE on stdin as a two-record FASTA with ids "ref" and "test".

    Args:
        refseq: Sequence as a string, or an object with a ``seq`` attribute
            (e.g. a SeqRecord).
        testseq: Same accepted types as ``refseq``.
        maxiters: Optional value for the MUSCLE ``maxiters`` option.
        diags: Optional value for the MUSCLE ``diags`` option.
        gapopen: Optional value for the MUSCLE ``gapopen`` option.

    Returns:
        dict: Record id -> aligned sequence string, parsed from MUSCLE's
        FASTA output.

    Raises:
        SystemExit: If an input is neither a string nor has a ``seq``
            attribute.
    """

    def ungapped(seq):
        # sanity check: accept a plain string, otherwise fall back to .seq
        try:
            return re.sub("-", "", seq)
        except TypeError:
            # not a string, probably a SeqRecord
            try:
                return re.sub("-", "", str(seq.seq))
            except AttributeError:
                # give up
                sys.exit(
                    "quickAlign() requires inputs to be either strings or SeqRecord objects"
                )

    refseq = ungapped(refseq)
    testseq = ungapped(testseq)

    data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

    muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
    if maxiters is not None:
        muscle_cline.maxiters = maxiters
    if diags is not None:
        # Use the ``diags`` argument; the previous code read an undefined
        # name and raised NameError whenever this option was supplied.
        muscle_cline.diags = diags
    if gapopen is not None:
        muscle_cline.gapopen = gapopen

    stdout, stderr = muscle_cline(stdin=data)

    return {p.id: str(p.seq) for p in SeqIO.parse(StringIO(stdout), "fasta")}
def quickAlign(refseq, testseq, maxiters=None, diags=None, gapopen=None):
    """Align a reference and a test sequence with MUSCLE.

    Both inputs are converted to strings and stripped of gap characters
    ("-"), then fed to MUSCLE on stdin as a two-record FASTA with ids
    "ref" and "test".

    Args:
        refseq: Sequence as a string or any object whose ``str()`` is the
            sequence; objects with a ``seq`` attribute (e.g. a SeqRecord)
            are unwrapped to that attribute first.
        testseq: Same accepted types as ``refseq``.
        maxiters: Optional value for the MUSCLE ``maxiters`` option.
        diags: Optional value for the MUSCLE ``diags`` option.
        gapopen: Optional value for the MUSCLE ``gapopen`` option.

    Returns:
        dict: Record id -> aligned sequence string, parsed from MUSCLE's
        FASTA output.
    """
    # sanity check: unwrap record-like inputs so str() yields the sequence
    # itself rather than a textual summary of the record.
    refseq = re.sub("-", "", str(getattr(refseq, "seq", refseq)))
    testseq = re.sub("-", "", str(getattr(testseq, "seq", testseq)))

    data = ">ref\n%s\n>test\n%s\n" % (refseq, testseq)

    muscle_cline = MuscleCommandline(cmd=muscle, quiet=True)
    if maxiters is not None:
        muscle_cline.maxiters = maxiters
    if diags is not None:
        # Use the ``diags`` argument; the previous code read an undefined
        # name and raised NameError whenever this option was supplied.
        muscle_cline.diags = diags
    if gapopen is not None:
        muscle_cline.gapopen = gapopen

    stdout, stderr = muscle_cline(stdin=data)

    return {p.id: str(p.seq) for p in SeqIO.parse(StringIO(stdout), "fasta")}
def buildGSSP(vgene):
    """Build substitution-profile rows for one V gene.

    For each requested profile, the germline sequences plus either all or a
    random subset of ``masterList[vgene]`` are aligned with MUSCLE, columns
    that are gaps in a germline row are removed, and a position-specific
    score matrix is turned into one output row per alignment position.

    Args:
        vgene: Key into the module-level ``masterList``, ``germList`` and
            ``mask`` mappings.

    Returns:
        list: One list per (profile, position): ``[vgene, profile number,
        position, germline residue(s), overall frequency or "None"]``
        followed by one formatted frequency per entry of ``aa_list``.
        Empty if the gene is skipped.
    """
    results = []

    if len(masterList[vgene]) < arguments["--numSequences"]:
        print("Skipping %s, not enough sequences (%d)..." % (vgene, len(masterList[vgene])))
        return []

    if vgene not in germList:
        print("Skipping %s, it's not in the germline database..." % vgene)
        return []

    # Take random overlapping subsets to generate multiple profiles
    # need to add back a sanity check for capping the number of subsets if there's not enough raw data.
    numProfiles = arguments['--profiles']
    if arguments["--profiles"] == 0:
        numProfiles = 1

    germlines = germList[vgene]
    germ_ids = {g.id for g in germlines}  # built once instead of per alignment row
    tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)

    success = 0
    for i in range(numProfiles):
        seqs = list(germlines)  # force a copy rather than an alias
        if arguments["--profiles"] == 0:
            seqs += list(masterList[vgene])
        else:
            # get our sequence subset and add it to the germlines
            seqs += list(numpy.random.choice(masterList[vgene],
                                             size=arguments["--numSequences"],
                                             replace=False))

        # write them to a temporary file for alignment
        with open("%s.fa" % tempFile, "w") as temp:
            SeqIO.write(seqs, temp, "fasta")

        muscle_cline = MuscleCommandline(cmd=muscle, input="%s.fa" % tempFile,
                                         out="%s.aln" % tempFile)

        # try to speed up the process a little bit for large datasets
        # still going to max out at ~50k seqs per profile (probably)
        muscle_cline.maxiters = 2
        muscle_cline.diags = True

        try:
            stdout, stderr = muscle_cline()
        except Exception:
            # Skip this profile on alignment failure, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("Error in alignment #%d for %s (skipping)" % (i + 1, vgene))
            for f in glob.glob("%s.*" % tempFile):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln" % tempFile, "fasta")
        success += 1

        # Input order is not maintained, so we need a little
        # kludge to find a germline sequence. Use the
        # first one to remove any insertions from the alignment
        germRow = 0
        for n, rec in enumerate(alignment):
            if rec.id in germ_ids:
                germRow = n
                break

        # look for gaps one at a time so we don't get tripped up by shifting indices
        gap = re.search("-+", str(alignment[germRow].seq))
        while gap:
            alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
            gap = re.search("-+", str(alignment[germRow].seq))

        # Now we get BioPython to make a PSSM for us. To convert that into
        # a mutability profile, we will delete the germline residue[s]
        # at each position (but save what they were)
        germRes = defaultdict(Counter)
        summary_align = AlignInfo.SummaryInfo(alignment)
        pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=['-', 'X'])

        # get number of datapoints at each position (might be different than
        # the number of sequences in the profile if there are gaps or missing
        # data): sum(pos.values()) after ignoring missing data (previous line)
        # but before dumping germline residues.
        denominator = [sum(pos.values()) - len(germlines) for pos in pssm]

        for germ in germlines:
            for pos, residue in enumerate(germ):
                if residue == "X":
                    continue
                germRes[pos][residue] += 1
                pssm[pos][residue] = 0

        # normalize and save
        for p, pos in enumerate(pssm):
            germAA = ",".join([x[0] for x in germRes[p].most_common()])
            total = sum(pos.values())  # non-germline count, reused below
            if p < mask[vgene] or denominator[p] < arguments["--numSequences"]:
                overall = "None"
            else:
                overall = "%.5f" % (total / denominator[p])
            per_residue = ["%.5f" % (pos.get(r, 0) / total) if total > 0 else "0.00"
                           for r in aa_list]
            results.append([vgene, i + 1, p + 1, germAA, overall] + per_residue)

        # clean up
        for f in glob.glob("%s.*" % tempFile):
            os.remove(f)

    print("Successfully built %d/%d profiles for %s using %d sequences!" %
          (success, numProfiles, vgene, len(seqs) - len(germList[vgene])))

    return results
def buildGSSP(vgene):
    """Build substitution-profile rows for one V gene.

    For each requested profile, the germline sequences plus either all or a
    random subset of ``masterList[vgene]`` are aligned with MUSCLE, columns
    that are gaps in a germline row are removed, and a position-specific
    score matrix is turned into one output row per alignment position.

    Args:
        vgene: Key into the module-level ``masterList``, ``germList`` and
            ``mask`` mappings.

    Returns:
        list: One list per (profile, position): ``[vgene, profile number,
        position, germline residue(s), overall frequency or "None"]``
        followed by one formatted frequency per entry of ``aa_list``.
        Empty if the gene is skipped.
    """
    results = []

    if len(masterList[vgene]) < arguments["--numSequences"]:
        print("Skipping %s, not enough sequences (%d)..." % (vgene, len(masterList[vgene])))
        return []

    if vgene not in germList:
        print("Skipping %s, it's not in the germline database..." % vgene)
        return []

    # Take random overlapping subsets to generate multiple profiles
    # need to add back a sanity check for capping the number of subsets if there's not enough raw data.
    numProfiles = arguments['--profiles']
    if arguments["--profiles"] == 0:
        numProfiles = 1

    germlines = germList[vgene]
    germ_ids = {g.id for g in germlines}  # built once instead of per alignment row
    tempFile = "%s/work/mGSSP/%s_profileBuilder" % (prj_tree.home, vgene)

    success = 0
    for i in range(numProfiles):
        seqs = list(germlines)  # force a copy rather than an alias
        if arguments["--profiles"] == 0:
            seqs += list(masterList[vgene])
        else:
            # get our sequence subset and add it to the germlines
            seqs += list(numpy.random.choice(masterList[vgene],
                                             size=arguments["--numSequences"],
                                             replace=False))

        # write them to a temporary file for alignment
        with open("%s.fa" % tempFile, "w") as temp:
            SeqIO.write(seqs, temp, "fasta")

        muscle_cline = MuscleCommandline(cmd=muscle, input="%s.fa" % tempFile,
                                         out="%s.aln" % tempFile)

        # try to speed up the process a little bit for large datasets
        # still going to max out at ~50k seqs per profile (probably)
        muscle_cline.maxiters = 2
        muscle_cline.diags = True

        try:
            stdout, stderr = muscle_cline()
        except Exception:
            # Skip this profile on alignment failure, but let
            # KeyboardInterrupt / SystemExit propagate.
            print("Error in alignment #%d for %s (skipping)" % (i + 1, vgene))
            for f in glob.glob("%s.*" % tempFile):
                os.remove(f)
            continue

        alignment = AlignIO.read("%s.aln" % tempFile, "fasta")
        success += 1

        # Input order is not maintained, so we need a little
        # kludge to find a germline sequence. Use the
        # first one to remove any insertions from the alignment
        germRow = 0
        for n, rec in enumerate(alignment):
            if rec.id in germ_ids:
                germRow = n
                break

        # look for gaps one at a time so we don't get tripped up by shifting indices
        gap = re.search("-+", str(alignment[germRow].seq))
        while gap:
            alignment = alignment[:, 0:gap.start()] + alignment[:, gap.end():]
            gap = re.search("-+", str(alignment[germRow].seq))

        # Now we get BioPython to make a PSSM for us. To convert that into
        # a mutability profile, we will delete the germline residue[s]
        # at each position (but save what they were)
        germRes = defaultdict(Counter)
        summary_align = AlignInfo.SummaryInfo(alignment)
        pssm = summary_align.pos_specific_score_matrix(chars_to_ignore=['-', 'X'])

        # get number of datapoints at each position (might be different than
        # the number of sequences in the profile if there are gaps or missing
        # data): sum(pos.values()) after ignoring missing data (previous line)
        # but before dumping germline residues.
        denominator = [sum(pos.values()) - len(germlines) for pos in pssm]

        for germ in germlines:
            for pos, residue in enumerate(germ):
                if residue == "X":
                    continue
                germRes[pos][residue] += 1
                pssm[pos][residue] = 0

        # normalize and save
        for p, pos in enumerate(pssm):
            germAA = ",".join([x[0] for x in germRes[p].most_common()])
            total = sum(pos.values())  # non-germline count, reused below
            if p < mask[vgene] or denominator[p] < arguments["--numSequences"]:
                overall = "None"
            else:
                overall = "%.5f" % (total / denominator[p])
            per_residue = ["%.5f" % (pos.get(r, 0) / total) if total > 0 else "0.00"
                           for r in aa_list]
            results.append([vgene, i + 1, p + 1, germAA, overall] + per_residue)

        # clean up
        for f in glob.glob("%s.*" % tempFile):
            os.remove(f)

    print("Successfully built %d/%d profiles for %s using %d sequences!" %
          (success, numProfiles, vgene, len(seqs) - len(germList[vgene])))

    return results
def main():
    """Optionally align the sequences, then run DNAML and restore names.

    Steps, all driven by module-level settings (``prj_tree``, ``prj_name``,
    ``force``, ``doAlign``, ``germ_seq``, ``natives``, ``DNAML``):

    1. Refuse to run (or delete them when ``force`` is set) if DNAML
       working files already exist.
    2. When ``doAlign`` is set, append the germline and native sequences to
       a copy of the collected sequences, align with MUSCLE, and point
       ``inFile`` at the result.
    3. Rename every sequence to a 10-digit index (recorded in ``lookup``)
       and write the alignment as PHYLIP to ``infile``.
    4. Run DNAML from the phylo directory with a generated answer script.
    5. Substitute the original names back into the tree and output file
       via ``revertName`` and remove DNAML's working files.

    Raises:
        SystemExit: If old files exist without ``force``, or the custom
            input alignment cannot be read.
    """
    global inFile, lookup

    phylo = prj_tree.phylo

    oldFiles = (glob.glob("%s/infile" % phylo)
                + glob.glob("%s/outtree" % phylo)
                + glob.glob("%s/outfile" % phylo))
    if len(oldFiles) > 0:
        if force:
            for f in oldFiles:
                os.remove(f)
        else:
            sys.exit("Old files exist! Please use the -f flag to force overwrite.")

    if doAlign:
        to_align = "%s/%s_to_align.fa" % (phylo, prj_name)
        aligned = "%s/%s_aligned.afa" % (phylo, prj_name)

        # first create a working file to align and add the germline and natives
        shutil.copyfile("%s/%s-collected.fa" % (prj_tree.nt, prj_name), to_align)
        with open(to_align, "a") as handle:  # closed even if a write fails
            handle.write(">%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(input=to_align, out=aligned)
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        print(run_muscle)
        run_muscle()

        # change inFile variable so that remaining code is the same for both cases
        inFile = aligned

    # open the alignment to rename everything and find germline sequence
    # rename is to avoid possible errors with DNAML from sequence ids that are too long
    germ_pos = 1
    with open(inFile, "rU") as handle:
        if doAlign:
            aln = AlignIO.read(handle, "fasta")
        else:
            try:
                aln = AlignIO.read(handle, "phylip")
            except Exception:
                sys.exit("Please make sure custom input is aligned and in PHYLIP format")

    lookup = []
    for seq in aln:
        lookup.append(seq.id)
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id) is not None:
            germ_pos = len(lookup)
        seq.id = "%010d" % len(lookup)

    with open("%s/infile" % phylo, "w") as output:
        AlignIO.write(aln, output, "phylip")

    # now generate script for DNAML
    # J is "jumble" followed by random seed and number of times to repeat
    # O is outgroup root, followed by position of the germline in the alignment
    # 5 tells DNAML to do the ancestor inference
    # Y starts the run
    with open("%s/dnaml.in" % phylo, "w") as handle:
        seed = random.randint(0, 10 ** 10) * 2 + 1  # seed must be odd
        handle.write("J\n%d\n3\nO\n%d\n5\nY\n" % (seed, germ_pos))

    # change to work directory so DNAML finds "infile" and puts the output
    # where we expect, then restore the caller's directory afterwards
    origWD = os.getcwd()
    os.chdir(phylo)
    try:
        with open("%s/dnaml.in" % phylo, "rU") as pipe:
            subprocess.call([DNAML], stdin=pipe)
    finally:
        os.chdir(origWD)

    # revert names in tree
    with open("%s/outtree" % phylo, "rU") as intree:
        mytree = intree.read()
    fixedtree = re.sub(r"\d{10}", revertName, mytree)
    with open("%s/%s.tree" % (prj_tree.out, prj_name), "w") as outtree:
        outtree.write(fixedtree)

    # revert names in out file
    with open("%s/outfile" % phylo, "rU") as instuff:
        mystuff = instuff.read()
    fixedstuff = re.sub(r"\d{10}", revertName, mystuff)
    with open("%s/%s.dnaml.out" % (prj_tree.logs, prj_name), "w") as outstuff:
        outstuff.write(fixedstuff)

    # clean up (full paths, since the working directory has been restored)
    for leftover in ("infile", "outfile", "outtree"):
        os.remove("%s/%s" % (phylo, leftover))
def _run_igphyml(model_args, opts):
    """Run IgPhyML, retrying with the unoptimized build if libraries are missing.

    Args:
        model_args: Command-line arguments that follow the executable.
        opts: Extra options; the first two entries are the threading option
            and its value, which are dropped for the fallback executable.

    Raises:
        SystemExit: If the last run wrote to stderr or returned non-zero.
    """
    s = subprocess.Popen([igphyml] + model_args + opts,
                         universal_newlines=True, stderr=subprocess.PIPE)
    _, e = s.communicate()

    if re.search("error while loading shared libraries", str(e)):
        # Some libraries needed for optimized execution are missing
        # Try again with a version compiled without optimizations
        s = subprocess.Popen([igphyml_slow] + model_args + opts[2:],  # no threading option available
                             universal_newlines=True, stderr=subprocess.PIPE)
        _, e = s.communicate()

    if e != "" or s.returncode != 0:
        sys.exit("Error running '%s':\n%sExit code %d" % (" ".join(s.args), e, s.returncode))


def main():
    """Optionally align the sequences, then build a tree with IgPhyML.

    Steps, all driven by ``arguments`` and module-level settings:

    1. Refuse to run (or delete them when ``-f`` is set) if previous
       outputs exist.
    2. When ``-v`` is given, append the germline and native sequences to a
       copy of ``--seqs``, align with MUSCLE and use the result as ``-i``.
    3. Read the alignment, trim it to a multiple of three columns, clean up
       sequence ids, record gap positions and determine the root sequence.
    4. Run IgPhyML twice (GY94 initial tree, then HLP17 refinement).
    5. Unless ``--noAnc`` is set, run the ancestor-reconstruction script
       and write inferred ancestors, re-inserting gaps when any were seen.
    6. Move the IgPhyML stats and tree files to their final locations.

    Raises:
        SystemExit: On pre-existing outputs without ``-f``, an unreadable
            alignment, a missing root/germline sequence, or a failing
            external program.
    """
    phylo = prj_tree.phylo
    infile = "%s/infile" % phylo

    oldFiles = (glob.glob(infile)
                + glob.glob("%s/%s_igphyml.tree" % (prj_tree.out, prj_name))
                + glob.glob("%s/%s_igphyml_stats.txt" % (prj_tree.logs, prj_name)))
    if len(oldFiles) > 0:
        if arguments['-f']:
            for f in oldFiles:
                os.remove(f)
        else:
            sys.exit("Old files exist! Please use the -f flag to force overwrite.")

    if arguments['-v'] is not None:
        # do alignment
        to_align = "%s/%s_to_align.fa" % (phylo, prj_name)
        aligned = "%s/%s_aligned.afa" % (phylo, prj_name)

        # first create a working file to align and add the germline and natives
        shutil.copyfile(arguments['--seqs'], to_align)
        with open(to_align, "a") as handle:  # closed even if a write fails
            handle.write("\n>%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(cmd=muscle, input=to_align, out=aligned)
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        print(run_muscle)
        run_muscle()

        # this is probably bad form
        arguments['-i'] = aligned

    # open the alignment to rename everything and find germline sequence
    with open(arguments['-i'], "r") as handle:
        try:
            aln = AlignIO.read(handle, arguments['--format'])
        except Exception:
            sys.exit("Couldn't read alignment: is %s the correct format?" % arguments['--format'])

    align_len = aln.get_alignment_length()
    extra = align_len % 3
    if extra > 0:
        print("Trimming alignment to even codon length...", file=sys.stderr)
        aln = aln[:, 0:-extra]
        align_len -= extra

    # kill the fasta def line and any usearch/vsearch annotations to avoid formatting foul-ups
    germ_id = ""
    foundRoot = False
    gaps = defaultdict(list)
    for seq in aln:
        seq.id = re.sub("[;:].*", "", seq.id)
        seq.description = ""
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id, re.I) is not None:
            germ_id = seq.id
        if arguments['--root'] is not None and seq.id == arguments['--root']:
            foundRoot = True
        for g in re.finditer("-+", str(seq.seq)):
            # save gap. value is a field to help me determine what's real in assignGaps
            gaps[seq.id.upper()].append({'start': g.start(), 'end': g.end(), 'value': 1})

    if arguments['--root'] is not None:
        germ_id = arguments['--root']
        if not foundRoot:
            sys.exit("Couldn't find specified root sequence %s in input file" % arguments['--root'])
    elif germ_id == "":
        sys.exit("Couldn't find a germline gene in the alignment, please use the --root option and try again.")

    with open(infile, "w") as output:
        AlignIO.write(aln, output, "fasta")

    # now call IgPhyML
    # fast initial tree
    opts = ["--threads", arguments['--threads']]
    if not arguments['--quick']:
        opts += ["-s", "SPR"]
    if arguments['--seed'] is not None:
        opts += ["--r_seed", arguments['--seed']]

    # set an environmental variable so that IgPhyML can find its libraries
    os.environ.update({'IGPHYML_PATH': '%s/third-party/src/motifs' % SCRIPT_FOLDER})

    _run_igphyml(["-i", infile, "-m", "GY", "-w", "MO", "-t", "e", "--run_id", "gy94"], opts)

    # Refine tree with AID-specific hotspot motifs
    opts = ["--threads", arguments['--threads']]
    if arguments['--quick']:
        opts += ['-o', 'lr']
    else:
        opts += ['-o', 'tlr']
    if arguments['--seed'] is not None:
        opts += ["--r_seed", arguments['--seed']]

    _run_igphyml(["-i", infile,
                  "-m", "HLP17",
                  "--root", germ_id,
                  "-u", "%s/infile_igphyml_tree.txt_gy94" % phylo,
                  "--motifs", "FCH",
                  "--run_id", "hlp17",
                  "--ambigfile", "%s/ambigfile.txt" % phylo],
                 opts)

    if not arguments['--noAnc']:
        # now need to set up a config file for ancestor reconstruction
        with open("%s/ar.config" % phylo, "w") as handle:
            # align_len was trimmed to a multiple of 3 above, so this is exact
            handle.write("length\t%d\n" % (align_len // 3))
            handle.write("rooted\t1\noutdir\t%s\n" % phylo)
            handle.write("seqfile\t%s/infile\n" % phylo)
            handle.write("rootid\t%s\n" % germ_id)
            handle.write("igphyml\t%s/%s\n" % (SCRIPT_FOLDER, "third-party"))
            handle.write("stats\t%s/infile_igphyml_stats.txt_hlp17\n" % phylo)
            handle.write("tree\t%s/infile_igphyml_tree.txt_hlp17\n" % phylo)
            handle.write("ambigfile\t%s/ambigfile.txt\n" % phylo)
            handle.write("stem\t%s\n" % prj_name)

        s = subprocess.Popen(["perl", "-I", "%s/third-party" % SCRIPT_FOLDER,
                              reconstruct, "%s/ar.config" % phylo],
                             universal_newlines=True, stderr=subprocess.PIPE)
        _, e = s.communicate()
        if e != "" or s.returncode != 0:
            sys.exit("Error running '%s':\n%sExit code %d" % (" ".join(s.args), e, s.returncode))

        if len(gaps) > 0:
            # fix ancestor inference by putting gaps back in
            # start by reading in inferred sequences and reconstructing the tree
            tree = dict()
            stack = list()
            seqDict = OrderedDict()
            with open("%s/%s.MLcodons.fa" % (phylo, prj_name), "r") as infer:
                for seq in SeqIO.parse(infer, "fasta"):
                    name = seq.id.split(";")[1]
                    seqDict[name] = seq
                    if "," in name:
                        # internal node: the two most recent stack entries are its children
                        tree[name] = {'id': name, 'children': stack[-2:]}
                        tree[stack.pop()]['parent'] = name
                        tree[stack.pop()]['parent'] = name
                    else:
                        tree[name] = {'id': name, 'children': []}
                    stack.append(name)

            # now iterate down tree to propagate gaps
            assignGaps(stack[0], tree, gaps)

            # do output
            with open("%s/%s_inferredAncestors.fa" % (prj_tree.nt, prj_name), "w") as handle:
                SeqIO.write(getFinalSeqs(seqDict, gaps), handle, "fasta")
            with open("%s/%s_inferredAncestors.fa" % (prj_tree.aa, prj_name), "w") as handle:
                SeqIO.write(getFinalSeqs(seqDict, gaps, trans=True), handle, "fasta")
        else:
            os.rename("%s/%s.MLcodons.fa" % (phylo, prj_name),
                      "%s/%s_inferredAncestors.fa" % (prj_tree.nt, prj_name))
            os.rename("%s/%s.MLaas.fa" % (phylo, prj_name),
                      "%s/%s_inferredAncestors.fa" % (prj_tree.aa, prj_name))

    # move non-sequence outputs to logical places
    os.rename("%s/infile_igphyml_stats.txt_hlp17" % phylo,
              "%s/%s_igphyml_stats.txt" % (prj_tree.logs, prj_name))
    os.rename("%s/infile_igphyml_tree.txt_hlp17" % phylo,
              "%s/%s_igphyml.tree" % (prj_tree.out, prj_name))
def main():
    """Optionally align the sequences, then run DNAML and restore names.

    Steps, all driven by module-level settings (``workDir``, ``seqFile``,
    ``outTreeFile``, ``outFile``, ``prj_name``, ``force``, ``doAlign``,
    ``germ_seq``, ``natives``, ``dnaml``):

    1. Refuse to run (or delete them when ``force`` is set) if DNAML
       working files already exist in ``workDir``.
    2. When ``doAlign`` is set, append the germline and native sequences to
       a copy of ``seqFile``, align with MUSCLE, and point ``inFile`` at
       the result.
    3. Rename every sequence to a 10-digit index (recorded in ``lookup``)
       and write the alignment as PHYLIP to ``infile``.
    4. Run DNAML from ``workDir`` with a generated answer script.
    5. Substitute the original names back into the tree and output file
       via ``revertName`` and report where the results were written.

    Raises:
        SystemExit: If old files exist without ``force``, or the custom
            input alignment cannot be read.
    """
    global inFile, lookup, workDir, outTreeFile, outFile, seqFile

    oldFiles = (glob.glob("%s/infile" % workDir)
                + glob.glob("%s/outtree" % workDir)
                + glob.glob("%s/outfile" % workDir))
    if len(oldFiles) > 0:
        if force:
            for f in oldFiles:
                os.remove(f)
        else:
            sys.exit("Old files exist! Please use the -f flag to force overwrite.")

    if doAlign:
        to_align = "%s/%s_to_align.fa" % (workDir, prj_name)
        # Write the alignment where it is read back below; previously MUSCLE
        # wrote under prj_tree.phylo while inFile pointed into workDir.
        aligned = "%s/%s_aligned.afa" % (workDir, prj_name)

        # first create a working file to align and add the germline and natives
        shutil.copyfile(seqFile, to_align)
        with open(to_align, "a") as handle:  # closed even if a write fails
            handle.write(">%s\n%s\n" % (germ_seq.id, germ_seq.seq))
            for n in natives.values():
                handle.write(">%s\n%s\n" % (n.id, n.seq))

        # now run muscle
        run_muscle = MuscleCommandline(input=to_align, out=aligned)
        run_muscle.maxiters = 2
        run_muscle.diags = True
        run_muscle.gapopen = -5000.0  # code requires a float
        print(run_muscle)
        run_muscle()

        inFile = aligned

    # open the alignment to rename everything and find germline sequence
    # rename is to avoid possible errors with DNAML from sequence ids that are too long
    germ_pos = 1
    with open(inFile, "rU") as handle:
        if doAlign:
            aln = AlignIO.read(handle, "fasta")
        else:
            try:
                aln = AlignIO.read(handle, "phylip-relaxed")
            except Exception:
                sys.exit("Please make sure custom input is aligned and in PHYLIP format...")

    lookup = []
    for seq in aln:
        lookup.append(seq.id)
        if re.search("(IG|VH|VK|VL|HV|KV|LV)", seq.id) is not None:
            germ_pos = len(lookup)
        seq.id = "%010d" % len(lookup)

    with open("%s/infile" % workDir, "w") as output:
        AlignIO.write(aln, output, "phylip")

    # now generate script for DNAML
    # J is "jumble" followed by random seed and number of times to repeat
    # O is outgroup root, followed by position of the germline in the alignment
    # 5 tells DNAML to do the ancestor inference
    # Y starts the run
    with open("%s/dnaml.in" % workDir, "w") as handle:
        seed = random.randint(0, 10 ** 10) * 2 + 1  # seed must be odd
        handle.write("J\n%d\n5\nG\nO\n%d\n5\nY\n" % (seed, germ_pos))

    # change to work directory so DNAML finds "infile" and puts the output
    # where we expect; always restore the original directory afterwards
    origWD = os.getcwd()
    os.chdir(workDir)
    try:
        with open("dnaml.in", "rU") as pipe:
            subprocess.call([dnaml], stdin=pipe)
    finally:
        os.chdir(origWD)

    # revert names in tree
    with open("%s/outtree" % workDir, "rU") as intree:
        mytree = intree.read()
    fixedtree = re.sub(r"\d{10}", revertName, mytree)
    with open(outTreeFile, "w") as outtree:
        outtree.write(fixedtree)

    # revert names in out file
    with open("%s/outfile" % workDir, "rU") as instuff:
        mystuff = instuff.read()
    fixedstuff = re.sub(r"\d{10}", revertName, mystuff)
    with open(outFile, "w") as outstuff:
        outstuff.write(fixedstuff)

    print("\nOutput in %s and %s\n" % (outTreeFile, outFile))