def Convert(input, output, filename):
    """
    File format conversion program (fasta, strict-phylip, sequential-phylip,
    relaxed-phylip and nexus).

    The converted file is named after the part of ``filename`` before its
    first dot, followed by the extension registered for the output format
    (.fas, .nex or .phy).

    Side effect: the working directory is changed to the parent directory
    before any file is touched, so ``filename`` is resolved relative to it.

    @parameter input - Input file format.
    @parameter output - Output file format.
    @parameter filename - Input filename.
    """
    formDict = {
        'fasta': '*.fas',
        'nexus': '*.nex',
        'phylip': '*.phy',
        'phylip-sequential': '*.phy',
        'phylip-relaxed': '*.phy'
    }

    os.chdir('..')
    stem = filename.split('.')[0]

    if input == 'fasta' and output == 'nexus':
        # fasta -> nexus is special-cased: the file is read as one alignment
        # with an explicit gapped protein alphabet.  Errors propagate here.
        outName = stem + '.nex'
        with open(filename) as handle:
            alignment = AlignIO.read(handle, "fasta",
                                     alphabet=Gapped(IUPAC.protein))
        with open(outName, 'w') as fp:
            fp.write(alignment.format("nexus"))
    else:
        try:
            outName = stem + '.' + formDict[output].split('.')[1]
            with open(filename) as handle:
                record = list(SeqIO.parse(handle, input))
            with open(outName, 'w') as fp:
                SeqIO.write(record, fp, output)
        except Exception:
            # Conversion stays best-effort, but no longer swallows
            # KeyboardInterrupt/SystemExit and no longer claims success.
            print("Bad Alignment\n")
            return

    print("Final output saved in %s" % outName)
def nexML(self, filename):
    """
    Write the records of a Nexus alignment file to 'Results.xml'.

    Despite the method name, the records are written with Biopython's
    "seqxml" writer.

    @parameter filename - Nexus file holding the concatenated alignment.
    """
    # Parse first, so a missing or malformed input can no longer leave an
    # empty, freshly truncated Results.xml behind.
    with open(filename) as handleXML:
        recordsXML = list(SeqIO.parse(handleXML, "nexus"))

    # NOTE(review): SeqIO.write is given the file name rather than a handle so
    # that it opens (and closes) the file itself with the mode its SeqXML
    # writer expects -- confirm against the Biopython release in use.
    SeqIO.write(recordsXML, 'Results.xml', "seqxml")
def fileOpenID(self):
    """
    Quick Record Import.

    Collects the record IDs of every alignment file in the current
    directory whose extension matches ``self.file_format``
    (1 fasta, 2 nexus, 3 phylip, 4 phylip-sequential, 5 phylip-relaxed).
    It can handle Nexus record objects.

    Returns - dictionary mapping the part of each file name before its
              first dot to the list of record IDs found in that file.
    """
    file_format = self.file_format
    extList = ["*.fas", "*.nex", "*.phy", "*.phy", "*.phy"]
    typeList = [
        "fasta", "nexus", "phylip", "phylip-sequential", "phylip-relaxed"
    ]
    pattern = extList[file_format - 1]
    seqType = typeList[file_format - 1]

    idsByGene = {}
    for filename in glob.glob(pattern):
        # 'with' closes the handle even when the parser raises; the removed
        # 'U' mode flag is no longer passed to open().
        with open(filename) as handle:
            idList = [record.id for record in SeqIO.parse(handle, seqType)]
        idsByGene[filename.split(".")[0]] = idList
    return idsByGene
def fileOpenID(self):
    """
    Quick Record Import.

    Collects the record IDs of every alignment file in the current
    directory whose extension matches ``self.file_format``
    (1 fasta, 2 nexus, 3 phylip, 4 phylip-sequential, 5 phylip-relaxed).
    It can handle Nexus record objects.

    Returns - dictionary mapping the part of each file name before its
              first dot to the list of record IDs found in that file.
    """
    file_format = self.file_format
    extList = ["*.fas", "*.nex", "*.phy", "*.phy", "*.phy"]
    typeList = [
        "fasta", "nexus", "phylip", "phylip-sequential", "phylip-relaxed"
    ]
    pattern = extList[file_format - 1]
    seqType = typeList[file_format - 1]

    idsByGene = {}
    for filename in glob.glob(pattern):
        # 'with' closes the handle even when the parser raises; the removed
        # 'U' mode flag is no longer passed to open().
        with open(filename) as handle:
            idList = [record.id for record in SeqIO.parse(handle, seqType)]
        idsByGene[filename.split(".")[0]] = idList
    return idsByGene
def mrnaExt(ID):
    """
    Extract sequence record for the given sequence ID.

    @parameter ID - nucleotide database identifier passed to Entrez.efetch.

    Returns - the GenBank record parsed by SeqIO.read.
    """
    recData = Entrez.efetch(db="nucleotide", id=ID, rettype="gb",
                            warning=False)
    try:
        return SeqIO.read(recData, 'genbank')
    finally:
        # The network handle used to be left open after every fetch.
        recData.close()
def _parseCdsRange(line):
    """
    Extract [start, end] from a GenBank CDS feature line.

    '<' and '>' markers around either coordinate are ignored.  Returns None
    when the line does not hold a plain 'start..end' location.
    """
    # lstrip() removes leading characters from the set {' ', 'C', 'D', 'S'},
    # i.e. the indentation and the 'CDS' feature key.
    location = line.lstrip(' CDS').split('..')
    if len(location) < 2:
        return None
    try:
        return [int(location[0].strip('<>')), int(location[1].strip('<>'))]
    except ValueError:
        return None


def cdsExt(ID, geneName):
    """
    returns sequence record object for the input gene refseq ID

    The GenBank flat file is fetched as text to locate the CDS coordinates,
    then fetched again and parsed; the returned record's sequence is trimmed
    to the CDS.  When the text contains 'LOW QUALITY PROTEIN', a line is
    appended to Align/<gene>.log.

    @parameter ID - nucleotide database identifier of the record.
    @parameter geneName - gene file name; the part before its first dot
                          names the log file.

    Raises - ValueError when no CDS line yields a usable start..end range.
    """
    textHandle = Entrez.efetch(db="nucleotide", id=ID, rettype='gb',
                               retmode='text')
    try:
        retdata = textHandle.read()
    finally:
        textHandle.close()

    with open("Align/" + geneName.split('.')[0] + ".log", "a") as fp:
        if 'LOW QUALITY PROTEIN' in retdata:
            fp.write('%s CDS is of low quality\n' % ID)

    cdsRange = None
    for obj in retdata.split('\n'):
        if ' CDS ' not in obj:
            continue
        parsed = _parseCdsRange(obj)
        if parsed is None:
            print(
                "Problem found while extracting cds from %s. Please report this issue to ambuj (at) ufl (dot) edu"
                % obj)
            continue
        # As before, the last parsable CDS line wins.
        cdsRange = parsed

    if cdsRange is None:
        # Previously this surfaced as a NameError on the slice below.
        raise ValueError(
            "No usable CDS location found in GenBank record %s" % ID)

    recData = Entrez.efetch(db="nucleotide", id=ID, rettype="gb",
                            warning=False)
    try:
        record = SeqIO.read(recData, 'genbank')
    finally:
        recData.close()

    # The parsed range is 1-based and inclusive, hence the -1 on the start.
    record.seq = record.seq[cdsRange[0] - 1:cdsRange[1]]
    return record
def Convert(input, output, filename):
    """
    File format conversion program (fasta, strict-phylip, sequential-phylip,
    relaxed-phylip and nexus).

    The converted file is named after the part of ``filename`` before its
    first dot, followed by the extension registered for the output format
    (.fas, .nex or .phy).

    Side effect: the working directory is changed to the parent directory
    before any file is touched, so ``filename`` is resolved relative to it.

    @parameter input - Input file format.
    @parameter output - Output file format.
    @parameter filename - Input filename.
    """
    formDict = {
        'fasta': '*.fas',
        'nexus': '*.nex',
        'phylip': '*.phy',
        'phylip-sequential': '*.phy',
        'phylip-relaxed': '*.phy'
    }

    os.chdir('..')
    stem = filename.split('.')[0]

    if input == 'fasta' and output == 'nexus':
        # fasta -> nexus is special-cased: the file is read as one alignment
        # with an explicit gapped protein alphabet.  Errors propagate here.
        outName = stem + '.nex'
        with open(filename) as handle:
            alignment = AlignIO.read(handle, "fasta",
                                     alphabet=Gapped(IUPAC.protein))
        with open(outName, 'w') as fp:
            fp.write(alignment.format("nexus"))
    else:
        try:
            outName = stem + '.' + formDict[output].split('.')[1]
            with open(filename) as handle:
                record = list(SeqIO.parse(handle, input))
            with open(outName, 'w') as fp:
                SeqIO.write(record, fp, output)
        except Exception:
            # Conversion stays best-effort, but no longer swallows
            # KeyboardInterrupt/SystemExit and no longer claims success.
            print("Bad Alignment\n")
            return

    print("Final output saved in %s" % outName)
def _cleanAli(recordNuc, omit, fileName):
    """
    Rebuild a nucleotide alignment from the aligned protein sequences.

    Reads the protein alignment from 'tAligned.fas' and, for every row,
    walks its residues: '-' becomes '---', 'Z' becomes 'NNN' and any other
    residue is replaced by the next chunk of the matching nucleotide record
    (chunks come from ``_spliter(seq, 3)``).  Rows that cannot be rebuilt
    are reported and left out.  The result is written in Nexus format to
    Input/<stem>.nex, or Input/<stem>_omited.nex when ``omit`` is set, and
    the temporary files 'translated.fas' and 'tAligned.fas' are removed.

    @parameter recordNuc - nucleotide records the proteins were derived from.
    @parameter omit - boolean flag selecting the "_omited" output name.
    @parameter fileName - input file name; the part before its first dot
                          names the output file.
    """
    # Read inside 'with' so the handle is closed before the file is deleted
    # at the end of this function.
    with open('tAligned.fas') as handleP:
        records = list(SeqIO.parse(handleP, 'fasta'))

    store = list()
    for rec in records:
        nucData = [x.seq for x in recordNuc if x.id in rec.id]
        if not nucData:
            # No source sequence for this row: report it as failed instead
            # of crashing with an IndexError.
            if rec.id not in store:
                store.append(rec.id)
            continue

        nucSeqData = _spliter(nucData[0], 3)
        codons = []
        pos = 0
        for amino in rec.seq:
            if amino == '-':
                codons.append("---")
            elif amino == 'Z':
                codons.append("NNN")
                pos = pos + 1
            else:
                try:
                    codons.append(str(nucSeqData[pos]))
                except (IndexError, TypeError):
                    # Ran out of nucleotide chunks: the row is dropped
                    # below, so there is no point in continuing with it.
                    if rec.id not in store:
                        store.append(rec.id)
                    break
                pos = pos + 1
        # Joining once avoids the quadratic Seq + Seq concatenation.
        rec.seq = Seq("".join(codons), generic_dna)

    records = [x for x in records if x.id not in store]
    if store:
        print("Failed to align following sequences: %s" % store)

    suffix = ".nex" if not omit else "_omited.nex"
    with open("Input/" + fileName.split('.')[0] + suffix, 'w') as fp:
        SeqIO.write(records, fp, "nexus")

    os.remove('translated.fas')
    os.remove('tAligned.fas')
def RNAfoldConsensus(self):
    """
    Creates RNA structure data from consensus alignment using the
    RNAalifold program. Output is stored in RNAConsensus.txt file

    Every Nexus file found in the RNAdata directory is rewritten in Clustal
    format as <name>.aln in the starting directory and fed to RNAalifold on
    standard input; each result is stored under a "[ name ]" header.
    """
    os.chdir("RNAdata")
    try:
        fileList = glob.glob('*.nex')
        # presumably returned in the same order as the glob above -- the
        # pairing below relies on it; verify against fileOpenConcNex.
        recordList = self.fileOpenConcNex()
    finally:
        # Always return to the starting directory, even if reading fails.
        os.chdir("..")

    newFileList = [name.split('.')[0] + '.aln' for name in fileList]
    for newName, record in zip(newFileList, recordList):
        with open(newName, 'w') as file_Write:
            SeqIO.write(record, file_Write, "clustal")

    with open("RNAConsensus.txt", 'w') as fp:
        for name in newFileList:
            print("RNA structure | %s| %s" % (time.strftime("%c"), name))
            fp.write("[ %s ]\n" % name.split('.')[0])
            # Feed the alignment on stdin without a shell, and ask for text
            # output: writing the raw bytes to a text file fails on Python 3.
            with open(name) as alnHandle:
                structure = subprocess.check_output(
                    ["RNAalifold"], stdin=alnHandle, universal_newlines=True)
            fp.write(structure)
            fp.write('\n\n')
def RNAfoldConsensus(self):
    """
    Creates RNA structure data from consensus alignment using the
    RNAalifold program. Output is stored in RNAConsensus.txt file

    Every Nexus file found in the RNAdata directory is rewritten in Clustal
    format as <name>.aln in the starting directory and fed to RNAalifold on
    standard input; each result is stored under a "[ name ]" header.
    """
    os.chdir("RNAdata")
    try:
        fileList = glob.glob('*.nex')
        # presumably returned in the same order as the glob above -- the
        # pairing below relies on it; verify against fileOpenConcNex.
        recordList = self.fileOpenConcNex()
    finally:
        # Always return to the starting directory, even if reading fails.
        os.chdir("..")

    newFileList = [name.split('.')[0] + '.aln' for name in fileList]
    for newName, record in zip(newFileList, recordList):
        with open(newName, 'w') as file_Write:
            SeqIO.write(record, file_Write, "clustal")

    with open("RNAConsensus.txt", 'w') as fp:
        for name in newFileList:
            print("RNA structure | %s| %s" % (time.strftime("%c"), name))
            fp.write("[ %s ]\n" % name.split('.')[0])
            # Feed the alignment on stdin without a shell, and ask for text
            # output: writing the raw bytes to a text file fails on Python 3.
            with open(name) as alnHandle:
                structure = subprocess.check_output(
                    ["RNAalifold"], stdin=alnHandle, universal_newlines=True)
            fp.write(structure)
            fp.write('\n\n')
def alignOutput(self, combine):
    """
    alignOutput creates an output file in user defined file format

    ``self.file_format`` selects the format:
      1 - fasta, written to Result1.fasta and then copied to Result.fasta
          with every "<unknown description>" tag removed (Result1.fasta is
          left in place);
      2 - phylip, 3 - phylip-sequential, 4 - phylip-relaxed, all written to
          Result.phy.
    Any other value terminates the program through sys.exit.

    @parameter combine - concatenated alignment matrix
    """
    output_format = self.file_format
    phylipFlavours = {
        2: "phylip",
        3: "phylip-sequential",
        4: "phylip-relaxed",
    }

    if output_format == 1:
        with open("Result1.fasta", 'w') as file_Write:
            SeqIO.write(combine, file_Write, "fasta")

        # This section is for cleaning any unknown description tag from the
        # final fasta file
        with open("Result1.fasta", "r") as fin, \
                open("Result.fasta", "w") as fout:
            for line in fin:
                fout.write(line.replace("<unknown description>", ""))
    elif output_format in phylipFlavours:
        with open("Result.phy", 'w') as file_Write:
            SeqIO.write(combine, file_Write, phylipFlavours[output_format])
    else:
        sys.exit("You have enetered wrong value \n Program Terminated...")
def mrnaAlign(inputFile, pkg, arguments=None):
    """
    Align Align/<inputFile> with MUSCLE or MAFFT and store it as Nexus.

    The aligner writes a FASTA alignment to Input/<inputFile>; that file is
    converted to Input/<stem>.nex and then removed.

    @parameter inputFile - FASTA file name inside the Align directory.
    @parameter pkg - 'muscle' runs the bundled MUSCLE binary (the macOS
                     build when platform.system() reports Darwin, the Linux
                     build otherwise); any other value runs MAFFT, unless
                     no arguments are given, in which case MUSCLE is used.
    @parameter arguments - MAFFT command line options; '[' and ']' are
                           stripped before use.
    """
    if pkg != 'muscle' and arguments is None:
        pkg = 'muscle'

    source = "Align/" + inputFile
    aligned = "Input/" + inputFile

    if pkg == 'muscle':
        if 'Darwin' in platform.system():
            binary = "./src/muscle/muscle"
        else:
            binary = "./src/muscle/muscleLinux"
        command = "%s -in %s -out %s" % (binary, source, aligned)
    else:
        arguments = arguments.replace('[', '').replace(']', '')
        command = "./src/mafft/mafft.bat %s %s > %s" % (arguments, source,
                                                        aligned)

    # The aligner's console output is discarded.  It used to be sent to a
    # PIPE that nobody read, which blocks the child once the pipe buffer
    # fills up.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(command, shell=True, stdout=devnull,
                        stderr=subprocess.STDOUT)

    # Close the handle before deleting the intermediate file.
    with open(aligned) as handle:
        record = list(SeqIO.parse(handle, 'fasta'))
    with open("Input/" + inputFile.split('.')[0] + ".nex", 'w') as fp:
        SeqIO.write(record, fp, 'nexus')
    os.remove(aligned)
def fileOpenConcNex(self):
    """
    This function creates a list of alignment records from the Nexus files
    (*.nex) found in the current working directory.

    Returns - List of alignment records (one list of records per file).
    """
    recordList = []
    for filename in glob.glob("*.nex"):
        # The handles used to be left open; 'with' closes each one as soon
        # as its records have been read.
        with open(filename) as handle:
            recordList.append(list(SeqIO.parse(handle, "nexus")))
    return recordList
def cdsAlign(inputFile, pkg='muscle', omit=False, ign=False, CT=None):
    """
    Codon-aware alignment of the CDS sequences stored in Align/<inputFile>.

    A trailing TAA, TGA or TAG is removed from every sequence, the records
    are handed to _translator, _alignP is run and _cleanAli produces the
    final output.

    @parameter inputFile - FASTA file name inside the Align directory.
    @parameter pkg - alignment package name handed to _alignP.
    @parameter omit - boolean; when set, records whose accession (second
                      '|'-separated field of the record id) starts a line
                      of Align/<stem>.log are dropped first.
    @parameter ign - boolean handed through to _translator.
    @parameter CT - codon table name; None selects the ambiguous DNA table
                    with id 1, an unknown name falls back to 'Standard'.
    """
    codonTables = [
        'Ascidian Mitochondrial', 'SGC9', 'Coelenterate Mitochondrial',
        'Protozoan Mitochondrial', 'Vertebrate Mitochondrial',
        'Plant Plastid', 'Thraustochytrium Mitochondrial',
        'Blepharisma Macronuclear', 'Mold Mitochondrial',
        'Invertebrate Mitochondrial', 'Standard', 'Trematode Mitochondrial',
        'Scenedesmus obliquus Mitochondrial', 'Euplotid Nuclear',
        'Yeast Mitochondrial', 'Spiroplasma',
        'Alternative Flatworm Mitochondrial', 'Ciliate Nuclear', 'SGC8',
        'Alternative Yeast Nuclear', 'Hexamita Nuclear', 'SGC5', 'SGC4',
        'SGC3', 'SGC2', 'SGC1', 'SGC0', 'Flatworm Mitochondrial',
        'Dasycladacean Nuclear', 'Chlorophycean Mitochondrial', 'Mycoplasma',
        'Bacterial', 'Echinoderm Mitochondrial'
    ]

    if CT is None:
        table = CodonTable.ambiguous_dna_by_id[1]
    elif CT in codonTables:
        table = CodonTable.ambiguous_generic_by_name[CT]
    else:
        table = CodonTable.ambiguous_generic_by_name['Standard']

    with open("Align/" + inputFile) as handle:
        records = list(SeqIO.parse(handle, 'fasta'))

    # Trim a terminal stop codon.  The three codons are fixed here and do
    # not depend on the codon table selected above.
    for rec in records:
        if str(rec.seq[-3:]) in ('TAA', 'TGA', 'TAG'):
            rec.seq = rec.seq[0:-3]

    if omit:
        with open("Align/" + inputFile.split('.')[0] + '.log',
                  'r') as logFile:
            badQuality = set(lines.split(' ')[0] for lines in logFile)
        records = [
            rec for rec in records
            if rec.id.split('|')[1] not in badQuality
        ]

    records = _translator(records, ign, omit, table)
    _alignP(pkg)
    _cleanAli(records, omit, inputFile)
def fileOpenConc(self):
    """
    This function is used in opening input alignment files for
    concatenation. It can handle file formats other than Nexus.

    ``self.file_format`` selects the format of the files read from the
    current directory: 1 fasta (*.fas), 2 phylip, 3 phylip-sequential,
    4 phylip-relaxed (all *.phy).

    Returns - List holding one list of records per matching file.
    """
    file_format = self.file_format
    extList = ["*.fas", "*.phy", "*.phy", "*.phy"]
    typeList = ["fasta", "phylip", "phylip-sequential", "phylip-relaxed"]
    pattern = extList[file_format - 1]
    seqType = typeList[file_format - 1]

    recordList = []
    for filename in glob.glob(pattern):
        # The handles used to be left open; 'with' closes each one as soon
        # as its records have been read.
        with open(filename) as handle:
            recordList.append(list(SeqIO.parse(handle, seqType)))
    return recordList
def fileOpenConc(self):
    """
    This function is used in opening input alignment files for
    concatenation. It can handle file formats other than Nexus.

    ``self.file_format`` selects the format of the files read from the
    current directory: 1 fasta (*.fas), 2 phylip, 3 phylip-sequential,
    4 phylip-relaxed (all *.phy).

    Returns - List holding one list of records per matching file.
    """
    file_format = self.file_format
    extList = ["*.fas", "*.phy", "*.phy", "*.phy"]
    typeList = ["fasta", "phylip", "phylip-sequential", "phylip-relaxed"]
    pattern = extList[file_format - 1]
    seqType = typeList[file_format - 1]

    recordList = []
    for filename in glob.glob(pattern):
        # The handles used to be left open; 'with' closes each one as soon
        # as its records have been read.
        with open(filename) as handle:
            recordList.append(list(SeqIO.parse(handle, seqType)))
    return recordList
def mrnaImport(geneName, group, ortho):
    """
    Download the longest mRNA of every ortholog of a gene from NCBI.

    The gene is searched in the NCBI gene database, its ortholog group is
    looked up, and for every gene in that group the longest sequence
    returned by mrnaExt is kept.  The sequences are written to
    Align/<geneName>.fas with headers rewritten to
    ">Word1_Word2|accession", where the two words are taken from the record
    description (skipping a leading 'PREDICTED' token).

    @ geneName - name of the gene
    @ group - organism name
    @ ortho - gene symbol to search instead of geneName, or None

    Raises - ValueError when no organism group is given; RuntimeError when
             NCBI cannot be queried or the gene search returns no record.
    """
    if group is None:
        # Used to surface as a misleading "check your internet connection"
        # RuntimeError (or a TypeError when ortho was given).
        raise ValueError(
            "mrnaImport needs an organism group to build the search term")

    symbol = ortho if ortho is not None else geneName
    inpTerm = symbol + "[sym] AND " + group + "[orgn]"

    Entrez.email = '*****@*****.**'

    try:
        handle = Entrez.esearch(db="gene", term=inpTerm, rettype='xml',
                                RetMax=300)
    except Exception:
        raise RuntimeError(
            "Failed to import sequence from NCBI. Check your internet connection.\nThis might also occur due to NCBI failure"
        )
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    idList = records["IdList"]
    if not idList:
        raise RuntimeError(
            "No NCBI gene record matches the search term: %s" % inpTerm)

    inpTerm = "ortholog_gene_" + str(idList[0]) + "[group]"
    handle = Entrez.esearch(db="gene", term=inpTerm, rettype='xml',
                            RetMax=300, warning=False)
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()
    idList = records["IdList"]

    outRecord = list()
    for ids in idList:
        _xmlcreate(ids)
        refIds = _xmlparser()
        os.remove('export.xml')

        recordList = [mrnaExt(inIDs) for inIDs in refIds]
        if not recordList:
            continue
        # max() keeps the first record among equally long ones, exactly as
        # the previous strict '>' comparison did.
        longestRec = max(recordList, key=lambda rec: len(rec.seq))
        print("%s" % longestRec.description)
        outRecord.append(longestRec)

    fastaName = "Align/" + geneName + '.fas'
    with open(fastaName, 'w') as fp:
        SeqIO.write(outRecord, fp, 'fasta')

    with open(fastaName, 'r') as fp:
        fdata = fp.readlines()

    with open(fastaName, 'w') as fp:
        for lines in fdata:
            if '>' not in lines:
                fp.write(lines)
                continue
            fields = lines.split(' ')
            # With a 'PREDICTED' token the two name words sit one field
            # further to the right.
            first = 2 if 'PREDICTED' in lines else 1
            newLine = '>' + fields[first] + '_' + fields[first + 1] + \
                '|' + fields[0].lstrip('>')
            fp.write('%s\n' % newLine)
def _translator(recordData, ign, omit, table):
    """
    Translate nucleotide records and write the proteins to 'translated.fas'.

    Unless ``ign`` is set, a sequence whose translation contains '*' is
    retried with its first one and then two bases removed (padded with 'N'
    at the end so the length is kept).  The first offset without '*' is
    used.  When every offset yields '*' and ``omit`` is not set, the offset
    with the fewest '*' is used (the smallest offset on ties).  Whenever a
    non-zero offset is chosen, the record's nucleotide sequence is replaced
    by the shifted one.  Remaining '*' are written as 'Z'.

    @parameter recordData - list of nucleotide records; modified in place.
    @parameter ign - boolean; when set no alternative offset is tried.
    @parameter omit - boolean; see above.
    @parameter table - codon table handed to _translate_str.

    Returns - the same list object that was passed in.
    """

    def shifted(seq, offset):
        # Drop `offset` leading bases and pad with as many N at the end.
        if offset == 0:
            return seq
        return seq[offset:] + Seq("N" * offset, generic_dna)

    proteinSeqList = list()

    for rec in recordData:
        nucSeq = rec.seq
        seqT = _translate_str(str(nucSeq), table)

        if not ign and "*" in seqT:
            # translations[k] is the translation at offset k.
            translations = [seqT]
            for offset in (1, 2):
                translations.append(
                    _translate_str(str(shifted(nucSeq, offset)), table))
                if "*" not in translations[-1]:
                    break

            seqT = translations[-1]
            if "*" not in seqT:
                # A clean offset was found: use it.
                rec.seq = shifted(nucSeq, len(translations) - 1)
            elif not omit:
                # Every offset has stops: take the one with the fewest.
                # list.index makes the smallest offset win on ties instead
                # of depending on dictionary iteration order.
                stopCounts = [t.count('*') for t in translations]
                best = stopCounts.index(min(stopCounts))
                seqT = translations[best]
                rec.seq = shifted(nucSeq, best)
            # NOTE(review): with omit set and stops at every offset, the
            # protein is the offset-2 translation while the nucleotide
            # sequence is left unshifted, as before -- confirm intended.

        seqT = seqT.replace('*', 'Z')
        proteinSeqList.append(
            SeqRecord(Seq(seqT, IUPAC.protein),
                      id=rec.id,
                      name=rec.name,
                      description=rec.description))

    with open('translated.fas', 'w') as fp:
        SeqIO.write(proteinSeqList, fp, 'fasta')

    return recordData
def RYcoding(self, file, position, msaObject):
    """
    RY-coding program: It replaces A & G to R and C & T to Y either at user
    defined positions or at all the positions. It depends on user selection.

    Record ids, names and descriptions of the result are taken, row by row,
    from the records of 'Results.nex' in the current directory.

    @parameter file - not used by this method.
    @parameter position - 'all', or the 1-based position at which recoding
                          starts; every third position from there on is
                          recoded.
    @parameter msaObject - Input multiple sequence alignment matrix data

    Return - Multiple sequence alignment object with RY coding

    Raises - ValueError when a numeric position is smaller than 1.
    """
    ryCode = {'A': 'R', 'G': 'R', 'C': 'Y', 'T': 'Y'}

    def ReplaceThird(string, position):
        # The old loop tested string[i] but overwrote string[i - 1] and
        # never reached the final position; test and recode the same base.
        bases = list(string)
        for i in range(position - 1, len(bases), 3):
            bases[i] = ryCode.get(bases[i], bases[i])
        return "".join(bases)

    def ReplaceAll(string):
        return "".join(ryCode.get(base, base) for base in string)

    with open("Results.nex") as handle:
        records = list(SeqIO.parse(handle, "nexus"))

    # One plain string per alignment row.
    seqlist = [str(row.seq) for row in msaObject]

    if position == 'all':
        newSeqList = [ReplaceAll(seqData) for seqData in seqlist]
    else:
        start = int(position)
        if start < 1:
            raise ValueError("position must be 'all' or an integer >= 1")
        newSeqList = [ReplaceThird(seqData, start) for seqData in seqlist]

    data = []
    for counter, newSeqData in enumerate(newSeqList):
        data.append(SeqRecord(Seq(newSeqData, generic_dna),
                              id=records[counter].id,
                              name=records[counter].name,
                              description=records[counter].description))

    return MultipleSeqAlignment(data)
def RYcoding(self, file, position, msaObject):
    """
    RY-coding program: It replaces A & G to R and C & T to Y either at user
    defined positions or at all the positions. It depends on user selection.

    Record ids, names and descriptions of the result are taken, row by row,
    from the records of 'Results.nex' in the current directory.

    @parameter file - not used by this method.
    @parameter position - 'all', or the 1-based position at which recoding
                          starts; every third position from there on is
                          recoded.
    @parameter msaObject - Input multiple sequence alignment matrix data

    Return - Multiple sequence alignment object with RY coding

    Raises - ValueError when a numeric position is smaller than 1.
    """
    ryCode = {'A': 'R', 'G': 'R', 'C': 'Y', 'T': 'Y'}

    def ReplaceThird(string, position):
        # The old loop tested string[i] but overwrote string[i - 1] and
        # never reached the final position; test and recode the same base.
        bases = list(string)
        for i in range(position - 1, len(bases), 3):
            bases[i] = ryCode.get(bases[i], bases[i])
        return "".join(bases)

    def ReplaceAll(string):
        return "".join(ryCode.get(base, base) for base in string)

    with open("Results.nex") as handle:
        records = list(SeqIO.parse(handle, "nexus"))

    # One plain string per alignment row.
    seqlist = [str(row.seq) for row in msaObject]

    if position == 'all':
        newSeqList = [ReplaceAll(seqData) for seqData in seqlist]
    else:
        start = int(position)
        if start < 1:
            raise ValueError("position must be 'all' or an integer >= 1")
        newSeqList = [ReplaceThird(seqData, start) for seqData in seqlist]

    data = []
    for counter, newSeqData in enumerate(newSeqList):
        data.append(SeqRecord(Seq(newSeqData, generic_dna),
                              id=records[counter].id,
                              name=records[counter].name,
                              description=records[counter].description))

    return MultipleSeqAlignment(data)