def Convert(input, output, filename):
    """
    File format conversion program (fasta, strict-phylip, sequential-phylip,
    relaxed-phylip and nexus).

    The working directory is changed to the parent directory before any file
    is opened. The converted file is named after the text that precedes the
    first '.' in filename, followed by the extension of the output format.
    When the generic conversion fails, "Bad Alignment" is printed and the
    function returns without reporting an output file.

    @parameter input - Input file format.
    @parameter output - Output file format.
    @parameter filename - Input filename.
    """
    formDict = {
        'fasta': '*.fas',
        'nexus': '*.nex',
        'phylip': '*.phy',
        'phylip-sequential': '*.phy',
        'phylip-relaxed': '*.phy'
    }

    # Callers rely on this side effect: all paths below are relative to the
    # parent of the directory the program was in when Convert was called.
    os.chdir('..')

    baseName = filename.split('.')[0]

    if input == 'fasta' and output == 'nexus':
        # Dedicated route: the alignment is read with an explicit gapped
        # protein alphabet before being formatted as nexus.
        with open(filename) as handle:
            alignment = AlignIO.read(handle, "fasta",
                                     alphabet=Gapped(IUPAC.protein))
        outName = baseName + '.nex'
        with open(outName, 'w') as g:
            g.write(alignment.format("nexus"))
    else:
        try:
            # Plain 'r' replaces the 'rU' mode, which Python 3.11+ rejects.
            with open(filename, 'r') as handle:
                record = list(SeqIO.parse(handle, input))
            outName = baseName + '.' + formDict[output].split('.')[1]
            with open(outName, 'w') as fp:
                SeqIO.write(record, fp, output)
        except Exception:
            # Best-effort conversion: report the failure but let
            # KeyboardInterrupt / SystemExit propagate.
            print("Bad Alignment\n")
            return

    print("Final output saved in %s" % outName)
def nexML(self, filename):
    """
    Produces concatenated alignment file 'Results.xml' from a nexus file.

    The input is parsed completely before the output file is created, so a
    missing or unreadable input no longer leaves an empty Results.xml behind.

    NOTE(review): the records are written with the "seqxml" format name,
    although this method is called nexML - confirm which format is intended.

    @parameter filename - nexus alignment file to convert.
    """
    # Plain 'r' replaces the 'rU' mode, which Python 3.11+ rejects.
    with open(filename, 'r') as handleXML:
        recordsXML = list(SeqIO.parse(handleXML, "nexus"))

    with open('Results.xml', 'w') as fp:
        SeqIO.write(recordsXML, fp, "seqxml")
def Convert(input, output, filename):
    """
    File format conversion program (fasta, strict-phylip, sequential-phylip,
    relaxed-phylip and nexus).

    The working directory is changed to the parent directory before any file
    is opened. The converted file is named after the text that precedes the
    first '.' in filename, followed by the extension of the output format.
    When the generic conversion fails, "Bad Alignment" is printed and the
    function returns without reporting an output file.

    @parameter input - Input file format.
    @parameter output - Output file format.
    @parameter filename - Input filename.
    """
    formDict = {
        'fasta': '*.fas',
        'nexus': '*.nex',
        'phylip': '*.phy',
        'phylip-sequential': '*.phy',
        'phylip-relaxed': '*.phy'
    }

    # Callers rely on this side effect: all paths below are relative to the
    # parent of the directory the program was in when Convert was called.
    os.chdir('..')

    stem = filename.split('.')[0]

    if input == 'fasta' and output == 'nexus':
        # Dedicated route: the alignment is read with an explicit gapped
        # protein alphabet before being formatted as nexus.
        with open(filename) as source:
            alignment = AlignIO.read(source, "fasta",
                                     alphabet=Gapped(IUPAC.protein))
        target = stem + '.nex'
        with open(target, 'w') as g:
            g.write(alignment.format("nexus"))
    else:
        try:
            # Plain 'r' replaces the 'rU' mode, which Python 3.11+ rejects.
            with open(filename, 'r') as handle:
                record = list(SeqIO.parse(handle, input))
            target = stem + '.' + formDict[output].split('.')[1]
            with open(target, 'w') as fp:
                SeqIO.write(record, fp, output)
        except Exception:
            # Best-effort conversion: report the failure but let
            # KeyboardInterrupt / SystemExit propagate.
            print("Bad Alignment\n")
            return

    print("Final output saved in %s" % target)
def _cleanAli(recordNuc, omit, fileName):
    """
    Rebuilds nucleotide sequences that follow the protein alignment stored
    in 'tAligned.fas' and writes them as a nexus file under Input/.

    For every aligned protein record the first entry of recordNuc whose id
    occurs inside the protein record id is split with _spliter(..., 3)
    (presumably into codons - confirm against _spliter). The aligned protein
    is then walked residue by residue:
        '-'   adds '---' to the nucleotide sequence,
        'Z'   adds 'NNN' and skips one piece of the split sequence,
        other adds the next piece of the split sequence.
    Records for which a piece cannot be added are reported and left out of
    the output. 'translated.fas' and 'tAligned.fas' are removed at the end.

    @parameter recordNuc - nucleotide records matched to the alignment by id.
    @parameter omit - when equal to False output is <name>.nex, otherwise
                      <name>_omited.nex.
    @parameter fileName - input file name; the text before the first '.' is
                          used to name the output file.
    """
    # Read everything up front and close the handle, so the file is no longer
    # open when it is removed at the end of this function.
    with open('tAligned.fas', 'r') as handleP:
        records = list(SeqIO.parse(handleP, 'fasta'))

    store = list()  # ids of records that could not be rebuilt

    for rec in records:
        nucData = [x.seq for x in recordNuc if x.id in rec.id]
        nucSeqData = _spliter(nucData[0], 3)
        sequence = Seq("", generic_dna)
        pos = 0  # index of the next unused piece in nucSeqData

        for amino in rec.seq:
            if amino == '-':
                sequence = sequence + Seq("---", generic_dna)
            elif amino == 'Z':
                sequence = sequence + Seq("NNN", generic_dna)
                pos = pos + 1
            else:
                try:
                    sequence = sequence + nucSeqData[pos]
                    pos = pos + 1
                except Exception:
                    # Best effort: remember the record and drop it below
                    # instead of aborting the whole run.
                    if rec.id not in store:
                        store.append(rec.id)

        rec.seq = Seq(str(sequence), generic_dna)

    records = [x for x in records if x.id not in store]

    if store != []:
        print("Failed to align following sequences: %s" % store)

    # '== False' is kept on purpose so that values such as None still select
    # the "_omited" file name exactly as before.
    suffix = ".nex" if omit == False else "_omited.nex"
    with open("Input/" + fileName.split('.')[0] + suffix, 'w') as fp:
        SeqIO.write(records, fp, "nexus")

    os.remove('translated.fas')
    os.remove('tAligned.fas')
def RNAfoldConsensus(self):
    """
    Creates RNA structure data from consensus alignment using the
    RNAalifold program.

    Every '*.nex' file found in the RNAdata directory gets a matching
    clustal file '<name>.aln' written to the starting directory; each of
    these is piped into RNAalifold and the program output is collected,
    one "[ <name> ]" section per alignment, in RNAConsensus.txt.
    """
    os.chdir("RNAdata")
    try:
        fileList = glob.glob('*.nex')
        recordList = self.fileOpenConcNex()
    finally:
        # Always return to the starting directory, even when reading the
        # nexus files fails.
        os.chdir("..")

    newFileList = [name.split('.')[0] + '.aln' for name in fileList]

    for n, alnName in enumerate(newFileList):
        # Indexing (rather than zip) keeps the IndexError when fewer record
        # sets than nexus files were returned.
        with open(alnName, 'w') as file_Write:
            SeqIO.write(recordList[n], file_Write, "clustal")

    with open("RNAConsensus.txt", 'w') as fp:
        for name in newFileList:
            print("RNA structure | %s| %s" % (time.strftime("%c"), name))
            fp.write("[ %s ]\n" % name.split('.')[0])
            # The alignment is fed on stdin without going through a shell,
            # so file names with spaces or shell metacharacters are safe.
            # universal_newlines=True yields text instead of bytes, which is
            # required to write into the text-mode report on Python 3.
            with open(name, 'r') as alnHandle:
                fp.write(subprocess.check_output(["RNAalifold"],
                                                 stdin=alnHandle,
                                                 universal_newlines=True))
            fp.write('\n\n')
def RNAfoldConsensus(self):
    """
    Creates RNA structure data from consensus alignment using the
    RNAalifold program.

    Every '*.nex' file found in the RNAdata directory gets a matching
    clustal file '<name>.aln' written to the starting directory; each of
    these is piped into RNAalifold and the program output is collected,
    one "[ <name> ]" section per alignment, in RNAConsensus.txt.
    """
    os.chdir("RNAdata")
    try:
        fileList = glob.glob('*.nex')
        recordList = self.fileOpenConcNex()
    finally:
        # Always return to the starting directory, even when reading the
        # nexus files fails.
        os.chdir("..")

    newFileList = [nexName.split('.')[0] + '.aln' for nexName in fileList]

    for index, target in enumerate(newFileList):
        # Indexing (rather than zip) keeps the IndexError when fewer record
        # sets than nexus files were returned.
        with open(target, 'w') as file_Write:
            SeqIO.write(recordList[index], file_Write, "clustal")

    with open("RNAConsensus.txt", 'w') as fp:
        for name in newFileList:
            print("RNA structure | %s| %s" % (time.strftime("%c"), name))
            fp.write("[ %s ]\n" % name.split('.')[0])
            # The alignment is fed on stdin without going through a shell,
            # so file names with spaces or shell metacharacters are safe.
            # universal_newlines=True yields text instead of bytes, which is
            # required to write into the text-mode report on Python 3.
            with open(name, 'r') as alignmentIn:
                fp.write(subprocess.check_output(["RNAalifold"],
                                                 stdin=alignmentIn,
                                                 universal_newlines=True))
            fp.write('\n\n')
def alignOutput(self, combine):
    """
    alignOutput creates an output file in user defined file format

    self.file_format selects the format:
        1 - fasta, written to Result1.fasta and copied to Result.fasta with
            every "<unknown description>" tag removed
        2 - phylip            (Result.phy)
        3 - phylip-sequential (Result.phy)
        4 - phylip-relaxed    (Result.phy)
    Any other value terminates the program through sys.exit.

    @parameter combine - concatenated alignment matrix
    """
    output_format = self.file_format

    if output_format == 1:
        filecompname = "Result1.fasta"
        with open(filecompname, 'w') as file_Write:
            SeqIO.write(combine, file_Write, "fasta")

        # This section is for cleaning any unknown description tag from the
        # final fasta file
        with open(filecompname, "r") as fin, open("Result.fasta", "w") as fout:
            for line in fin:
                fout.write(line.replace("<unknown description>", ""))
        return

    if output_format == 2:
        phylipVariant = "phylip"
    elif output_format == 3:
        phylipVariant = "phylip-sequential"
    elif output_format == 4:
        phylipVariant = "phylip-relaxed"
    else:
        sys.exit("You have enetered wrong value \n Program Terminated...")

    # All phylip flavours share the same output file name.
    with open("Result.phy", 'w') as file_Write:
        SeqIO.write(combine, file_Write, phylipVariant)
def mrnaAlign(inputFile, pkg, arguments=None):
    """
    Aligns Align/<inputFile> with muscle or mafft and stores the result as
    a nexus file Input/<name>.nex.

    muscle is used when pkg is 'muscle', and also whenever no arguments are
    supplied; otherwise the bundled mafft script is run with the supplied
    arguments. The aligner writes an intermediate fasta file Input/<inputFile>
    which is converted to nexus and then removed.

    @parameter inputFile - name of the fasta file inside the Align directory.
    @parameter pkg - alignment program name.
    @parameter arguments - mafft command line options as a single string;
                           '[' and ']' characters are stripped from it.
    """
    source = "Align/" + inputFile
    aligned = "Input/" + inputFile

    if pkg != 'muscle' and arguments is None:
        pkg = 'muscle'

    if pkg == 'muscle':
        # A separate muscle binary is shipped for macOS.
        if 'Darwin' in platform.system():
            program = "./src/muscle/muscle"
        else:
            program = "./src/muscle/muscleLinux"
        command = "%s -in %s -out %s" % (program, source, aligned)
    else:
        arguments = arguments.replace('[', '').replace(']', '')
        command = "./src/mafft/mafft.bat %s %s > %s" % (arguments, source,
                                                         aligned)

    # The aligner's console output is discarded. It is sent to the null
    # device rather than to a pipe: an unread pipe can fill up and block the
    # child process forever. The shell is kept because the mafft command
    # relies on '>' redirection and on a free-form option string.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(command, shell=True, stdout=devnull,
                        stderr=subprocess.STDOUT)

    # Plain 'r' replaces the 'rU' mode, which Python 3.11+ rejects; the
    # handle is closed before the intermediate file is removed.
    with open(aligned, 'r') as handle:
        record = list(SeqIO.parse(handle, 'fasta'))

    with open("Input/" + inputFile.split('.')[0] + ".nex", 'w') as fp:
        SeqIO.write(record, fp, 'nexus')

    os.remove(aligned)
def mrnaImport(geneName, group, ortho):
    """
    Collects, for every NCBI ortholog of a gene, the longest mRNA record
    and writes them to Align/<geneName>.fas with rewritten fasta headers.

    @ geneName - name of the gene
    @ group - organism name
    @ ortho - gene symbol that is searched instead of geneName when it is
              not None
    @ creates a taxon mRNA fasta file as output for the set of genes given as input

    Raises RuntimeError when neither ortho nor group is given or when the
    first NCBI search cannot be performed, and IndexError when that search
    returns no gene id.
    """
    if ortho is None and group is None:
        # Previously this fell through with the search term unbound and was
        # reported as a network failure.
        raise RuntimeError(
            "Failed to import sequence from NCBI. "
            "Neither an ortholog symbol nor an organism group was supplied."
        )
    symbol = geneName if ortho is None else ortho
    inpTerm = symbol + "[sym] AND " + group + "[orgn]"

    Entrez.email = '*****@*****.**'

    try:
        handle = Entrez.esearch(db="gene", term=inpTerm, rettype='xml',
                                RetMax=300)
    except Exception:
        raise RuntimeError(
            "Failed to import sequence from NCBI. Check your internet connection.\nThis might also occur due to NCBI failure"
        )
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    idList = records["IdList"]
    if not idList:
        raise IndexError("No NCBI gene record matched '%s'" % inpTerm)

    # Second search: every gene in the ortholog group of the first hit.
    inpTerm = "ortholog_gene_" + str(idList[0]) + "[group]"
    handle = Entrez.esearch(db="gene", term=inpTerm, rettype='xml',
                            RetMax=300, warning=False)
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()
    idList = records["IdList"]

    outRecord = list()
    for ids in idList:
        # _xmlcreate / _xmlparser exchange data through 'export.xml'.
        _xmlcreate(ids)
        refIds = _xmlparser()
        os.remove('export.xml')

        recordList = [mrnaExt(inIDs) for inIDs in refIds]
        if not recordList:
            continue

        # max() returns the first of several equally long records, which is
        # what the former strict '>' comparison loop selected as well.
        longestRec = max(recordList, key=lambda rec: len(rec.seq))
        print("%s" % longestRec.description)
        outRecord.append(longestRec)

    fastaPath = "Align/" + geneName + '.fas'
    with open(fastaPath, 'w') as fp:
        SeqIO.write(outRecord, fp, 'fasta')

    with open(fastaPath, 'r') as fp:
        fdata = fp.readlines()

    # Rewrite each header as '>' + word + '_' + next word + '|' + first
    # field; the two words are taken one position later when the header
    # contains 'PREDICTED'.
    with open(fastaPath, 'w') as fp:
        for lines in fdata:
            if '>' not in lines:
                fp.write('%s' % lines)
                continue
            fields = lines.split(' ')
            first = 2 if 'PREDICTED' in lines else 1
            newLine = ('>' + fields[first] + '_' + fields[first + 1] + '|' +
                       fields[0].lstrip('>'))
            fp.write('%s\n' % newLine)
def _translator(recordData, ign, omit, table):
    """
    Translates every record with _translate_str and writes the protein
    sequences to 'translated.fas'.

    When ign equals False and the translation contains '*', the sequence is
    translated again after dropping its first base (padded with "N") and,
    if '*' is still present, after dropping its first two bases (padded
    with "NN"). A shifted sequence whose translation has no '*' replaces
    the sequence stored in the record. When all three translations contain
    '*' and omit equals False, the one with the fewest '*' is used. Every
    remaining '*' is written as 'Z'.

    recordsFunc is the same list object as recordData, so the records
    passed in are modified in place.

    NOTE(review): the nesting of the if/else chain below was reconstructed
    from a source whose indentation was lost - confirm the pairing of the
    else branches against the original file.

    @parameter recordData - list of nucleotide records.
    @parameter ign - when equal to False, alternative frames are tried for
                     translations containing '*'.
    @parameter omit - when equal to False, the frame with the fewest '*' is
                      chosen if all three frames contain '*'.
    @parameter table - passed unchanged to _translate_str.
    """
    proteinSeqList = list()
    recordsFunc = recordData
    for i, rec in enumerate(recordsFunc):
        # number of '*' found per frame, filled only for frames that have any
        counter = dict()
        seqT = _translate_str(str(rec.seq), table)
        if ign == False:
            if "*" in seqT:
                counter['one'] = seqT.count('*')
                # second frame: skip one base, pad with one N
                seqT = _translate_str(
                    str(rec.seq[1:len(rec.seq)] + Seq("N", generic_dna)), table)
                if "*" in seqT:
                    counter['two'] = seqT.count('*')
                    # third frame: skip two bases, pad with two N
                    seqT = _translate_str(
                        str(rec.seq[2:len(rec.seq)] + Seq("NN", generic_dna)), table)
                    if "*" in seqT:
                        counter['three'] = seqT.count('*')
                        # NOTE(review): when omit is not False, seqT stays the
                        # third-frame translation while the record sequence is
                        # left unchanged - confirm this is intended.
                        if omit == False:
                            # min() over the dict returns the frame name with
                            # the smallest count; on ties the key that min()
                            # meets first while iterating the dict wins.
                            if min(counter, key=counter.get) == 'one':
                                seqT = _translate_str(str(rec.seq), table)
                            elif min(counter, key=counter.get) == 'two':
                                seqT = _translate_str(
                                    str(rec.seq[1:len(rec.seq)] + Seq("N", generic_dna)), table)
                                recordsFunc[i].seq = recordsFunc[i].seq[
                                    1:len(rec.seq)] + Seq("N", generic_dna)
                            elif min(counter, key=counter.get) == 'three':
                                seqT = _translate_str(
                                    str(rec.seq[2:len(rec.seq)] + Seq("NN", generic_dna)), table)
                                recordsFunc[i].seq = recordsFunc[i].seq[
                                    2:len(rec.seq)] + Seq("NN", generic_dna)
                    else:
                        # third frame has no '*': keep it and store the
                        # shifted sequence in the record
                        seqT = _translate_str(
                            str(rec.seq[2:len(rec.seq)] + Seq("NN", generic_dna)), table)
                        recordsFunc[
                            i].seq = recordsFunc[i].seq[2:len(rec.seq)] + Seq(
                                "NN", generic_dna)
                else:
                    # second frame has no '*': keep it and store the shifted
                    # sequence in the record
                    seqT = _translate_str(
                        str(rec.seq[1:len(rec.seq)] + Seq("N", generic_dna)), table)
                    recordsFunc[
                        i].seq = recordsFunc[i].seq[1:len(rec.seq)] + Seq(
                            "N", generic_dna)
        else:
            pass
        # replace every remaining '*' with 'Z' (same length, so the indices
        # produced by enumerate stay valid while seqT is rebuilt)
        for j, obj in enumerate(seqT):
            if '*' in obj:
                seqT = seqT[:j] + 'Z' + seqT[j + 1:]
        proteinSeqList.append(
            SeqRecord(Seq(seqT, IUPAC.protein), id=rec.id, name=rec.name, description=rec.description))
    with open('translated.fas', 'w') as fp:
        SeqIO.write(proteinSeqList, fp, 'fasta')
    return recordsFunc