def findUnique():
    """Print each FASTA record with its essential substrings aligned beneath it.

    Every record read through ``sequenceAnalysis.FastAreader()`` is registered
    with a ``UniqueFinder`` (``addSequence`` followed by ``getPowerSet``).
    The records are then sorted by ``header[8:11]`` (called ``aa`` in the
    original code; presumably the amino-acid code of the tRNA -- confirm
    against the FASTA header format) and, for each one, the header, the
    finder's current sequence and every essential substring are printed.
    Each substring is indented with dots up to the position where it occurs.
    """
    reader = sequenceAnalysis.FastAreader()
    finder = UniqueFinder()

    # First pass: collect the records and feed every sequence to the finder.
    records = []
    for header, sequence in reader.readFasta():
        records.append((header[8:11], header, sequence))
        finder.addSequence(sequence)
        finder.getPowerSet()
    records.sort()

    # Second pass: report the essentials of each record in sorted order.
    for _, header, sequence in records:
        finder.addSequence(sequence)
        print(header)
        print(finder.sequence)

        hits = []
        for essential in finder.getEssentials():
            hits.extend(
                (essential, pos)
                for pos in allindices(finder.sequence, essential)
            )
        # Order by position so the dotted output reads left to right.
        hits.sort(key=lambda hit: hit[1])

        for substring, offset in hits:
            print("{0}{1}".format(offset * ".", substring))
def main():
    """Print sequence length, GC content and relative codon usage.

    All records yielded by ``sequenceAnalysis.FastAreader()`` are accumulated
    in one ``NucParams`` object. The report consists of:

    * total nucleotide count in Mb,
    * GC content as a percentage,
    * one line per codon of ``rnaCodonTable`` (sorted by amino acid, then
      codon) showing the codon's share of its amino acid and its raw count.

    Empty input, or an amino acid that never occurs, is reported as 0.0
    instead of raising ``ZeroDivisionError``.
    """
    # Original TODO kept: make sure to change this to use stdin.
    myReader = sequenceAnalysis.FastAreader()
    myNuc = sequenceAnalysis.NucParams()
    for head, seq in myReader.readFasta():
        myNuc.addSequence(seq)

    # Sequence length; the count is reused below for the GC percentage.
    nucCount = myNuc.nucCount()
    print("\nsequence length: {:.2f} Mb\n".format(nucCount / (10**6)))

    # GC content; missing keys count as zero rather than None.
    nucComp = myNuc.nucComposition()
    GC = nucComp.get('G', 0) + nucComp.get('C', 0)
    gcPercent = (GC / nucCount) * 100 if nucCount else 0.0
    print("GC content: {:.1f}%\n".format(gcPercent))

    # Sort codons in alpha order, by amino acid first and codon second.
    sortedCodons = sorted(myNuc.rnaCodonTable.items(),
                          key=operator.itemgetter(1, 0))

    # Relative codon usage = codon count / count of the amino acid it encodes.
    codonComp = myNuc.codonComposition()
    AAcomp = myNuc.aaComposition()
    for nuc, aa in sortedCodons:
        codonCount = codonComp.get(nuc, 0)
        aaCount = AAcomp.get(aa, 0)
        val = codonCount / aaCount if aaCount else 0.0
        print('{:s} : {:s} {:5.1f} ({:6d})'.format(
            nuc, aa, val * 100, codonCount))
    print('\r')
def main(myCommandLine=None):
    '''
    Find the ORFs of every record in the input FASTA file and write them out.

    myCommandLine -- list of command-line tokens handed to CommandLine, or
                     None to use the built-in defaults (vulgaris.fasta ->
                     vulgarisORFdata-ATG-100.txt).

    For each record the header is written to the output file, followed by one
    line per ORF (frame, start, stop, length), longest ORF first.

    The parsed arguments are honored as given. Earlier revisions overwrote
    args.inFile / args.outFile / args.start / args.minGene with hard-coded
    values after parsing, which silently ignored whatever the caller passed.
    '''
    if myCommandLine is None:
        myCommandLine = CommandLine([
            'vulgaris.fasta', 'vulgarisORFdata-ATG-100.txt', '--longestGene',
            '--start=ATG', '--minGene=100'
        ])
    else:
        myCommandLine = CommandLine(myCommandLine)
    args = myCommandLine.args

    import sequenceAnalysis as x  # imported under the short name 'x'

    tass = x.FastAreader(args.inFile)

    with open(args.outFile, 'w') as ORFData:
        for head, seq in tass.readFasta():
            # file=ORFData is what sends the text to the output file.
            print(head, file=ORFData)

            orf = x.ORF()
            # ORFfinder returns one list of ORFs per reading frame;
            # the six frame lists are flattened into a single list.
            geneData = orf.ORFfinder(seq)
            dataList = [
                gene
                for frameIndex in range(6)
                for gene in geneData[frameIndex]
            ]

            # Longest ORF first (index 3 holds the length).
            dataList.sort(key=lambda gene: gene[3], reverse=True)

            for genes in dataList:
                frame, start, stop, length = genes[:4]
                print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(
                    frame, start, stop, length), file=ORFData)

    # NOTE(review): --longestGene is only echoed to stdout here; the visible
    # code never passes it (or start / minGene) to ORFfinder.
    if args.longestGene:
        print('longestGene is', str(args.longestGene))
def __init__(self):
    """Build the two collaborators this object works with.

    ``myReader`` is an ``SA.FastAreader`` created without arguments and
    ``thisAnalyzer`` is an ``SA.NucParams``; per the original description the
    reader supplies the FASTA records and the analyzer does the counting.
    """
    reader = SA.FastAreader()
    analyzer = SA.NucParams()
    self.myReader = reader
    self.thisAnalyzer = analyzer
def main():
    """Compare GC content and codon usage of two hard-coded genomes.

    Genome 1 is read from 'testGenome.fa' and genome 2 from
    'haloVolc1_1-genes.fa'. The report prints the log2 ratio of their GC
    percentages and, for every codon of ``rnaCodonTable`` (sorted by amino
    acid, then codon), the log2 ratio of relative codon usage and the log2
    ratio of raw codon counts.

    Raises ZeroDivisionError / ValueError (from ``math.log``) when a count
    needed for a ratio is zero in either genome.
    """
    # Original TODO kept: make sure to change this to use stdin.
    myReader1 = sequenceAnalysis.FastAreader('testGenome.fa')
    myNuc1 = sequenceAnalysis.NucParams()
    for head, seq in myReader1.readFasta():
        myNuc1.addSequence(seq)

    # Original TODO kept: make sure to change this to use stdin.
    myReader2 = sequenceAnalysis.FastAreader('haloVolc1_1-genes.fa')
    myNuc2 = sequenceAnalysis.NucParams()
    for head, seq in myReader2.readFasta():
        myNuc2.addSequence(seq)

    # GC content of each genome.
    nucComp1 = myNuc1.nucComposition()
    nucComp2 = myNuc2.nucComposition()
    nucCount1 = myNuc1.nucCount()
    nucCount2 = myNuc2.nucCount()
    # Both terms of GC1 must come from genome 1; the C count used to be
    # taken from genome 2 by mistake.
    GC1 = nucComp1.get('G') + nucComp1.get('C')
    GC2 = nucComp2.get('G') + nucComp2.get('C')
    gcLog2 = math.log(((GC1 / nucCount1) * 100) /
                      ((GC2 / nucCount2) * 100), 2)
    print("GC content (log2(GC1/GC2)): {:.2f}\n".format(gcLog2))

    # Sort codons in alpha order, by amino acid first and codon second.
    sortedCodons = sorted(myNuc1.rnaCodonTable.items(),
                          key=operator.itemgetter(1, 0))

    print("codon : amino log2(usage1/usage2) (log2(count1/count2))")
    codonComp1 = myNuc1.codonComposition()
    codonComp2 = myNuc2.codonComposition()
    AAcomp1 = myNuc1.aaComposition()
    AAcomp2 = myNuc2.aaComposition()
    for nuc, aa in sortedCodons:
        # Relative usage = codon count / count of the amino acid it encodes.
        usage1 = codonComp1.get(nuc) / AAcomp1.get(aa)
        usage2 = codonComp2.get(nuc) / AAcomp2.get(aa)
        relativeComp = math.log(usage1 / usage2, 2)
        count = math.log(codonComp1.get(nuc) / codonComp2.get(nuc), 2)
        # NOTE(review): the log2 usage ratio is scaled by 100 although the
        # column heading says plain log2 -- confirm this is intended.
        print('{:s} : {:s} {:5.2f} ({:6.2f})'.format(
            nuc, aa, relativeComp * 100, count))
    print('\r')
def analyze(file):
    """Collect the composition tables for every sequence in a FASTA file.

    file -- passed unchanged to ``sequenceAnalysis.FastAreader``.

    Returns a 4-tuple ``(nucComp, codonComp, aaComp, rnaCodonDict)`` holding
    the results of ``nucComposition()``, ``codonComposition()`` and
    ``aaComposition()`` plus the sorted ``(codon, amino acid)`` items of the
    analyzer's ``rnaCodonTable``.
    """
    reader = sequenceAnalysis.FastAreader(file)
    params = sequenceAnalysis.NucParams()
    for _, seq in reader.readFasta():
        params.addSequence(seq)

    # Gather every table the caller needs for its analysis.
    return (
        params.nucComposition(),
        params.codonComposition(),
        params.aaComposition(),
        sorted(params.rnaCodonTable.items()),
    )
def main(myCommandLine=None):
    """Find ORFs in every record of the input FASTA file and write them out.

    myCommandLine -- list of command-line tokens handed to CommandLine, or
                     None to use the built-in defaults (tass2.fa ->
                     tass2ORFdata-ATG-100.txt).

    Options read from the parsed arguments:
      inFile      input FASTA file name
      outFile     output file name (overwritten on every run)
      longestGene, start, minGene -- forwarded to OrfFinder

    The output file is opened once inside a ``with`` block so that it is
    flushed and closed even if an ORF search raises.
    """
    if myCommandLine is None:
        myCommandLine = CommandLine([
            'tass2.fa', 'tass2ORFdata-ATG-100.txt', '--longestGene',
            '--start=ATG', '--minGene=100'
        ])
    else:
        myCommandLine = CommandLine(myCommandLine)
    args = myCommandLine.args

    # Parse inFile and initialize it to be read.
    orfReader = sequenceAnalysis.FastAreader(args.inFile)

    # 'w' truncates any previous output; this replaces the former
    # truncate-then-reopen-in-append sequence, which leaked the handle.
    with open(args.outFile, 'w') as f:
        # Loop through sequences in the file, find ORFs, write them to file.
        for head, seq in orfReader.readFasta():
            nucParams = sequenceAnalysis.NucParams('')
            nucParams.addSequence(seq)
            nucParams.initializeObjects()

            # Flatten the codon list into a list of single bases.
            bases = list(''.join(nucParams.codonList))
            reverseBases = getReverseComplement(bases)

            finder = OrfFinder(bases,
                               head,
                               minGene=args.minGene,
                               longestGene=args.longestGene,
                               start=args.start,
                               revSeq=reverseBases)
            finder.findOrfs(f)
            finder.findRevOrfs(f)
            finder.writeFramesToFile(f)
def __init__(self):
    """Read every tRNA record and pre-compute its power set.

    Records come from ``SA.FastAreader()`` (created without arguments). Each
    sequence is cleaned with ``self.removeCharacters`` (per the original
    comment this strips dashes and underscores) before
    ``self.powerSet`` is applied to it.

    Attributes:
        powerSetList (list): One power set per record, in input order.
        uniqueList (list): Starts empty; meant to hold the unique substring
            set of each tRNA.
        headerSequenceDictionary (dict): Maps the 0-based record index to
            ``[header, cleaned sequence]``.
    """
    self.powerSetList = []
    self.uniqueList = []
    self.headerSequenceDictionary = {}

    fastaFile = SA.FastAreader()
    for index, (header, sequence) in enumerate(fastaFile.readFasta()):
        cleaned = self.removeCharacters(sequence)
        self.headerSequenceDictionary[index] = [header, cleaned]
        self.powerSetList.append(self.powerSet(cleaned))
def __init__(self):
    """Load the tRNA records from the FASTA input and store their power sets.

    ``seqAnal.FastAreader()`` is created without arguments. Every sequence is
    passed through ``self.removeChar`` (per the original comment, to keep
    only valid characters), recorded in ``headerSequenceDictionary`` under
    its 0-based position as ``[header, filtered sequence]``, and its
    ``self.powerSet`` result is appended to ``powerSetList``.
    ``uniqueList`` starts out empty.
    """
    self.powerSetList = []
    self.uniqueList = []  # list of tRNA substrings, filled in later
    self.headerSequenceDictionary = {}

    position = 0
    for header, sequence in seqAnal.FastAreader().readFasta():
        filtered = self.removeChar(sequence)
        self.headerSequenceDictionary[position] = [header, filtered]
        subsets = self.powerSet(filtered)
        self.powerSetList.append(subsets)
        position += 1
def main(myCommandLine=None):
    """Print frame, start, stop and length of the ORFs of each FASTA record.

    myCommandLine -- list of command-line tokens handed to CommandLine, or
                     None to let CommandLine read the real command line.

    The arguments are parsed first in both cases and the ORF search then runs
    on the parsed result. Previously an explicit argument list was parsed in
    an ``else`` branch and then discarded, so nothing was ever reported for
    it.

    For every record the header is printed, followed by the ORFs longer than
    ``args.minGene``, longest first.
    """
    if myCommandLine is None:
        myCommandLine = CommandLine()
    else:
        myCommandLine = CommandLine(myCommandLine)
    args = myCommandLine.args

    # As in the original, the report is only produced when the longestGene
    # flag is set; without it the function prints nothing.
    if not args.longestGene:
        return

    fastaFile = sequenceAnalysis.FastAreader()
    for header, sequence in fastaFile.readFasta():
        print(header)
        orfData = OrfFinder(sequence)
        orfData.findOrfs()
        orfData.findRevOrfs()

        # NOTE(review): strict '>' drops ORFs whose length equals minGene;
        # confirm whether '>=' is the intended cut-off.
        longEnough = [orf for orf in orfData.orfs if orf[3] > args.minGene]

        # Index 3 is the ORF length; report the longest first.
        for frame, start, stop, length in sorted(
                longEnough, key=lambda orf: orf[3], reverse=True):
            print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(
                frame, start, stop, length))
def main(inCL=None):
    '''
    Find some genes.

    inCL -- list of command-line tokens handed to CommandLine, or None to let
            CommandLine read the real command line.

    For every FASTA record the header is printed, followed by one line per
    ORF (frame, start, stop, length) whose length is at least args.minGene.
    ORFs are ordered by descending length, ties by ascending start.

    With args.longestGene set, only the first (longest) ORF per anchor
    coordinate is printed: the anchor is index 1 for negative frames and
    index 2 otherwise (presumably the stop-codon end of the ORF -- confirm
    against ORFfinder's coordinates).
    '''
    import sequenceAnalysis

    ORFreader = sequenceAnalysis.FastAreader()
    myCommandLine = CommandLine() if inCL is None else CommandLine(inCL)
    options = myCommandLine.args
    rowFormat = '{:+d} {:>5d}..{:>5d} {:>5d}'

    # Locate ORFs in each sequence.
    for head, seq in ORFreader.readFasta():
        foundORFs = sequenceAnalysis.ORFfinder(seq, options.start,
                                               options.stop)
        foundORFs.finder()
        foundORFs.ORFlist.sort(key=lambda orf: (orf[3], -orf[1]),
                               reverse=True)
        print(head)

        claimed = set()  # anchors already reported for this record
        for orf in foundORFs.ORFlist:
            if not orf[3] >= options.minGene:
                continue
            if options.longestGene:
                # The list is presorted, so the first ORF seen for an
                # anchor is the longest one sharing it.
                anchor = orf[1] if orf[0] < 0 else orf[2]
                if anchor in claimed:
                    continue
                claimed.add(anchor)
            print(rowFormat.format(orf[0], orf[1], orf[2], orf[3]))
def output(infile, outfile):
    """Write the ORFs of every record in *infile* to *outfile*.

    infile  -- FASTA file name, passed to ``sequenceAnalysis.FastAreader``.
    outfile -- path of the report; overwritten if it exists.

    For each record the header is written, followed by one line per ORF
    (frame, start, stop, length), longest first and, for equal lengths,
    leftmost start first. Reverse-strand ORFs get a negative frame and are
    converted to top-strand coordinates.

    Changes from the earlier version:
      * stdout is redirected only for the duration of the report and is
        restored afterwards (it used to be left pointing at a closed file);
      * the ORF list is rebuilt for each record, so ORFs of earlier records
        are no longer repeated under later headers;
      * ties in length are ordered by ascending start, as the original
        comment asked for.
    """
    import contextlib

    fastaReader = sequenceAnalysis.FastAreader(infile)

    # redirect_stdout also captures anything OrfFinder itself may print.
    with open(outfile, 'w') as f, contextlib.redirect_stdout(f):
        for header, sequence in fastaReader.readFasta():
            print(header)

            # Find forward and reverse-complement ORFs in the sequence.
            finder = OrfFinder(sequence)
            orfList = finder.findOrfs()
            revOrfList = finder.findRevOrfs()
            seqLength = len(sequence)

            orfsFound = []

            # Forward strand: frame and start are shifted to 1-based.
            for frameOrfs in orfList:
                for orf in frameOrfs:
                    orfsFound.append(
                        (orf[0] + 1, orf[1] + 1, orf[2], orf[3]))

            # Reverse strand: mirror the coordinates onto the top strand,
            # which swaps the roles of start and end.
            for frameOrfs in revOrfList:
                for orf in frameOrfs:
                    frame = orf[0] + 1
                    startPos = orf[1] + 1
                    endPos = orf[2]
                    orfsFound.append((-frame,
                                      seqLength - endPos + 1,
                                      seqLength - startPos + 1,
                                      orf[3]))

            # Largest to smallest, then start position furthest to the left.
            orfsFound.sort(key=lambda found: (-found[3], found[1]))

            for found in orfsFound:
                print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(
                    found[0], found[1], found[2], found[3]))
def main():
    '''Main function for the primer design program.

    Reads the target file, the two restriction enzymes, the start and stop
    codons and the verbosity flag from CommandLine.Command_Line(), designs
    the primers with primerDesign.primerDesign and reports, for the forward
    and the reverse primer:

        Primer sequence
        '{}' nucleotides were added to give {} efficiency after {} hours.
        Buffer {} for digestion at {} Degrees.
        Melting Temperature
        GC content (percent)

    The report is framed by lines of '#' characters and starts with the
    FastA header. With verbosity enabled it is written to "PrimerOut.txt";
    otherwise it is printed to standard output.

    If the target file or one of the enzymes is missing, or the target file
    holds no record, a message is printed and the program exits with
    status 1.

    Only one FastA record is expected per run; if the file holds several,
    the last one is the one reported.
    '''
    cl = CommandLine.Command_Line()

    # Restriction enzymes.
    restrictionEnzyme1 = cl.args.enzymeOne
    restrictionEnzyme2 = cl.args.enzymeTwo
    # Start and stop codon.
    startCodon = cl.args.start
    stopCodon = cl.args.stop
    # Verbosity boolean: True writes to a file instead of std out.
    verb = cl.args.verbosity
    # File name of the target sequence.
    targetFile = cl.args.target
    # Width of the '#' marker lines.
    markerNumber = 60
    # Degree symbol.
    degree = "\u00b0"

    # Check that all required elements are in place; terminate with a
    # message and a non-zero status if any of them is missing.
    if targetFile is None:
        print(
            "No Target Sequence inputted. Please retry with a proper input file in the appropriate location. See -h for command line help."
        )
        sys.exit(1)
    if restrictionEnzyme1 is None:
        print(
            "No Enzyme One inputted. Please retry with an Enzyme One in the appropriate location. See -h for command line help."
        )
        sys.exit(1)
    if restrictionEnzyme2 is None:
        print(
            "No Enzyme Two inputted. Please retry with an Enzyme Two in the appropriate location. See -h for command line help."
        )
        sys.exit(1)

    # Read from the file collected by the CommandLine class.
    createdPrimer = None
    fastA = sequenceAnalysis.FastAreader(targetFile)
    for head, seq in fastA.readFasta():
        # Ideally there is only one FastA record per run. If that changes,
        # move the reporting code below into this loop.
        createdPrimer = primerDesign.primerDesign(head, seq,
                                                  restrictionEnzyme1,
                                                  restrictionEnzyme2,
                                                  startCodon, stopCodon)

    # Without this guard an empty target file ended in a NameError below.
    if createdPrimer is None:
        print(
            "No sequence found in the target file. Please retry with a proper input file in the appropriate location. See -h for command line help."
        )
        sys.exit(1)

    nucForward = sequenceAnalysis.NucParams(str(createdPrimer.forwardPrimer))
    nucReverse = sequenceAnalysis.NucParams(str(createdPrimer.reversePrimer))
    # Scaled to percent: the value is printed with a "%" label, but used to
    # be the raw 0..1 fraction.
    gcForward = (nucForward.nucComposit["G"] +
                 nucForward.nucComposit["C"]) / nucForward.nucCount() * 100
    gcReverse = (nucReverse.nucComposit["G"] +
                 nucReverse.nucComposit["C"]) / nucReverse.nucCount() * 100

    # Per-enzyme record; indices 0-2 fill the "nucleotides were added"
    # sentence and indices 3-4 fill the "Buffer ... Degrees" sentence.
    enzymeOneInfo = createdPrimer.restrictionEnzymeDict[createdPrimer.enzyme1]
    enzymeTwoInfo = createdPrimer.restrictionEnzymeDict[createdPrimer.enzyme2]

    addedTemplate = "'{0}' nucleotides were added to give {1} efficiency after {2} hours.\n"
    gcForwardText = str.format("{0:.4f}", gcForward)
    gcReverseText = str.format("{0:.4f}", gcReverse)

    if verb is True:
        # Verbosity mode: write the report to a file instead of std out.
        with open("PrimerOut.txt", "w") as p:
            p.write("#" * markerNumber + "\n\n")
            p.write(createdPrimer.header + "\n\n")

            p.write("Forward Primer\n")
            p.write(createdPrimer.finalFwdPrimer + "\n")
            p.write("\n")
            p.write(addedTemplate.format(enzymeOneInfo[0], enzymeOneInfo[1],
                                         enzymeOneInfo[2]))
            p.write("Buffer {} for digestion at {} Degrees.\n\n".format(
                enzymeOneInfo[3], enzymeOneInfo[4]))
            p.write("Melting Temperature = " + str(createdPrimer.tempOfFwd) +
                    degree + "C" + "\n")
            p.write("\n")
            p.write(gcForwardText + " % GC Content\n")
            p.write("\n")

            p.write("Reverse Primer\n")
            p.write(createdPrimer.finalRevPrimer + "\n")
            p.write("\n")
            p.write(addedTemplate.format(enzymeTwoInfo[0], enzymeTwoInfo[1],
                                         enzymeTwoInfo[2]))
            p.write("Buffer {} for digestion at {} Degrees.\n\n".format(
                enzymeTwoInfo[3], enzymeTwoInfo[4]))
            p.write("Melting Temperature = " + str(createdPrimer.tempOfRev) +
                    degree + "C" + "\n")
            p.write("\n")
            p.write(gcReverseText + " % GC Content\n")
            p.write("#" * markerNumber)
    else:
        # Print to std out instead of to a file.
        # NOTE(review): unlike the file report, the melting temperature is
        # printed here without the degree symbol -- confirm this is intended.
        print("#" * markerNumber)
        print(createdPrimer.header)
        print()

        print("Forward Primer")
        print(createdPrimer.finalFwdPrimer)
        print()
        print(addedTemplate.format(enzymeOneInfo[0], enzymeOneInfo[1],
                                   enzymeOneInfo[2]))
        print("Buffer {} for digestion at {} Degrees.\n".format(
            enzymeOneInfo[3], enzymeOneInfo[4]))
        print("Melting Temperature = " + str(createdPrimer.tempOfFwd) + "C")
        print()
        print(gcForwardText + " % GC Content\n")
        print()

        print("Reverse Primer")
        print(createdPrimer.finalRevPrimer)
        print()
        print(addedTemplate.format(enzymeTwoInfo[0], enzymeTwoInfo[1],
                                   enzymeTwoInfo[2]))
        print("Buffer {} for digestion at {} Degrees.\n".format(
            enzymeTwoInfo[3], enzymeTwoInfo[4]))
        print()
        print("Melting Temperature = " + str(createdPrimer.tempOfRev) + "C")
        print()
        print(gcReverseText + " % GC Content\n")
        print()
        print("#" * markerNumber)
def __init__(self):
    """Create the FASTA reader and the nucleotide analyzer for this object.

    ``fastaReader`` is a ``sequenceAnalysis.FastAreader`` built without
    arguments; ``analyzer`` is a fresh ``sequenceAnalysis.NucParams``.
    """
    reader = sequenceAnalysis.FastAreader()
    params = sequenceAnalysis.NucParams()
    self.fastaReader = reader
    self.analyzer = params