def __init__(self, genome_path='testGenome.fa'):
    """Load every sequence of a FASTA file into one NucParams accumulator.

    Args:
        genome_path: path of the FASTA file to analyse.

    The path is kept on ``self.genome``, the reader on ``self.myReader`` and
    the accumulated composition on ``self.nuc_params``.
    """
    self.genome = genome_path
    self.myReader = FastAreader(genome_path)
    self.nuc_params = NucParams()
    # Headers are not used here; only the sequences are accumulated.
    for _header, sequence in self.myReader.readFasta():
        self.nuc_params.addSequence(sequence)
def main():
    '''Print sequence length, GC content and relative codon usage of a genome.

    Every sequence yielded by FastAreader().readFasta() is added to a single
    NucParams object. The report then shows:
      * the total length in Mb and the GC percentage;
      * one line per codon of rnaCodonTable, ordered by amino acid and then by
        codon, with the codon's share of its amino acid and its raw count.

    Empty input and amino acids that never occur are reported as 0 instead of
    raising ZeroDivisionError / TypeError.
    '''
    myReader = FastAreader()  # original note: make sure to change this to use stdin
    myNuc = NucParams()
    for _head, seq in myReader.readFasta():
        myNuc.addSequence(seq)  # only the sequence is needed, not the header

    totalNucs = myNuc.nucCount()
    nucsMb = totalNucs / 1000000
    gcTotal = myNuc.nucComp['C'] + myNuc.nucComp['G']
    # No nucleotides at all: report 0 % rather than dividing by zero.
    gcContent = (gcTotal / totalNucs) * 100 if totalNucs else 0.0
    print('sequence length = {0:.2f} Mb \n\n'
          'GC content = {1:.1f} % \n'.format(nucsMb, gcContent))

    # Sort by amino acid (the table's values) first, then by codon.
    for codon, aa in sorted(myNuc.rnaCodonTable.items(),
                            key=lambda item: (item[1], item[0])):
        codonCount = myNuc.codonComp.get(codon, 0)
        aaCount = myNuc.aaComp.get(aa, 0)
        # An amino acid that never occurs has no meaningful ratio; show 0.
        val = codonCount / aaCount if aaCount else 0.0
        print('{:s} : {:s} {:5.1f} ({:6d})'.format(codon, aa, val * 100,
                                                   codonCount))
def main(inCL=None):
    '''
    Read a FastA file, run OrfFinder on each record and print its ORFs.

    When no argument list is supplied the command line is parsed and, if the
    longestGene option is set, every record's header is printed followed by
    its ORFs longer than minGene, longest first. When an argument list is
    supplied it is only parsed. The parsed arguments are printed last.

    NOTE(review): the nesting was reconstructed from whitespace-collapsed
    source following the usual course template - confirm against the
    original file.
    '''
    if inCL is not None:
        myCommandLine = CommandLine(inCL)
    else:
        myCommandLine = CommandLine()
        if myCommandLine.args.longestGene:
            reader = FastAreader()
            for header, sequence in reader.readFasta():
                print(header)
                finder = OrfFinder(sequence)
                finder.forwardFrame()   # collect ORFs on the forward strand
                finder.reverseFrame()   # collect ORFs on the reverse strand
                # Keep ORFs strictly longer than the requested minimum ...
                keptOrfs = [orf for orf in finder.orfList
                            if orf[3] > myCommandLine.args.minGene]
                # ... and list them by decreasing length (stable for ties).
                keptOrfs.sort(key=lambda orf: orf[3], reverse=True)
                for frame, startPos, stopPos, orfLength in keptOrfs:
                    print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(
                        frame, startPos, stopPos, orfLength))
    print(myCommandLine.args)
def main(inCL=None):  # Find some genes.
    '''
    Parse the options, find the ORFs of every FASTA record and write them out.

    inCL: optional argument list handed to CommandLine; when None the real
          command line is parsed.

    For each record of args.inFile the header is written to args.outFile,
    followed by one line per entry of the finder's orfDict, ordered by
    decreasing length and then by decreasing start position.

    The output file is managed by a with-block so it is flushed and closed
    even when an ORF search raises.
    '''
    if inCL is None:
        myCommandLine = CommandLine()
    else:
        myCommandLine = CommandLine(inCL)
    print(myCommandLine.args)

    args = myCommandLine.args
    fReader = FastAreader(args.inFile)
    with open(args.outFile, 'w') as orfOut:
        for head, seq in fReader.readFasta():
            Orfer = orfFinder(args.longestGene, args.minGene,
                              args.start, args.stop)
            orfOut.write(head + "\n")
            Orfer.findOrf(head, seq)
            # Negated keys give descending length, then descending start.
            for frame, start, stop, length in sorted(
                    Orfer.orfDict, key=lambda kv: (-kv[3], -kv[1])):
                orfOut.write("{:+d} {:>5d}..{:>5d} {:>5d} \n".format(
                    frame, start, stop, length))
def main(inCL=None):
    '''
    Find some genes.

    Parses inCL (or a built-in default argument list when inCL is None), then
    runs OrfFinder over every record of the input FASTA file, handing the
    opened output file to findOrf, findReverse and writeFrameFile in turn.
    '''
    defaultArgs = ['tass2.fa', 'tass2ORFdata-ATG-100.txt', '--longestGene']
    myCommandLine = CommandLine(defaultArgs if inCL is None else inCL)
    options = myCommandLine.args

    orfReader = FastAreader(options.inFile)
    with open(options.outFile, 'w') as opFile:
        for head, seq in orfReader.readFasta():
            # The finder works on a list of bases plus the reversed strand.
            bases = list(seq)
            reverseStrand = reverseDNA(bases)
            finding = OrfFinder(bases, head, options.minGene,
                                options.longestGene, options.start,
                                reverseStrand)
            finding.findOrf(opFile)
            finding.findReverse(opFile)
            finding.writeFrameFile(opFile)
def main(inCL=None):
    """
    Use a fasta file to build gene trees for the four largest genes in
    related virus strains.

    The program takes a "combined" fasta file holding several genomes from
    similar virus strains and searches each genome for its large ORFs. The
    genes are meant to be aligned to one another and converted to a .phy file
    from which a phylogenetic tree can be built; the variation between the
    genes gives a rough outline of what a tree built from fully aligned
    genomes would look like.

    input:  a fasta file of virus headers and their DNA sequences
            (currently fixed to 'Combined-ALL-SARS-CoV.fasta'; inCL is unused)
    output: a "GENE n:" heading for each of the first four genes, followed by
            a key that maps each genome's index to the first comma-separated
            field of its header. The alignment and tree-printing calls are
            currently commented out.

    Assumptions:
        - the input file follows fasta format and holds DNA nucleotides.
        - the input file contains only coronaviruses, since they share only
          four conserved genes.
    """
    validNucs = ['A', 'C', 'G', 'T']
    headList = []   # header of every genome in the fasta file
    orfList = []    # per genome: list of its ORF sequences

    myReader = FastAreader('Combined-ALL-SARS-CoV.fasta')
    for head, seq in myReader.readFasta():
        headList.append(head)
        # Drop every character that is not a valid base.
        seq = ''.join(base for base in seq if base in validNucs)
        # Largest ORFs of more than 300 nucleotides.
        orf = OrfFinder(seq, 300, True)
        orfList.append([seq[openFrame[1] - 1:openFrame[2] - 1]
                        for openFrame in orf.getOrfs()])

    # Build the per-gene records for the first four genes of every genome.
    myPhylo = GeneTree()
    for geneIndex in range(4):
        myPhylo.geneSpecificRecord(orfList, headList, geneIndex)
        # alignments = myPhylo.fastaToPhylip(records)  # Makes a .phy file using a .fasta file
        print("GENE " + str(geneIndex + 1) + ":")
        # printTree = myPhylo.printGeneTree()  # Prints Gene Trees

    print(
        '\n\n============================================ K E Y ============================================\n'
    )
    for number, header in enumerate(headList):
        print("{} = {}".format(number, header.split(',')[0]))
def individualAnalysis(self, fastafile):
    """Print length, GC content and per-codon usage for one FASTA file.

    The reader is stored on self.myReader and the GC value on self.gc.
    Codons are listed alphabetically with their amino acid, their share of
    that amino acid and their raw count, followed by the total codon count.
    """
    # print("Reading from {}...".format(fastafile))
    self.myReader = FastAreader(fastafile)
    length, nucParams = self.sequenceLength(self.myReader)
    print("sequence length = {:.2f} Mb".format(length), "\n")

    self.gc = self.gcContent(nucParams)
    print("GC content = {:.1f} %".format(self.gc), "\n")

    codonCount = nucParams.codonComposition()
    totalAAs = 0
    for codon in sorted(codonCount):
        occurrences = codonCount[codon]
        totalAAs += occurrences
        letter = nucParams.rnaCodonTable[codon]
        letterTotal = nucParams.aaComp[letter]
        # A codon that never occurs is shown as 0 % without touching the
        # amino-acid total, which may itself be zero.
        if occurrences == 0:
            share = (occurrences / 1) * 100
        else:
            share = (occurrences / letterTotal) * 100
        print('{} : {} {:5.1f}% ({:6d})'.format(codon, letter, share,
                                                occurrences))
    print("Total number of amino acids counted is " + str(totalAAs))
def main():
    '''
    main
    Run every step in order and write the result to stdout.

    Each record of the FASTA file named by sys.argv[1] is handed to
    findUnique. Then, for every object in findUnique.rnaSetlist ordered by
    header, the header and sequence are printed, followed by each entry
    returned by uniqueFinder(), indented with one dot per index position.
    '''
    fReader = FastAreader(sys.argv[1])
    for head, seq in fReader.readFasta():
        print("Generating powerset for: {}".format(head))
        findUnique(head, seq)

    byHeader = sorted(findUnique.rnaSetlist, key=lambda entry: entry.header)
    for rna in byHeader:
        uniqueAndEssential = rna.uniqueFinder()
        print(rna.header)
        print(rna.sequence)
        # Walk the positions in ascending order.
        for index in sorted(uniqueAndEssential):
            print('.' * index + str(uniqueAndEssential[index]))
def main(myCommandLine=None):
    '''
    Find the ORFs of every FASTA record and write them to the output file.

    myCommandLine: optional argument list handed to CommandLine; when None a
                   built-in default argument list is used.

    Parsed options used here:
        args.inFile      - the input file name
        args.outFile     - the output file name
        args.longestGene - True if only the longest gene is desired
        args.start       - list of start codons
        args.minGene     - minimum gene length to include

    The output file is opened once in 'w' mode, which clears anything left
    by a previous run, inside a with-block so it is always closed.
    '''
    if myCommandLine is None:
        myCommandLine = CommandLine([
            'tass2.fa', 'tass2ORFdata-ATG-100.txt', '--longestGene',
            '--start=ATG', '--minGene=100'])
    else:
        myCommandLine = CommandLine(myCommandLine)
    args = myCommandLine.args

    orfReader = FastAreader(args.inFile)
    with open(args.outFile, 'w') as f:
        # Compute and write the ORFs of each sequence in the file.
        for head, seq in orfReader.readFasta():
            nucParams = NucParams('')
            nucParams.addSequence(seq)
            nucParams.buildNucleotide()
            bases = list(''.join(nucParams.codons))
            reverseBases = getReverseStrand(bases)
            finder = OrfFinder(bases, head,
                               minGene=args.minGene,
                               longestGene=args.longestGene,
                               start=args.start,
                               revSeq=reverseBases)
            finder.findOrfs(f)
            finder.findRevOrfs(f)
            finder.writeFramesToFile(f)
def genomeAnalyzer(fastaFile='/Users/stephaniegardner/Desktop/BME160/Lab04/HomoSapiensMitochondrion.fa'):
    '''
    Print sequence length in Mb, GC content and sorted relative codon usage
    for the genome stored in a FastA file.

    fastaFile: path of the FastA file to analyse. The default is the path the
               function was originally hard-wired to, so existing calls with
               no argument behave as before.
    '''
    myReader = FastAreader(fastaFile)
    myNuc = NucParams()
    for _head, seq in myReader.readFasta():
        myNuc.addSequence(seq)

    # Sequence length: total nucleotide count converted to Mb.
    totalNucs = myNuc.nucCount()
    length = totalNucs / 1000000
    print('sequence length = {:.2f}Mb'.format(length), "\n")

    # GC content: G plus C counts divided by the total nucleotide count.
    nucComp = myNuc.nucComposition()
    gc = nucComp['G'] + nucComp['C']
    # An empty genome has no GC fraction; report 0 instead of dividing by 0.
    gc = gc / totalNucs if totalNucs else 0.0
    print('GC Content = {:.2%}'.format(gc), "\n")

    # Individual codon analysis, ordered by amino acid and then by codon
    # (the sort key is the amino-acid letter followed by the codon).
    codonComp = myNuc.codonComposition()
    aaComp = myNuc.aaComposition()
    for codon, aa in sorted(myNuc.rnaCodonTable.items(),
                            key=lambda t: t[1] + t[0]):
        total = aaComp[aa]             # occurrences of this amino acid
        codonCount = codonComp[codon]  # occurrences of this codon
        if total != 0:
            val = codonCount / total
        else:
            # Amino acid never seen: avoid dividing by zero.
            val = codonCount / 1
        print('{} : {} {:5.1f}% ({:6d})'.format(codon, aa, val * 100,
                                                codonCount))
def main():
    """Print each tRNA with its essential substrings aligned under it.

    Every FASTA record is wrapped in a tRNA object, every object in
    tRNA.tRNAobjects then builds its unique and essential substrings, and the
    objects are reported ordered by header: header, sequence, then one line
    per essential substring, prefixed with one dot per position of its first
    occurrence in the sequence.
    """
    tRnaFinder = FastAreader('')
    for head, seq in tRnaFinder.readFasta():
        tRNA(head, seq)  # the constructor result itself is not used here

    # The ordering is fixed before the uniques are built.
    sortTrnas = sorted(tRNA.tRNAobjects, key=lambda entry: entry.head)
    for eachtRNA in tRNA.tRNAobjects:
        eachtRNA.buildUniques()
        eachtRNA.findEssentials()

    for everyTrna in sortTrnas:
        print(everyTrna.head)
        print(everyTrna.seq)
        # Pair each substring with its first position, order by position.
        located = sorted(
            ((everyTrna.seq.find(sub), sub) for sub in everyTrna.essentialSubs),
            key=lambda pair: pair[0])
        for position, sub in located:
            print('{}{}'.format('.' * position, sub))
def main():
    '''
    Use NucParams and FastAreader to print genome length, GC nucleotide
    content, and the relative frequency and count of every codon.

    The GC line uses the '%' presentation type, which already multiplies by
    100 and appends the percent sign, so no literal '%' is added after it.
    '''
    myReader = FastAreader()
    myNuc = NucParams()
    for _head, seq in myReader.readFasta():
        myNuc.addSequence(seq)

    # Sequence length.
    seqLength = myNuc.nucCount()
    mbSeqLength = seqLength / 1000000
    print("sequence length: {0:0.2f}Mb".format(mbSeqLength), "\n")

    # GC content; empty input is reported as 0 instead of dividing by zero.
    nucComp = myNuc.nucComposition()
    gAndC = nucComp["G"] + nucComp["C"]
    gcContent = gAndC / seqLength if seqLength else 0.0
    print("GC content = {0:0.1%}".format(gcContent), "\n")

    # Relative codon frequency and codon count for each codon.
    aaComp = myNuc.aaComposition()
    # NOTE(review): the counts below are read from myNuc.rnaCodonCompDict as
    # in the original; this call's return value was never used - confirm
    # whether it is needed for a side effect.
    myNuc.codonComposition()
    sortedRNATable = sorted(myNuc.rnaCodonTable.items(),
                            key=lambda x: x[1])  # amino-acid order
    for codon, aa in sortedRNATable:
        aaNumber = aaComp[aa]                        # denominator
        codonNumber = myNuc.rnaCodonCompDict[codon]  # numerator
        # Any non-positive total means "no data", so codonFreq is always bound.
        codonFreq = codonNumber / aaNumber if aaNumber > 0 else 0
        # NOTE(review): end=" " keeps every codon on one output line - confirm
        # this is the intended layout.
        print("{:s} : {:s} {:5.1f} ({:6d})".format(codon, aa, codonFreq * 100,
                                                   codonNumber), end=" ")
class genomeAnalyzer:
    """Summarise nucleotide, codon and amino-acid composition of a FASTA file.

    All sequences of the file are accumulated into one NucParams object when
    the instance is created; the other methods report on that accumulator.
    """

    def __init__(self, genome_path='testGenome.fa'):
        """Read every sequence of ``genome_path`` into ``self.nuc_params``.

        Args:
            genome_path: path of the FASTA file to analyse.
        """
        self.genome = genome_path
        self.myReader = FastAreader(self.genome)
        self.nuc_params = NucParams()
        for _head, seq in self.myReader.readFasta():
            self.nuc_params.addSequence(seq)

    def genomeAnalysis(self):
        """Compute and print sequence length, gc content and amino acid composition"""
        # The helpers return raw values (a count and a fraction); unit
        # conversion happens only here, at formatting time, so the helpers
        # stay reusable.
        length = self.sequenceLength()
        print("sequence length = {:.2f}Mb\n".format(length / 1000000.0))
        gc = self.gcContent()
        print('GC content = {:.1f}%\n'.format(gc * 100))

        codon_counts = self.nuc_params.codonComposition()
        aa_comp = self.nuc_params.aaComposition()
        for aa in sorted(aa_comp):
            aa_count = aa_comp[aa]
            # aaRnaTable maps an amino acid to the RNA codons that encode it.
            for codon in sorted(self.nuc_params.aaRnaTable[aa]):
                codon_total = codon_counts[codon]
                # Relative usage of this codon within its amino acid.
                if aa_count != 0:
                    total = (codon_total / aa_count) * 100
                else:
                    # Amino acid never seen: avoid dividing by zero.
                    total = (codon_total / 1) * 100
                print('{} : {} {:5.1f}% ({:6d})'.format(
                    codon, aa, total, codon_total))

    def gcContent(self):
        """Return the GC fraction (0..1) of the entire fasta file.

        Returns 0.0 when no nucleotides were counted.
        """
        nuc_comp = self.nuc_params.nucComposition()
        total = self.nuc_params.nucCount()
        if total == 0:
            return 0.0
        return (nuc_comp['G'] + nuc_comp['C']) / total

    def sequenceLength(self):
        """Return the total nucleotide count of the genome.

        :return: the value of ``self.nuc_params.nucCount()``
        """
        return self.nuc_params.nucCount()
def main(inCL=None):
    """Read a fasta file and print each ORF's frame, start, stop and length.

    When no argument list is supplied the command line is parsed and, if the
    longestGene option is set, every record's header is printed followed by
    its ORFs longer than minGene, longest first. When an argument list is
    supplied it is only parsed. The parsed arguments are printed last.

    NOTE(review): the nesting was reconstructed from whitespace-collapsed
    source following the usual course template - confirm against the
    original file.
    """
    if inCL is not None:
        myCommandLine = CommandLine(inCL)
    else:
        myCommandLine = CommandLine()
        if myCommandLine.args.longestGene:
            reader = FastAreader()
            for header, sequence in reader.readFasta():
                print(header)
                orfData = OrfFinder(sequence)
                orfData.findOrfs()
                orfData.findRevOrfs()
                # Keep ORFs strictly longer than the minGene argument ...
                keptOrfs = [orf for orf in orfData.orfs
                            if orf[3] > myCommandLine.args.minGene]
                # ... and report them by decreasing length.
                keptOrfs.sort(key=lambda orf: orf[3], reverse=True)
                for frame, start, stop, length in keptOrfs:
                    print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(
                        frame, start, stop, length))
    print(myCommandLine.args)
def individualAnalysis(self, fastafile):
    """Print length, GC content and per-codon usage for one FASTA file.

    Args:
        fastafile: path of the FASTA file to analyse.

    Codons are listed alphabetically with their amino acid, their share of
    that amino acid and their raw count. An amino acid with no occurrences is
    reported as 0 % instead of raising ZeroDivisionError.
    """
    myRead = FastAreader(fastafile)
    length, nucParams = self.sequenceLength(myRead)
    print("sequence length = {:1.2f} Mb".format(length))
    print("\n")

    gc = self.gcContent(nucParams)
    # Fixed-point with one decimal; without the 'f' the value is rendered
    # with one significant digit (e.g. '4e+01').
    print("GC content = {:.1f}%".format(gc))
    print("\n")

    codonCount = nucParams.codonComposition()
    # NOTE(review): return value unused; the call is kept in case it fills
    # nucParams.aminoAcid - confirm.
    nucParams.aminoCompo()
    for codon in sorted(codonCount):
        total = codonCount[codon]
        amino = nucParams.rnaCodonTable[codon]
        aminoCount = nucParams.aminoAcid.count(amino)
        finalTotal = total / aminoCount * 100 if aminoCount else 0.0
        print('{} : {} {:5.1f}% ({:6d})'.format(codon, amino, finalTotal,
                                                total))
Notes: The headers in the positiveModomics.fa have to be in the Henry format The headers in teh unsorted.fa have to be in the gtRNAdb 11/18/2020 format ''' sortedModomics = sys.argv[1] modifier = sortedModomics.replace('-', '_').split('_')[2] fastaFile = sys.argv[2] base, ext = fastaFile.split('.') newFileName_pos = f'{base}_{modifier}.{ext}' newFileName_neg = f'{base}_neg.{ext}' sortedHeaderSet = set() headerSet = set() for sortedHeader, sortedSequence in FastAreader(sortedModomics).readFasta(): sortedHeaderSet.add( tuple(sortedHeader.replace('Ini', 'iMet').split('-')[2:4])) pass with open(newFileName_pos, 'w') as posFile: with open(newFileName_neg, 'w') as negFile: for header, sequence in FastAreader(fastaFile).readFasta(): thing = tuple(header.split('-')[1:3]) headerSet.add(tuple(header.split('-')[1:3])) if thing in sortedHeaderSet: posFile.write(f'>{header}\n{sequence}\n') else: negFile.write(f'>{header}\n{sequence}\n')
sys.path.insert(2, "../../../CMPipelines/GenerateCM")
import os
from sequenceAnalysis import FastAreader
import pandas as pd

# Walk the organism / modification / position directory tree and collect one
# row per FASTA record found in the leaf files.
bigOlList = []
filepath = "./Potentially massive folder/speciesModDB/"
for org in os.listdir(filepath):
    orgDir = filepath + org
    for mod in os.listdir(orgDir):
        modDir = orgDir + '/' + mod
        for pos in os.listdir(modDir):
            fullFilePath = modDir + '/' + pos
            for header, sequence in FastAreader(fullFilePath).readFasta():
                # Directory name: <genus>_<species>_<short name>...
                orgParts = org.split('_')
                genus, species, organismShort = (orgParts[0], orgParts[1],
                                                 orgParts[2])
                organismName = genus + ' ' + species
                # Last dash-separated field of the modification directory.
                actualMod = mod.split('-')[-1]
                # Second underscore-separated field of the file name.
                actualPos = pos.split('_')[1]
                # Third to fifth dash-separated fields of the record header.
                headerParts = header.split('-')
                isotype, isoacceptor, isodecoder = (headerParts[2],
                                                    headerParts[3],
                                                    headerParts[4])
                bigOlList.append([
                    genus, species, organismName, organismShort, actualMod,
                    actualPos, isotype, isoacceptor, isodecoder
                ])
import sys sys.path.insert(1, "C:\\Users\\mattk\\Desktop\\python scripts\\todd stuff\\BioTools") from sequenceAnalysis import FastAreader import pandas as pd ''' Given an aligned fasta (yeast) file This program will write the output to a csv ''' fileToDisplay = "sacCer3-trnaalign.fa" tRNAdic = {} for header, seq in FastAreader(fileToDisplay).readFasta(): header = header.split(' ')[0] tRNAdic[header] = [position for position in seq] #for yeast columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17a', '17b', '18', '19', '20a', '20b', '20c', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'e10', 'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65a', '65b', '65c', '65d', '65e', '70', '71', '72', '73', '74', '75', '76',
class GraphicRep:
    '''
    Reads and prepares the files used for a matplotlib graphical
    representation of mtDNA. Can eventually be used for other circular DNA.

    All data is loaded while the class body executes and is kept in class
    attributes:
        head, seq     - the last record of 'HomoSapiensMitochondrion.fa'
        funcLoc       - DataFrame read from 'mtDNACoding.xlsx'
        mitoMap       - DataFrame read from 'MitoMap.xlsx'
        locDict       - locus -> list of funcLoc rows (first 38 rows)
        mutDict       - locus -> list of mitoMap rows (first 330 rows)
        locDictValues / mutDictValues - the dict values as lists, indexed as
                        [locus][entry][column]

    Row columns are addressed by position, as in the original code.
    '''
    myReader = FastAreader('HomoSapiensMitochondrion.fa')
    for head, seq in myReader.readFasta():
        pass  # the loop targets become the class attributes head and seq

    excelFile = 'mtDNACoding.xlsx'   # workbook with the locus locations
    funcLoc = pd.read_excel(excelFile)
    excelFile1 = 'MitoMap.xlsx'      # workbook with the mutations
    mitoMap = pd.read_excel(excelFile1)

    locDict = {}
    mutDict = {}
    for j in range(0, 38):           # rows of the locus spreadsheet
        locus = funcLoc.iloc[j, 0]
        locDict.setdefault(locus, []).append(funcLoc.iloc[j])
    locDictValues = list(locDict.values())
    for j in range(0, 330):          # rows of the mutation spreadsheet
        locus = mitoMap.iloc[j, 1]
        mutDict.setdefault(locus, []).append(mitoMap.iloc[j])
    mutDictValues = list(mutDict.values())

    # Key of disease abbreviations appended to the optionB report.
    _DISEASE_KEY = (
        '\n\n'
        'LHON-Leber Hereditary Optic Neuropathy\n'
        'MM-Mitochondrial Myopathy\n'
        'AD-Alzeimers Disease\n'
        'LIMM-Lethal Infantile Mitochondrial Myopathy\n'
        'ADPD-Alzeimers Disease and Parkinsonss Disease\n'
        'MMC-Maternal Myopathy and Cardiomyopathy\n'
        'NARP-Neurogenic muscle weakness, Ataxia, and Retinitis Pigmentosa\n'
        'FICP-Fatal Infantile Cardiomyopathy Plus, a MELAS-associated cardiomyopathy\n'
        'MELAS-Mitochondrial Encephalomyopathy, Lactic Acidosis, and Stroke-like episodes\n'
        'LDYT-Lebers hereditary optic neuropathy and DYsTonia\n'
        'MERRF-Myoclonic Epilepsy and Ragged Red Muscle Fibers\n'
        'MHCM-Maternally inherited Hypertrophic CardioMyopathy\n'
        'CPEO-Chronic Progressive External Ophthalmoplegia\n'
        'KSS-Kearns Sayre Syndrome\n'
        'DM-Diabetes Mellitus\n'
        'DMDF-Diabetes Mellitus + DeaFness\n'
        'CIPO-Chronic Intestinal Pseudoobstruction with myopathy and Ophthalmoplegia\n'
        'DEAF-Maternally inherited DEAFness or aminoglycoside-induced DEAFness\n'
        'PEM-Progressive encephalopathy\n'
        'SNHL-SensoriNeural Hearing Loss\n'
    )

    def optionA(self, locus):
        '''
        OPTION A:
        Write a summary of the mutations at one locus to the text file
        '<locus>MutationInfo':
            - the percentage of all 330 mutations found at that locus
            - the average pathogenicity at that locus (entries without a
              pathogenicity value count as 0 in the average)
        Does nothing when locus is None. An unknown locus produces a file
        saying that no mutation data is available.
        '''
        if locus is None:
            return
        with open(locus + 'MutationInfo', 'w') as f:
            try:
                mutations = GraphicRep.mutDict[locus]
                length = (len(mutations) / 330) * 100
                f.write("Percent of total mutations found at this locus are: {0:0.2f} % \n".format(length))
                paths = 0
                for item in mutations:
                    if not math.isnan(item[7]):
                        paths += float(item[7])
                # Averaged over every mutation at the locus.
                pathAvg = paths / len(mutations)
                if pathAvg == 0.00:
                    f.write('No pathogenicity data is available for this locus')
                else:
                    f.write("Average Pathogenicity percentage for this locus is: {0:0.2f}%\n".format(pathAvg))
            except KeyError:
                f.write('No mutation data is available for this locus')

    def optionB(self, low, high, disease):
        '''
        OPTION B:
        Write to 'mtDNAPathogInfo' every mutation whose pathogenicity lies in
        the interval [low, high] (0-100), with what it codes for and the base
        mutation. When disease is True the linked disease is added to each
        line and a key of disease abbreviations is appended.
        Does nothing when either bound is None.
        '''
        if low is None or high is None:
            return
        with open('mtDNAPathogInfo', 'w') as f:
            if disease is False:
                f.write("Sorted based on position\nPathogenicity ({},{})- Codes for - Base Mutation\n".format(low, high))
                GraphicRep._writePathogenicityRows(f, low, high, False)
            if disease is True:
                f.write("Sorted based on position\nPathogenicity ({},{})- Codes for - Base Mutation - Linked Disease\n".format(low, high))
                GraphicRep._writePathogenicityRows(f, low, high, True)
                # NOTE(review): placed with the disease listing; the
                # collapsed source did not show its nesting level - confirm.
                f.write(GraphicRep._DISEASE_KEY)

    @staticmethod
    def _writePathogenicityRows(f, low, high, withDisease):
        '''Write one line per mutation whose pathogenicity is in [low, high].'''
        for i in range(0, 25):          # for each locus
            for j in range(0, 44):      # for all mutations in each locus
                try:
                    entry = GraphicRep.mutDictValues[i][j]
                    val = float(entry[7])
                    if low <= val <= high:
                        if withDisease:
                            f.write("{}% - {} - {} - {}\n".format(entry[7], entry[4], entry[3], entry[2]))
                        else:
                            f.write("{}% - {} - {}\n".format(entry[7], entry[4], entry[3]))
                except IndexError:
                    # Fewer loci / mutations than the scanned maximum.
                    pass

    @staticmethod
    def get_cmap(n, name='hsv'):
        '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
        RGB color; the keyword argument name must be a standard mpl colormap name.
        https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib

        Uses pyplot.get_cmap, which takes the same (name, lut) arguments as
        the matplotlib.cm.get_cmap function removed in Matplotlib 3.9.
        '''
        return plt.get_cmap(name, n)

    @staticmethod
    def _pathogenicityColor(val):
        '''Return the marker color for a pathogenicity percentage.

        0-20 purple, up to 40 blue, up to 60 green, up to 80 yellow, below
        100 orange, exactly 100 red. Missing (NaN) or out-of-range values are
        black. The bands are contiguous so fractional values always get a
        color.
        '''
        if math.isnan(val) or val < 0.0 or val > 100.0:
            return 'black'
        if val <= 20.0:
            return 'purple'
        if val <= 40.0:
            return 'blue'
        if val <= 60.0:
            return 'green'
        if val <= 80.0:
            return 'yellow'
        if val < 100.0:
            return 'orange'
        return 'red'

    @staticmethod
    def _setupPolarAxes(ax):
        '''Apply the shared look of the circular genome plot to a polar axes.'''
        ax.set_aspect('equal')
        ax.set_xticklabels(['Origin', '', '', '', ''])
        ax.grid()
        ax.set_rticks([])
        ax.set_theta_zero_location('N', offset=0)   # origin at the top

    @staticmethod
    def _drawGenome(ax, thetaPerBase, cmap, mutationSize, locusStartSize,
                    baseSize, labelMutations=False):
        '''Plot mutation markers, locus start markers and every locus base.

        thetaPerBase is the angle in radians covered by one base; each marker
        is a line rotated to point outward at its angle.
        '''
        toDegrees = 180 / np.pi
        for i in range(0, 25):          # locus
            for j in range(0, 44):      # mutation (max any locus has)
                try:
                    entry = GraphicRep.mutDictValues[i][j]
                    colors = GraphicRep._pathogenicityColor(float(entry[7]))
                    theta = thetaPerBase * entry[0]
                    extra = {'label': entry[2]} if labelMutations else {}
                    ax.plot(theta, 0, marker=(2, 0, theta * toDegrees),
                            color=colors, markersize=mutationSize, zorder=6,
                            **extra)
                except IndexError:
                    # Loci with fewer mutations than the scanned maximum.
                    pass
        for i in range(0, 38):          # for each locus
            row = GraphicRep.locDictValues[i][0]
            start, stop = row[1], row[2]
            startTheta = thetaPerBase * start
            # Gray marker where the locus starts.
            ax.plot(startTheta, 0, marker=(2, 0, startTheta * toDegrees),
                    color='gray', markersize=locusStartSize, zorder=6)
            # Every base of the locus in the locus' own color.
            for k in range(start, stop):
                theta = thetaPerBase * k
                ax.plot(theta, 0, marker=(2, 0, theta * toDegrees),
                        color=cmap(i), markersize=baseSize, zorder=5)

    @staticmethod
    def _annotateLocus(ax, thetaPerBase, i, textPos):
        '''Label locus i with an arrow from textPos (figure fraction).'''
        row = GraphicRep.locDictValues[i][0]
        ax.annotate(row[0],
                    xy=(thetaPerBase * row[1], .01),   # theta, radius
                    xytext=textPos,                    # fraction, fraction
                    textcoords='figure fraction', zorder=8,
                    arrowprops=dict(width=.05, headwidth=2,
                                    facecolor='black', shrink=.05),
                    horizontalalignment='left',
                    verticalalignment='bottom')

    def createMap(self, outFile='/Users/stephaniegardner/Desktop/BME160/FinalProject/mtDNAMutations.png'):
        '''
        Creates a map of mitochondrial DNA with color coded mutations and
        loci and saves it to outFile (defaults to the original fixed path).
        '''
        # One base covers an equal share of the full circle.
        thetaPerBase = (np.pi * 2) / len(GraphicRep.seq)
        cmap = GraphicRep.get_cmap(38)   # one color per locus

        # A single polar axes; no Cartesian axes is created underneath it.
        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)
        GraphicRep._setupPolarAxes(ax)
        GraphicRep._drawGenome(ax, thetaPerBase, cmap, 20, 60, 40)

        ax.annotate('D Loop',
                    xy=(0, 0),           # theta, radius
                    xytext=(.3, .9),     # fraction, fraction
                    textcoords='figure fraction', zorder=8,
                    arrowprops=dict(width=.05, headwidth=2,
                                    facecolor='black', shrink=.05),
                    horizontalalignment='left',
                    verticalalignment='bottom')

        # Locus labels: down the left edge, along the bottom, up the right.
        y = .90
        for i in range(0, 15):
            GraphicRep._annotateLocus(ax, thetaPerBase, i, (.1, y))
            y = y - .06
        x = .17
        for i in range(15, 27):
            GraphicRep._annotateLocus(ax, thetaPerBase, i, (x, .05))
            x = x + .06
        z = .14
        for i in range(27, 38):
            GraphicRep._annotateLocus(ax, thetaPerBase, i, (.9, z))
            z = z + .07

        plt.savefig(outFile)

    def zoomPlot(self, locus, outDir='/Users/stephaniegardner/Desktop/BME160/FinalProject/'):
        '''
        Zooms in on the given locus and saves the figure as
        outDir + locus + 'zoomPlot.png' (outDir defaults to the original
        fixed folder). The drawing is the same as in createMap with larger
        markers and labelled mutations.

        Does nothing when locus is None. Raises ValueError when the locus is
        not one of the 38 known loci.
        '''
        if locus is None:
            return
        # Find the locus before drawing so an unknown name fails fast.
        target = None
        for p in range(0, 38):
            if locus == GraphicRep.locDictValues[p][0][0]:
                target = GraphicRep.locDictValues[p][0]
        if target is None:
            raise ValueError('unknown locus: {}'.format(locus))

        baseCount = len(GraphicRep.seq)
        thetaPerBase = (np.pi * 2) / baseCount
        cmap = GraphicRep.get_cmap(38)

        figz = plt.figure()
        axz = figz.add_subplot(111, polar=True)
        GraphicRep._setupPolarAxes(axz)
        GraphicRep._drawGenome(axz, thetaPerBase, cmap, 200, 150, 100,
                               labelMutations=True)

        # Restrict the visible angle range (in degrees) to the locus plus a
        # 5 degree margin on each side.
        thetamin = (360 / baseCount) * target[1]
        thetamax = (360 / baseCount) * target[2]
        axz.set_thetamin(thetamin - 5)
        axz.set_thetamax(thetamax + 5)
        plt.title(locus)
        plt.savefig(outDir + locus + 'zoomPlot.png')
def main():
    """
    Read the FASTA file named on the command line and print substrings per sequence.

    Exactly one command-line argument (the input file path) is required;
    otherwise a usage message is printed and the program exits.

    For every record the sequence is cleaned with NucParams.stripSequence and
    expanded into a list of substrings by SubstringGenerator.get_substrings.
    A FindUnique object is then built for each distinct cleaned sequence and
    its printSubstrings() method is called.
    """
    # Exactly one argument (the input file) must follow the program name.
    if len(sys.argv) != 2:
        print("Please enter a file after the program")
        sys.exit()

    allHeadsList = []          # FASTA headers, in file order
    seqSubseqDict = {}         # cleaned sequence -> list of its substrings
    seqUniqueSubseqDict = {}   # cleaned sequence -> substrings found in no other sequence

    substringMaker = SubstringGenerator()
    nucParam = NucParams('')
    orfReader = FastAreader()

    # The context manager guarantees the input file is closed once parsed.
    with open(sys.argv[1]) as infile:
        for head, seq in orfReader.readFastaStdIn(infile):
            cleanSeq = str(nucParam.stripSequence(seq))
            subSequence = list(substringMaker.get_substrings(cleanSeq))
            seqSubseqDict[cleanSeq] = subSequence
            allHeadsList.append(head)
            # NOTE(review): allSubSequencesList is never assigned inside this
            # function, so it must exist at module level or this line raises
            # NameError -- confirm and define it where appropriate.
            allSubSequencesList.append(subSequence)

    # Nothing was read: there is nothing to analyse or print.
    if not seqSubseqDict:
        return

    # unique substrings = this sequence's substrings minus everyone else's
    for i, seq in enumerate(seqSubseqDict):
        setAllElse = set()
        for otherSeq, otherSubseqs in seqSubseqDict.items():
            if otherSeq != seq:
                # update() merges the substrings themselves; add() would try
                # to hash the whole list and raise TypeError.
                setAllElse.update(otherSubseqs)
        # NOTE(review): this result is stored but not consumed further down.
        seqUniqueSubseqDict[seq] = set(seqSubseqDict[seq]) - setAllElse
        # NOTE(review): the objects built by the next two calls are discarded
        # and their argument order differs from the call used for tRNAList.
        # They are kept in case the constructor has side effects -- confirm
        # and remove if it does not.
        FindUnique(seq, allHeadsList[i], seqSubseqDict[seq])
    FindUnique(allHeadsList[0], seq, seqSubseqDict[seq])

    # NOTE(review): headers are paired with sequences by position. If two
    # records clean to the same sequence the dict holds fewer entries than
    # allHeadsList and the pairing drifts -- confirm inputs are distinct.
    tRNAList = [
        FindUnique(head.lstrip(), thisBase, subseqs)
        for head, (thisBase, subseqs) in zip(allHeadsList, seqSubseqDict.items())
    ]

    for trna in tRNAList:
        trna.printSubstrings()
combinations.append(goingBackward) return combinations def makeDirectory(): directoryName = fastafile.split('.')[0] + '_Sliced' path = os.path.join(whereToMake, directoryName) try: os.mkdir(path) except FileExistsError: pass return path #important stuff # 0 1 2 3 4 #'AA-stem', 'D-arm', 'Anticodon loop', 'Introns', 'T-arm' directory = makeDirectory() for desiredRegions in generateCombinations(): newFasta = '' for header, sequence in FastAreader(fastafile).readFasta(): windowMaker = CreateWindows(secondaryStructure, sequence=sequence, desiredRegions=desiredRegions) newFasta += f'>{header}\n{windowMaker.getWindowedSequence()}\n' fastaName = "_".join(desiredRegions) + '_Sliced.fa' with open(os.path.join(directory, fastaName), 'w') as file: file.write(newFasta)
EVALS = 'eValues.txt'
positiveFasta = "hasM1A.fa"

# NAMES and EVALS hold file paths at this point; once the files are read they
# are rebound to the parsed lists used from here on.
with open(NAMES) as nameFile, open(EVALS) as evalFile:
    nameLines = nameFile.readlines()
    evalLines = evalFile.readlines()

NAMES = []   # first space-delimited token of every name line
EVALS = []   # -log10 of the matching e-value
COLOR = []   # 'green' if the name matches a header in positiveFasta, else 'black'

# zip stops at the shorter file, so unpaired trailing lines are ignored.
for name, evalue in zip(nameLines, evalLines):
    # Drop the line terminator explicitly, then keep the text before the first
    # space. Slicing with str.find() chops the last real character when a line
    # has neither a space nor a trailing newline (find() returns -1).
    NAMES.append(name.rstrip('\n').split(' ', 1)[0])
    # NOTE(review): an e-value of exactly 0 makes math.log10 raise ValueError.
    EVALS.append(-math.log10(float(evalue)))

# Identifier (text before the first space) of every record in the positive set.
headers = [header.split(' ')[0]
           for header, sequence in FastAreader(positiveFasta).readFasta()]

# A name is coloured green when any positive-set identifier occurs inside it.
for name in NAMES:
    if any(header in name for header in headers):
        COLOR.append('green')
    else:
        COLOR.append('black')
def main():
    """
    Compare two genomes: sequence length, GC content and relative codon usage.

    Reads 'vulgaris.fasta' and 'Shewanella.fasta' with FastAreader and feeds
    every record of each file into its own NucParams object. It then prints
    the header of the last record read from each file, the sequence lengths
    in Mb, the GC percentages, and a side-by-side table with one row per codon
    showing the codon, its amino acid, the codon's share of that amino acid
    (as a percentage) and the raw codon count for each genome.
    """
    # imports FastAreader and NucParams classes from the sequenceAnalysis module
    from sequenceAnalysis import FastAreader, NucParams

    genome1 = FastAreader('vulgaris.fasta')
    genome2 = FastAreader('Shewanella.fasta')
    seq1 = NucParams('')
    seq2 = NucParams('')

    # accumulate every record of the 1st genome, then derive its GC content
    for headx, seqx in genome1.readFasta():
        seq1.addSequence(seqx)
    GC1 = (seq1.nucDic.get('G') + seq1.nucDic.get('C')) / seq1.nucCount() * 100

    # accumulate every record of the 2nd genome, then derive its GC content
    for heady, seqy in genome2.readFasta():
        seq2.addSequence(seqy)
    GC2 = (seq2.nucDic.get('G') + seq2.nucDic.get('C')) / seq2.nucCount() * 100

    ### PRINT STATEMENTS ###
    print ("Genome1 = %s" %headx)
    print ("Genome2 = %s" %heady)
    print("sequence lengths: Genome1 = %.2f Mb Genome2 = %.2f Mb"
          % (seq1.nucCount() / 1000000, seq2.nucCount() / 1000000))
    print("")
    print ("GC contents: Genome1 = %.1f%% Genome2 = %.1f%%" % (GC1, GC2))
    print("")
    # column headings for the codon table
    print(" Genome1 Genome2")
    print ("")

    # One row per codon, ordered by codon. `sorted(a) and sorted(b)` simply
    # evaluates to the second list, so the second object's table is iterated
    # directly here.
    for codon, aa in sorted(seq2.rnaCodonTable.items()):
        try:
            D1 = seq1.codonDic[codon]   # codon count, 1st genome
            aaCount1 = seq1.aaDic[aa]   # amino-acid count, 1st genome
            D2 = seq2.codonDic[codon]   # codon count, 2nd genome
            aaCount2 = seq2.aaDic[aa]   # amino-acid count, 2nd genome
        except KeyError:
            # codon or amino acid missing from the count dictionaries: skip row
            continue

        # Relative usage as a percentage of the amino acid's total; an amino
        # acid that never occurs yields 0.0 instead of a ZeroDivisionError.
        F1 = D1 / aaCount1 * 100 if aaCount1 else 0.0
        F2 = D2 / aaCount2 * 100 if aaCount2 else 0.0

        # prints the two genomes' codon usage right next to each other
        print ("%s : %s %5.1f (%6d) %s : %s %5.1f (%6d)"
               % (codon, aa, F1, D1, codon, aa, F2, D2))