Beispiel #1
0
 def __init__(self, genome_path='testGenome.fa'):
     """Load every sequence of a FASTA file into a new NucParams accumulator.

     :param genome_path: value handed to FastAreader; also kept as
         ``self.genome``.
     """
     self.genome = genome_path
     self.myReader = FastAreader(genome_path)
     self.nuc_params = NucParams()
     # Headers are not needed; only the sequences are accumulated.
     for _header, sequence in self.myReader.readFasta():
         self.nuc_params.addSequence(sequence)
Beispiel #2
0
def main():
    '''Print sequence length, GC content and relative codon usage of the parsed genome.

    Every sequence yielded by FastAreader.readFasta() is added to a single
    NucParams object. The summary printed to stdout consists of:

    * the total length in Mb (nucCount() / 1,000,000),
    * the GC percentage, taken from the 'C' and 'G' entries of nucComp,
    * one line per entry of rnaCodonTable, ordered by amino acid and then by
      codon, showing the codon's share of its amino acid and its raw count.

    Empty input, or an amino acid / codon that was never counted, is reported
    as 0 instead of raising ZeroDivisionError or TypeError.
    '''
    # NOTE(review): FastAreader() is built without a file name; presumably it
    # then reads stdin -- confirm against the FastAreader implementation.
    myReader = FastAreader()
    myNuc = NucParams()
    for _head, seq in myReader.readFasta():
        myNuc.addSequence(seq)  # only the sequence is needed, not the header

    # Sequence length and GC content.
    totalNucs = myNuc.nucCount()
    nucsMb = totalNucs / 1000000
    gcTotal = myNuc.nucComp['C'] + myNuc.nucComp['G']
    gcContent = (gcTotal / totalNucs) * 100 if totalNucs else 0.0
    print('sequence length = {0:.2f} Mb \n\n'
          'GC content = {1:.1f} % \n'.format(nucsMb, gcContent))

    # Walk the codon table sorted by amino acid (the dict value), then codon.
    orderedTable = sorted(myNuc.rnaCodonTable.items(),
                          key=lambda item: (item[1], item[0]))
    for codon, aa in orderedTable:
        # A missing entry is treated the same as a zero count.
        codonCount = myNuc.codonComp.get(codon) or 0
        aaCount = myNuc.aaComp.get(aa) or 0
        val = codonCount / aaCount if aaCount else 0.0
        print('{:s} : {:s} {:5.1f} ({:6d})'.format(codon, aa, val * 100,
                                                   codonCount))
def main(inCL=None):
    '''
    Parse the command line and, when appropriate, print ORF information.

    With an explicit argument list (inCL) the arguments are only parsed and
    echoed. Without one, and only if the longestGene option is set, every
    FastA record is scanned with OrfFinder (forward and reverse frames) and
    the ORFs longer than minGene are printed, longest first. The parsed
    arguments are printed last in both cases.
    '''
    if inCL is not None:
        myCommandLine = CommandLine(inCL)
        print(myCommandLine.args)
        return

    myCommandLine = CommandLine()
    if myCommandLine.args.longestGene:
        reader = FastAreader()
        for header, sequence in reader.readFasta():
            print(header)

            finder = OrfFinder(sequence)
            finder.forwardFrame()
            finder.reverseFrame()

            # Keep only ORFs whose length (4th field) exceeds minGene ...
            keptOrfs = [
                orf for orf in finder.orfList
                if orf[3] > myCommandLine.args.minGene
            ]
            # ... and order them by decreasing length.
            keptOrfs.sort(key=lambda orf: orf[3], reverse=True)

            for frame, startPos, stopPos, orfLength in keptOrfs:
                print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(frame, startPos, stopPos, orfLength))
    print(myCommandLine.args)
Beispiel #4
0
def main(inCL=None):
    '''
    Find ORFs in every record of the input FASTA file and write them to the output file.

    The command line (or the explicit argument list inCL) supplies inFile,
    outFile, longestGene, minGene, start and stop. For each FASTA record a
    fresh orfFinder is built, the header is written to outFile, and the
    finder's results are written below it, one ORF per line, ordered by
    descending 4th field (length) and then descending 2nd field (start).

    :param inCL: optional list of command-line arguments; when None the
        CommandLine object is built without arguments.
    '''
    myCommandLine = CommandLine() if inCL is None else CommandLine(inCL)
    print(myCommandLine.args)

    args = myCommandLine.args
    inFile = args.inFile
    outFile = args.outFile
    longestGene = args.longestGene
    minGene = args.minGene
    startCodons = args.start
    stopCodons = args.stop

    fReader = FastAreader(inFile)
    # The context manager guarantees the output file is flushed and closed,
    # even if ORF finding raises part-way through.
    with open(outFile, 'w') as orfOut:
        for head, seq in fReader.readFasta():
            Orfer = orfFinder(longestGene, minGene, startCodons, stopCodons)
            orfOut.write(head + "\n")
            Orfer.findOrf(head, seq)
            # Negated keys give a descending order on length, then on start.
            orderedOrfs = sorted(Orfer.orfDict,
                                 key=lambda kv: (-kv[3], -kv[1]))
            for frame, start, stop, length in orderedOrfs:
                orfOut.write("{:+d} {:>5d}..{:>5d} {:>5d} \n".format(
                    frame, start, stop, length))
Beispiel #5
0
def main(inCL=None):
    '''
    Run the ORF finder over every record of the input FASTA file.

    When inCL is None a built-in default argument list is used; otherwise
    inCL is parsed. Results are written to the output file named on the
    command line.
    '''
    defaultArgs = ['tass2.fa', 'tass2ORFdata-ATG-100.txt', '--longestGene']
    myCommandLine = CommandLine(defaultArgs if inCL is None else inCL)
    options = myCommandLine.args

    orfReader = FastAreader(options.inFile)
    with open(options.outFile, 'w') as opFile:
        for head, seq in orfReader.readFasta():
            # OrfFinder works on a list of bases plus the reversed strand.
            bases = list(seq)
            reverseStrand = reverseDNA(bases)

            finding = OrfFinder(bases, head, options.minGene,
                                options.longestGene, options.start,
                                reverseStrand)
            # Each step receives the open output file handle.
            finding.findOrf(opFile)
            finding.findReverse(opFile)
            finding.writeFrameFile(opFile)
Beispiel #6
0
def main(inCL=None):
    """
    Use a fasta file to create gene trees for the four largest genes in related virus strains.

    This program takes a "combined" fasta file containing multiple genomes from similar virus strains and searches each
    genome for a common specific gene. These genes are then aligned to one another and converted to a .phy file that can
    then be used to create a detailed phylogenetic tree based on the variation between particular genes. In theory, the
    variation in the molecular composition of the genes will determine the trend in the variation of the genomes. This allows
    the "Gene Tree" to provide a rough outline of what the "Phylogenetic Tree" would look like having aligned full genomes.

    input: An input fasta file that contains virus headers and their DNA sequence
    output: A written and graphical description of four gene trees of the four largest orfs in coronaviruses. The output also
    contains a distance matrix for each gene tree and key correlating numbers to viruses.

    Assumptions:
    - input file must follow fasta format and must be a DNA nucleotide sequence.
    - input file contains solely corona viruses since they contain only 4 genes that are
    conserved among each other

    Note: inCL is accepted but not used; the input file name is fixed.
    """
    validNucs = frozenset('ACGT')
    headList = []  # Headers of the sequences in the fasta file
    orfList = []  # Per sequence: list of the ORF sub-sequences found in it

    myReader = FastAreader('Combined-ALL-SARS-CoV.fasta')
    for head, seq in myReader.readFasta():
        headList.append(head)
        # Drop every character that is not A/C/G/T in a single pass instead
        # of re-scanning the whole sequence once per invalid character.
        seq = ''.join(base for base in seq if base in validNucs)
        # NOTE(review): the arguments are presumably the minimum ORF length
        # (300) and a longest-ORF-only flag -- confirm against OrfFinder.
        orf = OrfFinder(seq, 300, True)
        geneOrfList = orf.getOrfs()
        # Fields 1 and 2 of each ORF entry are used as the slice bounds.
        geneSeq = [
            seq[openFrame[1] - 1:openFrame[2] - 1] for openFrame in geneOrfList
        ]
        orfList.append(geneSeq)

    # Calls methods to create SeqRecords and then a .phy file to print gene trees
    myPhylo = GeneTree()
    for i in range(4):  # First four genes of every sequence
        # The result is currently unused because the alignment and tree
        # steps below are disabled; the call itself is kept.
        records = myPhylo.geneSpecificRecord(orfList, headList, i)
        # alignments = myPhylo.fastaToPhylip(records)  # Makes a .phy file using a .fasta file
        print("GENE " + str(i + 1) + ":")
        # printTree = myPhylo.printGeneTree()  # Prints Gene Trees

    print(
        '\n\n============================================ K E Y ============================================\n'
    )
    # Key: running number = header text up to the first comma.
    for x, header in enumerate(headList):
        print("{} = {}".format(x, header.split(',')[0]))
Beispiel #7
0
    def individualAnalysis(self, fastafile):
        """
        Print sequence length, GC content and per-codon usage for one fasta file.

        A FastAreader for ``fastafile`` is stored on ``self.myReader`` and the
        GC value on ``self.gc``. For every codon (alphabetical order) the line
        shows codon, amino-acid letter, the codon's share of that amino acid
        in percent, and the raw count. A final line reports the sum of all
        codon counts.
        """
        self.myReader = FastAreader(fastafile)

        length, nucParams = self.sequenceLength(self.myReader)
        print("sequence length = {:.2f} Mb".format(length), "\n")

        self.gc = self.gcContent(nucParams)
        print("GC content = {:.1f} %".format(self.gc), "\n")

        codonCount = nucParams.codonComposition()
        runningTotal = 0

        for codon in sorted(codonCount):
            occurrences = codonCount[codon]
            runningTotal += occurrences
            aminoLetter = nucParams.rnaCodonTable[codon]
            aminoTotal = nucParams.aaComp[aminoLetter]
            # A codon that never occurred is reported as 0 % without
            # dividing by the amino-acid total.
            divisor = aminoTotal if occurrences != 0 else 1
            share = (occurrences / divisor) * 100
            print('{} : {} {:5.1f}% ({:6d})'.format(codon, aminoLetter, share, occurrences))
        print("Total number of amino acids counted is " + str(runningTotal))
def main():
    '''
    Read the FASTA file named by the first command-line argument and print,
    for every entry of findUnique.rnaSetlist (ordered by header), its header,
    its sequence and the result of uniqueFinder(), each item indented with
    dots by its index.
    '''
    reader = FastAreader(sys.argv[1])
    for head, seq in reader.readFasta():
        print(f"Generating powerset for: {head}")
        # The instance is not kept here; the entries are read back from
        # findUnique.rnaSetlist below.
        findUnique(head, seq)
        #fa.getPowerSets(seq)

    orderedRnas = sorted(findUnique.rnaSetlist, key=lambda entry: entry.header)
    for rna in orderedRnas:
        found = rna.uniqueFinder()
        print(rna.header)
        print(rna.sequence)
        byIndex = sorted(found.items(), key=lambda pair: pair[0])
        for index, piece in byIndex:
            print('.' * index, piece, sep='')
Beispiel #9
0
def main(myCommandLine=None):
    '''
    Find ORFs in every record of the input FASTA file and write them to the output file.

    :param myCommandLine: optional list of command-line arguments. When None,
        a built-in default list is used (tass2.fa -> tass2ORFdata-ATG-100.txt,
        longest gene only, start codon ATG, minimum gene length 100).

    The parsed arguments provide inFile, outFile, longestGene, start and
    minGene. The output file is created or truncated, then each record is
    processed by OrfFinder (forward strand, reverse strand, frame output).
    '''
    if myCommandLine is None:
        myCommandLine = CommandLine(['tass2.fa',
                                     'tass2ORFdata-ATG-100.txt',
                                     '--longestGene',
                                     '--start=ATG',
                                     '--minGene=100'])
    else:
        myCommandLine = CommandLine(myCommandLine)

    args = myCommandLine.args

    orfReader = FastAreader(args.inFile)
    # Opening with 'w' clears any previous content, and the context manager
    # closes the handle when done (the old code left it open).
    with open(args.outFile, 'w') as f:
        # Loop through all sequences in the file, compute their ORFs and
        # write them out.
        for head, seq in orfReader.readFasta():
            nucParams = NucParams('')
            nucParams.addSequence(seq)
            nucParams.buildNucleotide()
            bases = list(''.join(nucParams.codons))
            reverseBases = getReverseStrand(bases)

            finder = OrfFinder(bases, head,
                               minGene=args.minGene,
                               longestGene=args.longestGene,
                               start=args.start,
                               revSeq=reverseBases)
            finder.findOrfs(f)
            finder.findRevOrfs(f)
            finder.writeFramesToFile(f)
Beispiel #10
0
def genomeAnalyzer():
    '''
    Print a summary of one FastA genome: sequence length in Mb, GC content
    as a percentage, and the relative usage of every codon, ordered by amino
    acid and then codon.

    The input path is fixed inside the function.
    '''
    myReader = FastAreader(
        '/Users/stephaniegardner/Desktop/BME160/Lab04/HomoSapiensMitochondrion.fa'
    )
    myNuc = NucParams()
    for _header, sequence in myReader.readFasta():
        myNuc.addSequence(sequence)

    # Sequence length: total nucleotide count converted to Mb.
    megabases = myNuc.nucCount() / 1000000
    print('sequence length = {:.2f}Mb'.format(megabases), "\n")

    # GC content: G plus C counts over the total nucleotide count.
    composition = myNuc.nucComposition()
    gcFraction = (composition['G'] + composition['C']) / myNuc.nucCount()
    print('GC Content = {:.2%}'.format(gcFraction), "\n")

    # Individual codon analysis.
    codonComp = myNuc.codonComposition()
    aaComp = myNuc.aaComposition()
    # Sort key is the amino-acid letter followed by the codon text.
    orderedTable = sorted(myNuc.rnaCodonTable.items(),
                          key=lambda pair: pair[1] + pair[0])
    for codon, aa in orderedTable:
        total = aaComp[aa]  # occurrences of this amino acid
        codonCount = codonComp[codon]  # occurrences of this codon

        # Share of the codon within its amino acid; when the amino acid was
        # never seen the count is divided by 1 instead.
        val = codonCount / total if total != 0 else codonCount / 1

        print('{} : {} {:5.1f}% ({:6d})'.format(codon, aa, val * 100,
                                                codonCount))
Beispiel #11
0
def main():
    '''
    Build a tRNA object for every FastA record, compute unique and essential
    substrings for all of them, and print each tRNA (ordered by header) with
    its essential substrings aligned under the sequence using dots.
    '''
    reader = FastAreader('')

    for head, seq in reader.readFasta():
        # The instance is not kept here; the objects are read back from
        # tRNA.tRNAobjects below.
        tRNA(head, seq)

    # Ordered copy taken up front so output is alphabetical by header.
    orderedTrnas = sorted(tRNA.tRNAobjects, key=lambda t: t.head)

    for trna in tRNA.tRNAobjects:
        trna.buildUniques()
        trna.findEssentials()

    for trna in orderedTrnas:
        print(trna.head)
        print(trna.seq)
        # Order substrings by their first position in the sequence.
        for sub in sorted(trna.essentialSubs, key=lambda s: trna.seq.find(s)):
            offset = trna.seq.find(sub)
            print('{}{}'.format('.' * offset, sub))
def main():
    '''
    Print genome length, GC content and the relative frequency of every codon.

    All sequences yielded by FastAreader.readFasta() are added to one
    NucParams object. Output:

    * sequence length in Mb,
    * GC content as a percentage (a single trailing percent sign),
    * for each entry of rnaCodonTable, ordered by amino acid: codon, amino
      acid, the codon's share of that amino acid in percent, and its count.
      The entries are separated by a space, not a newline.
    '''
    myReader = FastAreader()
    myNuc = NucParams()
    for _head, seq in myReader.readFasta():
        myNuc.addSequence(seq)

    # Sequence length.
    seqLength = myNuc.nucCount()
    mbSeqLength = seqLength / 1000000
    print("sequence length: {0:0.2f}Mb".format(mbSeqLength), "\n")

    # GC content. The '%' presentation type already multiplies by 100 and
    # appends a percent sign, so no literal '%' follows the field.
    nucComp = myNuc.nucComposition()
    gAndC = nucComp["G"] + nucComp["C"]
    gcContent = gAndC / myNuc.nucCount()
    print("GC content = {0:0.1%}".format(gcContent), "\n")

    # Relative codon frequency and codon count for each codon.
    aaComp = myNuc.aaComposition()
    # NOTE(review): the return value is not used; counts are read from
    # myNuc.rnaCodonCompDict below. The call is kept in case it fills that
    # dictionary -- confirm against NucParams.
    myNuc.codonComposition()
    sortedRNATable = sorted(myNuc.rnaCodonTable.items(),
                            key=lambda item: item[1])  # by amino acid

    for codon, aa in sortedRNATable:
        aaNumber = aaComp[aa]  # denominator
        codonNumber = myNuc.rnaCodonCompDict[codon]  # numerator
        # Plain else so codonFreq is always bound.
        codonFreq = codonNumber / aaNumber if aaNumber > 0 else 0
        print("{:s} : {:s} {:5.1f} ({:6d})".format(codon, aa, codonFreq * 100,
                                                   codonNumber),
              end=" ")
Beispiel #13
0
class genomeAnalyzer:
    """Summarise a FASTA genome: length, GC content and relative codon usage.

    All sequences of the file are accumulated into one NucParams object when
    the instance is created; the other methods only read from that object.
    """

    def __init__(self, genome_path='testGenome.fa'):
        """Read every record of ``genome_path`` into a fresh NucParams.

        :param genome_path: value handed to FastAreader; also kept as
            ``self.genome``.
        """
        self.genome = genome_path
        self.myReader = FastAreader(self.genome)
        self.nuc_params = NucParams()
        for head, seq in self.myReader.readFasta():
            self.nuc_params.addSequence(seq)

    def genomeAnalysis(self):
        """Compute and print sequence length, GC content and codon usage.

        Length is printed in Mb and GC content in percent; the conversion is
        done here so that sequenceLength() and gcContent() keep returning raw
        values. Codon lines are grouped by amino acid (sorted), and within an
        amino acid by codon (sorted).
        """
        length = self.sequenceLength()
        print("sequence length = {:.2f}Mb\n".format(length / 1000000.0))

        gc = self.gcContent()
        print('GC content = {:.1f}%\n'.format(gc * 100))

        codon_counts = self.nuc_params.codonComposition()
        aa_comp = self.nuc_params.aaComposition()

        for aa in sorted(aa_comp):
            aa_count = aa_comp[aa]
            # aaRnaTable maps an amino acid to its RNA codons.
            for codon in sorted(self.nuc_params.aaRnaTable[aa]):
                codon_total = codon_counts[codon]
                # Relative usage of this codon within its amino acid; an
                # amino acid that never occurred is divided by 1 instead.
                divisor = aa_count if aa_count != 0 else 1
                total = (codon_total / divisor) * 100

                print('{} : {} {:5.1f}% ({:6d})'.format(
                    codon, aa, total, codon_total))

    def gcContent(self):
        """Return the GC fraction (0..1) over everything that was read.

        :return: (G + C) / total nucleotide count, or 0.0 when no
            nucleotides were counted.
        """
        nuc_comp = self.nuc_params.nucComposition()
        gc = nuc_comp['G'] + nuc_comp['C']
        total = self.nuc_params.nucCount()
        if not total:
            return 0.0
        return gc / total

    def sequenceLength(self):
        """Return the total nucleotide count reported by NucParams.nucCount().

        :return: the raw count (not converted to Mb)
        """
        return self.nuc_params.nucCount()
Beispiel #14
0
def main(inCL=None):
    """Parse the command line and, when appropriate, print the ORFs of each fasta record.

    With an explicit argument list (inCL) the arguments are only parsed and
    echoed. Without one, and only if longestGene is set, each record's header
    is printed followed by its ORFs (frame, start, stop, length) that are
    longer than minGene, longest first. The parsed arguments are printed last.
    """
    if inCL is not None:
        myCommandLine = CommandLine(inCL)
        print(myCommandLine.args)
        return

    myCommandLine = CommandLine()
    if myCommandLine.args.longestGene:
        reader = FastAreader()
        for header, sequence in reader.readFasta():
            print(header)
            orfData = OrfFinder(sequence)
            orfData.findOrfs()
            orfData.findRevOrfs()
            # Keep ORFs whose length (4th field) exceeds the minGene argument.
            longEnough = [
                orf for orf in orfData.orfs
                if orf[3] > myCommandLine.args.minGene
            ]
            # Longest first.
            longEnough.sort(key=lambda orf: orf[3], reverse=True)
            for frame, start, stop, length in longEnough:
                print('{:+d} {:>5d}..{:>5d} {:>5d}'.format(
                    frame, start, stop, length))
    print(myCommandLine.args)
Beispiel #15
0
    def individualAnalysis(self, fastafile):
        """Print sequence length, GC content and per-codon usage for one fasta file.

        :param fastafile: value handed to FastAreader.

        For every codon (alphabetical order) the printed line holds the
        codon, its amino acid, the codon's share of that amino acid in
        percent, and the raw codon count. An amino acid with a zero count is
        reported as 0.0 % instead of raising ZeroDivisionError.
        """
        myRead = FastAreader(fastafile)
        length, nucParams = self.sequenceLength(myRead)
        print("sequence length = {:1.2f} Mb".format(length))
        print("\n")

        gc = self.gcContent(nucParams)
        # Fixed-point with one decimal; the previous spec '{:2.1}' switched
        # to general format and printed values such as '4e+01'.
        print("GC content = {:.1f}%".format(gc))
        print("\n")

        codonCount = nucParams.codonComposition()
        # NOTE(review): the return value is not used; the call is kept in
        # case it fills nucParams.aminoAcid, which is read below -- confirm.
        nucParams.aminoCompo()
        for codon in sorted(codonCount):
            total = codonCount[codon]
            amino = nucParams.rnaCodonTable[codon]
            aminoCount = nucParams.aminoAcid.count(amino)

            finalTotal = total / aminoCount * 100 if aminoCount else 0.0
            print('{} : {} {:5.1f}% ({:6d})'.format(codon, amino, finalTotal,
                                                    total))
Beispiel #16
0
Notes:
The headers in the positiveModomics.fa have to be in the Henry format
The headers in the unsorted.fa have to be in the gtRNAdb 11/18/2020 format
'''

# Usage: <script> <sortedModomics fasta> <fasta to split>
# Splits the second fasta file into two files next to it: records whose
# header fields match a record of the first file go to the "<modifier>"
# file, all others to the "neg" file.
sortedModomics = sys.argv[1]
# Third '_'-separated token of the file name, with '-' treated as '_'.
modifier = sortedModomics.replace('-', '_').split('_')[2]

fastaFile = sys.argv[2]
# Split on the LAST dot only, so paths such as './dir/x.fa' or 'a.b.fa'
# work; splitting on every dot failed to unpack for those.
base, ext = fastaFile.rsplit('.', 1)

newFileName_pos = f'{base}_{modifier}.{ext}'
newFileName_neg = f'{base}_neg.{ext}'

sortedHeaderSet = set()
headerSet = set()
for sortedHeader, sortedSequence in FastAreader(sortedModomics).readFasta():
    # Fields 2-3 of the '-'-separated header, with 'Ini' renamed to 'iMet'
    # so that it can match the naming used by the other file.
    sortedHeaderSet.add(
        tuple(sortedHeader.replace('Ini', 'iMet').split('-')[2:4]))

with open(newFileName_pos, 'w') as posFile, \
        open(newFileName_neg, 'w') as negFile:
    for header, sequence in FastAreader(fastaFile).readFasta():
        # Fields 1-2 of this file's headers are compared with fields 2-3 of
        # the sorted file's headers.
        thing = tuple(header.split('-')[1:3])
        headerSet.add(thing)
        target = posFile if thing in sortedHeaderSet else negFile
        target.write(f'>{header}\n{sequence}\n')
Beispiel #17
0
sys.path.insert(2, "../../../CMPipelines/GenerateCM")
import os
from sequenceAnalysis import FastAreader
import pandas as pd

# One row per fasta record found under <filepath>/<organism>/<mod>/<pos file>.
bigOlList = []

filepath = "./Potentially massive folder/speciesModDB/"
for org in os.listdir(filepath):
    orgDir = f"{filepath}{org}"
    for mod in os.listdir(orgDir):
        modDir = f"{orgDir}/{mod}"
        for pos in os.listdir(modDir):
            fullFilePath = f"{modDir}/{pos}"
            for header, sequence in FastAreader(fullFilePath).readFasta():
                # Organism directory name: <genus>_<species>_<short>...
                orgFields = org.split('_')
                genus = orgFields[0]
                species = orgFields[1]
                organismShort = orgFields[2]
                organismName = orgFields[0] + ' ' + orgFields[1]

                # Last '-'-separated token of the modification directory and
                # second '_'-separated token of the position file name.
                actualMod = mod.split('-')[-1]
                actualPos = pos.split('_')[1]

                # Header fields 2, 3 and 4 ('-'-separated).
                headerFields = header.split('-')
                isotype = headerFields[2]
                isoacceptor = headerFields[3]
                isodecoder = headerFields[4]

                row = [
                    genus, species, organismName, organismShort, actualMod,
                    actualPos, isotype, isoacceptor, isodecoder
                ]
                bigOlList.append(row)
Beispiel #18
0
import sys
sys.path.insert(1, "C:\\Users\\mattk\\Desktop\\python scripts\\todd stuff\\BioTools")
from sequenceAnalysis import FastAreader
import pandas as pd

'''
Given an aligned fasta (yeast) file
This program will write the output to a 
csv

'''

fileToDisplay = "sacCer3-trnaalign.fa"

# Maps the first space-separated token of each header to the record's
# sequence split into a list of single characters (one per position).
tRNAdic = {}
for header, seq in FastAreader(fileToDisplay).readFasta():
    header, _, _ = header.partition(' ')
    tRNAdic[header] = list(seq)

#for yeast
columns = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 
			'11', '12', '13', '14', '15', '16', '17a', '17b', '18', 
			'19', '20a', '20b', '20c', '21', '22', '23', '24', '25', 
			'26', '27', '28', '29', '30', '31', '32', '33', '34', 
			'35', '36', '37', '38', '39', '40', '41', '42', '43', 
			'44', '45', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 
			'e8', 'e9', 'e10', 'e11', 'e12', 'e13', 'e14', 'e15', 
			'e16', 'e17', 'e18', 'e19', '46', '47', '48', '49', 
			'50', '51', '52', '53', '54', '55', '56', '57', '58', 
			'59', '60', '61', '62', '63', '64', '65a', '65b', '65c', 
			'65d', '65e', '70', '71', '72', '73', '74', '75', '76', 
Beispiel #19
0
class GraphicRep:
    '''
    Reads and prepares the files needed for a matplotlib graphical representation
    of mtDNA. Can eventually be used for other circular DNA.

    All input is loaded once, while the class body executes:
        seq / head      - last record of 'HomoSapiensMitochondrion.fa'
        funcLoc         - 'mtDNACoding.xlsx' (one row per locus)
        mitoMap         - 'MitoMap.xlsx' (one row per mutation)
        locDict         - locus name -> list of funcLoc rows
        mutDict         - locus name -> list of mitoMap rows
        locDictValues   - list(locDict.values())
        mutDictValues   - list(mutDict.values())

    Rows are accessed by position. As used by the methods below:
        funcLoc row: 0 locus name, 1 start, 2 stop
        mitoMap row: 0 position, 1 locus, 2 disease, 3 base mutation,
                     4 "codes for", 7 pathogenicity
    NOTE(review): the column meanings come from how the values are printed
    and plotted here - confirm against the spreadsheets.
    '''

    locusRowCount = 38 #number of rows read from the function location spread sheet
    mutationRowCount = 330 #number of rows read from the mutation spread sheet

    #directory the figures are saved to; must end with a path separator
    outputDir = '/Users/stephaniegardner/Desktop/BME160/FinalProject/'

    myReader = FastAreader('HomoSapiensMitochondrion.fa')
    for head, seq in myReader.readFasta():
        pass #only the last record's header and sequence are kept

    excelFile = 'mtDNACoding.xlsx' #names workbook
    funcLoc = pd.read_excel(excelFile) #loads spreadsheet

    excelFile1 = 'MitoMap.xlsx' #names workbook
    mitoMap = pd.read_excel(excelFile1)

    locDict = {}
    mutDict = {}

    for j in range(locusRowCount): #groups the function location rows by locus name (column 0)
        locus = funcLoc.iloc[j,0]
        locDict.setdefault(locus,[]).append(funcLoc.iloc[j])

    locDictValues = list(locDict.values()) #[locus][row][column]

    for j in range(mutationRowCount): #groups the mutation rows by locus name (column 1)
        locus = mitoMap.iloc[j,1]
        mutDict.setdefault(locus,[]).append(mitoMap.iloc[j])

    mutDictValues = list(mutDict.values()) #[locus][entry number][column]

    #abbreviation key appended to the option B report when diseases are listed
    _diseaseGlossary = (
        '\n\nLHON-Leber Hereditary Optic Neuropathy\n'
        'MM-Mitochondrial Myopathy\n'
        'AD-Alzeimers Disease\n'
        'LIMM-Lethal Infantile Mitochondrial Myopathy\n'
        'ADPD-Alzeimers Disease and Parkinsonss Disease\n'
        'MMC-Maternal Myopathy and Cardiomyopathy\n'
        'NARP-Neurogenic muscle weakness, Ataxia, and Retinitis Pigmentosa\n'
        'FICP-Fatal Infantile Cardiomyopathy Plus, a MELAS-associated cardiomyopathy\n'
        'MELAS-Mitochondrial Encephalomyopathy, Lactic Acidosis, and Stroke-like episodes\n'
        'LDYT-Lebers hereditary optic neuropathy and DYsTonia\n'
        'MERRF-Myoclonic Epilepsy and Ragged Red Muscle Fibers\n'
        'MHCM-Maternally inherited Hypertrophic CardioMyopathy\n'
        'CPEO-Chronic Progressive External Ophthalmoplegia\n'
        'KSS-Kearns Sayre Syndrome\n'
        'DM-Diabetes Mellitus\n'
        'DMDF-Diabetes Mellitus + DeaFness\n'
        'CIPO-Chronic Intestinal Pseudoobstruction with myopathy and Ophthalmoplegia\n'
        'DEAF-Maternally inherited DEAFness or aminoglycoside-induced DEAFness\n'
        'PEM-Progressive encephalopathy\n'
        'SNHL-SensoriNeural Hearing Loss\n'
    )

    @staticmethod
    def _mutationEntries():
        '''Yields every loaded mutation row, locus by locus, in load order.'''
        for entries in GraphicRep.mutDictValues:
            yield from entries

    @staticmethod
    def _pathogenicityColor(val):
        '''
        Maps a pathogenicity value to its marker color.
        The bands are contiguous so that fractional values such as 20.5 always
        get a color of their own: (0-20] purple, (20-40] blue, (40-60] green,
        (60-80] yellow, (80-100) orange, exactly 100 red.
        NaN and values outside 0-100 are drawn black.
        '''
        if math.isnan(val) or val < 0.0 or val > 100.0:
            return 'black'
        if val <= 20.0:
            return 'purple'
        if val <= 40.0:
            return 'blue'
        if val <= 60.0:
            return 'green'
        if val <= 80.0:
            return 'yellow'
        if val < 100.0:
            return 'orange'
        return 'red'

    @staticmethod
    def _labelStyle():
        '''Returns a fresh dict of the keyword arguments shared by every annotation.'''
        return dict(textcoords='figure fraction',
                    zorder=8,
                    arrowprops=dict(width=.05, headwidth=2,
                                    facecolor='black', shrink=.05),
                    horizontalalignment='left',
                    verticalalignment='bottom')

    @staticmethod
    def _newPolarAxes():
        '''Creates a figure holding a single polar axis with 0 at the top; returns (figure, axis).'''
        fig = plt.figure()
        ax = fig.add_subplot(111, polar=True)
        ax.set_aspect('equal') #set figure to print out with equal dimensions
        ax.set_xticklabels(['Origin','','','',''])
        ax.grid()
        ax.set_rticks([]) #clear radius ticks from figure
        ax.set_theta_zero_location('N',offset = 0) #set the 0 location to the top of circle
        return fig, ax

    @staticmethod
    def _plotTick(ax, theta, color, size, zorder, **kwargs):
        '''Plots one (|) marker at angle theta (radians), rotated to point along the radius.'''
        ax.plot(theta, 0, marker=(2, 0, theta*(180/np.pi)), color=color,
                markersize=size, zorder=zorder, **kwargs)

    @staticmethod
    def _drawGenome(ax, mutationSize, locusStartSize, baseSize, labelMutations=False):
        '''
        Draws every mutation marker and every locus onto ax.
            mutationSize    - marker size of the mutation markers
            locusStartSize  - marker size of the gray marker at each locus start
            baseSize        - marker size of the per-base locus markers
            labelMutations  - when True each mutation marker is labelled with
                              column 2 of its row
        Returns the angle (radians) covered by one base.
        '''
        anglePerBase = (np.pi*2)/len(GraphicRep.seq) #the full circle is the whole sequence
        cmap = GraphicRep.get_cmap(GraphicRep.locusRowCount) #one color per locus row

        for entry in GraphicRep._mutationEntries():
            color = GraphicRep._pathogenicityColor(float(entry.iloc[7]))
            extra = {'label': entry.iloc[2]} if labelMutations else {}
            GraphicRep._plotTick(ax, anglePerBase*entry.iloc[0], color,
                                 mutationSize, 6, **extra)

        for index, rows in enumerate(GraphicRep.locDictValues):
            start = rows[0].iloc[1]
            stop = rows[0].iloc[2]
            #gray marker where the locus starts
            GraphicRep._plotTick(ax, anglePerBase*start, 'gray', locusStartSize, 6)
            #every base of the locus, in the color assigned to that locus
            for k in range(start, stop):
                GraphicRep._plotTick(ax, anglePerBase*k, cmap(index), baseSize, 5)

        return anglePerBase

    @staticmethod
    def _annotateLoci(ax, anglePerBase, first, last, textStart, textStep):
        '''
        Labels loci first..last-1 with an arrow pointing at each locus start.
        textStart is the (x, y) figure fraction of the first label and
        textStep the (dx, dy) added for every following label.
        '''
        x, y = textStart
        dx, dy = textStep
        for rows in GraphicRep.locDictValues[first:last]:
            row = rows[0]
            ax.annotate(row.iloc[0],
                        xy=(anglePerBase*row.iloc[1], .01), # theta, radius
                        xytext=(x, y), # fraction, fraction
                        **GraphicRep._labelStyle())
            x = x + dx
            y = y + dy

    def optionA(self,locus):
        '''
        OPTION A:
        input desired locus for viewing/analysis
            writes the percentage of all loaded mutations found at that locus
            writes the average pathogenicity of the entries that have one
            saves all info to the text file <locus>MutationInfo

        Does nothing when locus is None. A locus without mutation data gets a
        file that says so.
        '''
        if locus is None:
            return

        with open(locus+'MutationInfo', 'w') as f: #prepare file to save
            try:
                entries = GraphicRep.mutDict[locus]
            except KeyError:
                f.write('No mutation data is available for this locus')
                return

            share = (len(entries)/GraphicRep.mutationRowCount) * 100
            f.write("Percent of total mutations found at this locus are: {0:0.2f} % \n".format(share))

            #entries without a pathogenicity (NaN) are left out of the average
            scores = [float(entry.iloc[7]) for entry in entries]
            known = [score for score in scores if not math.isnan(score)]

            if not known:
                f.write('No pathogenicity data is available for this locus')
            else:
                f.write("Average Pathogenicity percentage for this locus is: {0:0.2f}%\n".format(sum(known)/len(known)))


    def optionB(self,low,high,disease):
        '''
        OPTION B:
            input pathogenicity interval [low, high] (both ends included)
                writes every mutation whose pathogenicity lies in the interval
                to the text file mtDNAPathogInfo
            disease is True  - each line also shows the linked disease and the
                               abbreviation key is appended
            disease is False - pathogenicity, "codes for" and base mutation only

        Does nothing when low or high is None. Any other value of disease
        leaves the file empty.
        '''
        if low is None or high is None:
            return

        with open('mtDNAPathogInfo', 'w') as f:
            if disease is True:
                f.write("Sorted based on position\nPathogenicity ({},{})- Codes for - Base Mutation - Linked Disease\n".format(low,high))
            elif disease is False:
                f.write("Sorted based on position\nPathogenicity ({},{})- Codes for - Base Mutation\n".format(low,high))
            else:
                return

            for entry in GraphicRep._mutationEntries():
                #NaN fails both comparisons, so entries without a value are skipped
                if not (low <= float(entry.iloc[7]) <= high):
                    continue
                if disease:
                    f.write("{}% - {} - {} - {}\n".format(entry.iloc[7],entry.iloc[4],entry.iloc[3],entry.iloc[2]))
                else:
                    f.write("{}% - {} - {}\n".format(entry.iloc[7],entry.iloc[4],entry.iloc[3]))

            if disease:
                f.write(GraphicRep._diseaseGlossary)

    # TODO: option C (list the homoplasmic / heteroplasmic mutations of a locus,
    # columns 5 and 6 of the mutation rows) was sketched here but never finished.

    @staticmethod
    def get_cmap(n, name='hsv'):
        '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
        RGB color; the keyword argument name must be a standard mpl colormap name.
        https://stackoverflow.com/questions/14720331/how-to-generate-random-colors-in-matplotlib'''
        return plt.get_cmap(name, n)


    def createMap(self):
        '''
        creates map of mitochondria DNA with color coded mutations and loci
        and saves it as mtDNAMutations.png in outputDir
        '''
        fig, ax = GraphicRep._newPolarAxes()
        anglePerBase = GraphicRep._drawGenome(ax, 20, 60, 40)

        ax.annotate('D Loop',
                    xy=(0, 0), # theta, radius
                    xytext=(.3, .9), # fraction, fraction
                    **GraphicRep._labelStyle())

        #locus labels run down the left edge, along the bottom, then up the right edge
        GraphicRep._annotateLoci(ax, anglePerBase, 0, 15, (.1, .90), (0, -.06))
        GraphicRep._annotateLoci(ax, anglePerBase, 15, 27, (.17, .05), (.06, 0))
        GraphicRep._annotateLoci(ax, anglePerBase, 27, 38, (.9, .14), (0, .07))

        fig.savefig(GraphicRep.outputDir+'mtDNAMutations.png')


    def zoomPlot(self,locus):
        '''
        zooms in on a specified locus given in the command line and saves figure to
        <locus>zoomPlot.png in outputDir.
        The map is drawn as in createMap (with larger markers and labelled
        mutations); the visible angle range is then narrowed to the locus.
        Does nothing when locus is None.
        '''
        if locus is None:
            return

        figz, axz = GraphicRep._newPolarAxes()
        GraphicRep._drawGenome(axz, 200, 150, 100, labelMutations=True)

        #to zoom in on a certain locus, theta min and max are reset to the
        #locus start and stop (in degrees) with 5 degrees of margin on each side
        degreesPerBase = 360/len(GraphicRep.seq)
        for rows in GraphicRep.locDictValues:
            row = rows[0]
            if locus == row.iloc[0]:
                axz.set_thetamin(degreesPerBase*row.iloc[1]-5)
                axz.set_thetamax(degreesPerBase*row.iloc[2]+5)

        axz.set_title(locus) #titles the figure with locus name

        figz.savefig(GraphicRep.outputDir+locus+'zoomPlot.png')
Beispiel #20
0
def main():
    """
    Read the FASTA file named on the command line and print its substrings.

    For every record the cleaned sequence and the list of its substrings are
    collected, the substrings that occur in no other sequence are computed,
    and one FindUnique object per sequence is built and asked to print.

    Exits after printing a usage hint unless exactly one argument is given.
    """
    # exactly one argument (the input file) is required
    if len(sys.argv) != 2:
        print("Please enter a file after the program")
        sys.exit()

    # storage filled while reading the file
    allHeadsList = []
    allSubSequencesList = []
    seqSubseqDict = {}
    seqUniqueSubseqDict = {}

    substringMaker = SubstringGenerator()
    nucParam = NucParams('')
    orfReader = FastAreader()

    # the file is closed again as soon as every record has been read
    with open(sys.argv[1]) as infile:
        for head, seq in orfReader.readFastaStdIn(infile):
            cleanSeq = str(nucParam.stripSequence(seq))
            subSequence = list(substringMaker.get_substrings(cleanSeq))
            # NOTE: keyed by sequence, so records with identical sequences
            # collapse into one entry while their headers are all kept
            seqSubseqDict[cleanSeq] = subSequence
            allHeadsList.append(head)
            allSubSequencesList.append(subSequence)

    # unique substrings = own substrings minus the substrings of all others
    for seq, ownSubseqs in seqSubseqDict.items():
        setAllElse = set()
        for otherSeq, otherSubseqs in seqSubseqDict.items():
            if otherSeq != seq:
                setAllElse.update(otherSubseqs)
        seqUniqueSubseqDict[seq] = set(ownSubseqs) - setAllElse
    # NOTE(review): seqUniqueSubseqDict is computed but FindUnique below still
    # receives the full substring list - confirm which one it expects.

    # pair each sequence with the header at the same position
    tRNAList = [
        FindUnique(head.lstrip(), thisBase, subseqs)
        for head, (thisBase, subseqs) in zip(allHeadsList,
                                             seqSubseqDict.items())
    ]

    for trna in tRNAList:
        trna.printSubstrings()
Beispiel #21
0
                combinations.append(goingBackward)

    return combinations


def makeDirectory():
    """
    Create the '<name>_Sliced' output directory and return its path.

    <name> is the part of the module-level `fastafile` before its first '.',
    and the directory is created inside the module-level `whereToMake`.
    An already existing directory is reused without complaint.
    """
    stem, _, _ = fastafile.partition('.')
    target = os.path.join(whereToMake, stem + '_Sliced')
    try:
        os.mkdir(target)
    except FileExistsError:
        # left over from an earlier run: nothing to create
        return target
    return target


# Region names and their positions:
#   0         1              2             3         4
#'AA-stem', 'D-arm', 'Anticodon loop', 'Introns', 'T-arm'
directory = makeDirectory()

# One output FASTA per combination of regions: every record of the input
# file is windowed down to the chosen regions and written under its
# original header.
for desiredRegions in generateCombinations():
    records = []
    for header, sequence in FastAreader(fastafile).readFasta():
        windowMaker = CreateWindows(secondaryStructure,
                                    sequence=sequence,
                                    desiredRegions=desiredRegions)
        records.append(f'>{header}\n{windowMaker.getWindowedSequence()}\n')
    newFasta = ''.join(records)

    fastaName = "_".join(desiredRegions) + '_Sliced.fa'
    outPath = os.path.join(directory, fastaName)
    with open(outPath, 'w') as file:
        file.write(newFasta)
Beispiel #22
0
EVALS = 'eValues.txt'

positiveFasta = "hasM1A.fa"

# NAMES and EVALS start out as file names and are rebound to the parsed
# lists once both files are open.
with open(NAMES) as nameFile, open(EVALS) as evalFile:
    NAMES = []
    EVALS = []
    # the two files are read in lockstep: line i of one belongs to line i of the other
    for name, evalue in zip(nameFile, evalFile):
        # keep the text before the first space (when a line has no space,
        # find() returns -1 and the slice drops the last character instead)
        NAMES.append(name[:name.find(' ')])
        EVALS.append(-math.log10(float(evalue)))

    # names of the positive records: header text before the first space
    headers = [
        header.split(' ')[0]
        for header, sequence in FastAreader(positiveFasta).readFasta()
    ]

    # green when any positive header occurs inside the name, black otherwise
    COLOR = [
        'green' if any(header in name for header in headers) else 'black'
        for name in NAMES
    ]
def main():
    """
    Compare the composition of two genomes and print the result.

    Reads 'vulgaris.fasta' and 'Shewanella.fasta' with FastAreader, feeds
    every record of each file into its own NucParams object, then prints the
    last header, sequence length and GC content of both genomes followed by
    a side-by-side table of codon usage relative to the encoded amino acid.
    """
    # imports FastAreader and NucParams classes from the sequenceAnalysis module
    from sequenceAnalysis import FastAreader, NucParams

    def loadGenome(fastaPath):
        """Return (last header, filled NucParams, GC percent) for one file."""
        reader = FastAreader(fastaPath)
        params = NucParams('')
        lastHeader = ''
        for header, sequence in reader.readFasta():
            params.addSequence(sequence)
            lastHeader = header
        # GC content is computed once, after every record has been added
        total = params.nucCount()
        gcCount = params.nucDic.get('G', 0) + params.nucDic.get('C', 0)
        gcPercent = gcCount / total * 100 if total else 0.0
        return lastHeader, params, gcPercent

    def relativeUsage(codonCount, aaCount):
        """Return codon count relative to its amino acid count, 0.0 if none."""
        # NOTE(review): the original comment said "times 100" while the code
        # multiplies by 1000 - confirm which scale is intended.
        return codonCount / aaCount * 1000 if aaCount else 0.0

    headx, seq1, GC1 = loadGenome('vulgaris.fasta')
    heady, seq2, GC2 = loadGenome('Shewanella.fasta')

    ### PRINT STATEMENTS ###
    # header, sequence length and GC content of the two genomes
    print("Genome1 = %s" % headx)
    print("Genome2 = %s" % heady)
    print("sequence lengths: Genome1 = %.2f Mb  Genome2 = %.2f Mb"
          % (seq1.nucCount() / 1000000, seq2.nucCount() / 1000000))
    print("")
    print("GC contents:      Genome1 = %.1f%%    Genome2 = %.1f%%"
          % (GC1, GC2))
    print("")
    print("      Genome1                          Genome2")
    print("")

    # one table row per codon, sorted by codon
    # (assumes both objects carry the same rnaCodonTable - TODO confirm)
    for codon, aminoAcid in sorted(seq1.rnaCodonTable.items()):
        try:
            count1 = seq1.codonDic[codon]
            aaCount1 = seq1.aaDic[aminoAcid]
            count2 = seq2.codonDic[codon]
            aaCount2 = seq2.aaDic[aminoAcid]
        except KeyError:
            # codons or amino acids missing from the count tables are skipped
            continue

        freq1 = relativeUsage(count1, aaCount1)
        freq2 = relativeUsage(count2, aaCount2)

        # prints two genomes codon usage right next to each other
        print("%s : %s %5.1f (%6d)       %s : %s %5.1f (%6d)"
              % (codon, aminoAcid, freq1, count1,
                 codon, aminoAcid, freq2, count2))