Esempio n. 1
0
def real_dna():
    """Run the gene finder on the sequence stored in ./data/X73525.fa.

    The sequence is loaded with load_seq, a threshold is obtained from
    longest_ORF_noncoding (called with 500 as its second argument), and both
    are handed to gene_finder.

    returns: whatever gene_finder(dna, threshold) returns.
    """
    sequence = load_seq("./data/X73525.fa")
    cutoff = longest_ORF_noncoding(sequence, 500)
    return gene_finder(sequence, cutoff)
Esempio n. 2
0
def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        The longest ORF of dna is compared against the threshold obtained from
        longest_ORF_noncoding(dna, 1500); it is translated only when it is
        strictly longer than that threshold.

        dna: a DNA sequence
        returns: a list holding the translated longest ORF, or an empty list
                 when the longest ORF does not exceed the threshold.
    """
    threshold = longest_ORF_noncoding(dna, 1500)
    longest = longest_ORF(dna)
    if len(longest) > threshold:
        # Translate the ORF that passed the check, not the whole input strand
        return [coding_strand_to_AA(longest)]
    return []


if __name__ == "__main__":
    # Driver code lives outside the function so gene_finder no longer calls
    # itself without end.
    dna = load_seq("./data/X73525.fa")  # obtaining genes
    print(gene_finder(dna))  # showing the list of amino acids
Esempio n. 3
0
def gene_finder(dna):
    """ Returns the amino acid sequences encoded by the long ORFs of dna.

        The threshold comes from longest_ORF_noncoding(dna, 1500); every ORF
        found on either strand whose length is at least that threshold is
        translated with coding_strand_to_AA.

        dna: a DNA sequence
        returns: a list of amino acid strings, in the order the ORFs are
                 reported by find_all_ORFs_both_strands.
    """
    # Use the sequence passed in by the caller; it used to be overwritten by
    # a file loaded from a hard-coded absolute path.
    threshold = longest_ORF_noncoding(dna, 1500)
    return [
        coding_strand_to_AA(orf)
        for orf in find_all_ORFs_both_strands(dna)
        if len(orf) >= threshold
    ]
Esempio n. 4
0
        returns: a list of all amino acid sequences coded by the sequence dna.
        
        >>> gene_finder("ATGCGAATGTAGCATCAAA")
        ['MRM', 'MLHSH']
    """
    print 'Finding threshold...'
    threshold = len(longest_ORF_noncoding(dna, 1500))
    print threshold
    
    all_ORFs = find_all_ORFs_both_strands(dna) 
    returns = []
    i=0

    print 'Entering while loop... '
    while i < len(all_ORFs):
        if len(all_ORFs[i]) >= threshold:
    	    returns.append(coding_strand_to_AA(all_ORFs[i]))
    	    print 'Added sequence'
        i+=1

    print str(len(returns)) + 'sequences added: \n'
    print returns


if __name__ == "__main__":
    import doctest
    # Doctest runs are currently disabled; uncomment one of the lines below
    # to re-enable them.
    #doctest.testmod()
    #doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose=True)

# NOTE: this call sits outside the __main__ guard, so it also runs whenever
# the module is imported, not only when it is executed as a script.
gene_finder(load_seq("./data/X73525.fa"))
Esempio n. 5
0
    return proteins


def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        dna: a DNA sequence
        returns: a list of all amino acid sequences coded by the sequence dna,
                 one per entry returned by ORFsinOrder.
    """
    num_trials = 1500
    amirandom = longest_ORF_noncoding(dna, num_trials)
    # ORFsinOrder is defined elsewhere; presumably it yields the ORFs that
    # pass the amirandom threshold -- confirm against its definition.
    finallist = ORFsinOrder(dna, amirandom)
    # Translate every candidate; the previous index loop stopped one short
    # and silently dropped the last entry.
    return [coding_strand_to_AA(orf) for orf in finallist]





# Run the module's doctests when executed as a script.
if __name__ == "__main__": 
    import doctest
    doctest.testmod()

# NOTE: this assignment is outside the __main__ guard, so gene_finder also
# runs whenever the module is imported.
mygenes = gene_finder(load_seq("./data/X73525.fa"))                         # mygenes holds the list returned by gene_finder for the X73525 sequence.
Esempio n. 6
0
    # Sets i to weep dna, j to sweep codons and sees where it sees where i:i+3 is in Codons
    # and returns the AA sequence for that codon.

    return ''.join([ aa[j] for i in range(0,len(dna),3) for j in range(len(codons)) if dna[i:i+3] in codons[j]])


def gene_finder(dna, threshold=1500):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        dna: a DNA sequence
        threshold: value forwarded as the second argument of
                   longest_ORF_noncoding (presumably the number of shuffle
                   trials -- confirm against that helper). Defaults to 1500,
                   the value the script entry point passes.
        returns: a tuple (count, sequences) where sequences is the list of
                 amino acid sequences translated from every ORF strictly
                 longer than the computed cut-off, and count is its length.
    """
    # Cut-off length an ORF must exceed to be kept
    ORF_length = longest_ORF_noncoding(dna, threshold)

    # Translate only the ORFs (both strands) that are longer than the cut-off
    Real_ORFs = [
        coding_strand_to_AA(orf)
        for orf in find_all_ORFs_both_strands(dna)
        if len(orf) > ORF_length
    ]

    return len(Real_ORFs), Real_ORFs


# Script entry point: run the doctests, then print the gene_finder result for
# the X73525 sequence.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
    # 1500 is passed as a second positional argument to gene_finder.
    print gene_finder( load_seq("./data/X73525.fa"), 1500)
from gene_finder import *
from load import load_seq
# Loaded once at import time from ./data/X73525.fa and kept as a module-level value.
salmonella_dna = load_seq("./data/X73525.fa")

def two_lists_contain_same_elements(list1, list2):
    """ Checks whether two lists hold the same elements, ignoring order.

        Duplicates count: each element must occur the same number of times
        in both lists. Elements only need to support ==, so unhashable items
        (e.g. nested lists) are accepted.

        list1: first list
        list2: second list
        returns: True if the lists are equal as multisets, False otherwise
    """
    if len(list1) != len(list2):
        return False

    # Cross off one matching entry per element so that duplicates in list1
    # cannot all be satisfied by a single entry of list2.
    remaining = list(list2)
    for list_item in list1:
        try:
            remaining.remove(list_item)
        except ValueError:
            return False
    return True


def coding_strand_to_AA_unit_tests():
    """ Unit tests for the coding_strand_to_AA function """

    # DNA input strands 
    dna_input1 = "ACTGCCCC"
    dna_input2 = "AGCTGAGGGTGTTTTGGA"
    dna_input3 = "CAGGCTTGCGGCTTCTTAA"

    # Expected output amino acid strands
    e_output1 = "TA"
    e_output2 = "S|GCFG"
    e_output3 = "QACGFL"

    # Actual output amino acid strands 
    a_output1 = coding_strand_to_AA(dna_input1)
Esempio n. 8
0
def main():
    """Load ./data/X73525.fa and run gene_finder on it.

    The threshold handed to gene_finder is the result of
    longest_ORF_noncoding(dna, 1500). The gene_finder result is computed
    but neither returned nor printed.
    """
    sequence = load_seq("./data/X73525.fa")
    min_length = longest_ORF_noncoding(sequence, 1500)
    gene_finder(sequence, min_length)
Esempio n. 9
0

def gene_finder(dna):
    """ Returns the amino acid sequences coded by all ORFs of dna that are
        longer than a computed threshold.

        The threshold is not a parameter: it is obtained from
        longest_ORF_noncoding(dna, 1500).

        dna: a DNA sequence
        returns: a list of amino acid sequences, one for every ORF (both
                 strands) strictly longer than the threshold.
    """
    min_length = longest_ORF_noncoding(dna, 1500)

    proteins = []
    for candidate in find_all_ORFs_both_strands(dna):
        # Keep only ORFs strictly longer than the threshold
        if len(candidate) > min_length:
            proteins.append(coding_strand_to_AA(candidate))
    return proteins


# Script entry point: run the module doctests, then print the gene_finder
# result for the sequence stored in ./data/X73525.fa.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

    dna = load_seq("./data/X73525.fa")

    print gene_finder(dna)
Esempio n. 10
0
	"""

    acids = []
    for i in range(0, len(dna) - 2, 3):
        acids.append(aa_table[dna[i:i + 3]])
    return ''.join(acids)


def gene_finder(dna):
    """ Returns the amino acid sequences coded by all ORFs of dna that reach
        a computed threshold.

        The threshold is not a parameter: it is obtained from
        longest_ORF_noncoding(dna, 1500).

        dna: a DNA sequence
        returns: a list of amino acid sequences, one for every ORF (both
                 strands) whose length is at least the threshold.
    """
    min_length = longest_ORF_noncoding(dna, 1500)
    return [
        coding_strand_to_AA(orf)
        for orf in find_all_ORFs_both_strands(dna)
        if len(orf) >= min_length
    ]


# Script entry point: run the module doctests, then print the gene_finder
# result for the sequence stored in ./data/X73525.fa.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
    print(gene_finder(load_seq('./data/X73525.fa')))
Esempio n. 11
0
def run_gene_finder():
    """Load ./data/X73525.fa and return the result of gene_finder on it."""
    sequence = load_seq('./data/X73525.fa')
    proteins = gene_finder(sequence)
    return proteins
Esempio n. 12
0
def get_threshold():
    """Return longest_ORF_noncoding(dna, 1500) for the ./data/X73525.fa sequence.

    The value is returned, not printed. The original author noted 789 as a
    result; whether that is reproducible is not visible from this function.
    """
    sequence = load_seq('./data/X73525.fa')
    threshold = longest_ORF_noncoding(sequence, 1500)
    return threshold
Esempio n. 13
0
def gene_finder_salmonella():
    """Run gene_finder on the sequence stored in ./data/X73525.fa.

    The threshold passed to gene_finder is len() of the value returned by
    longest_ORF_noncoding(dna, 1500), so that helper is expected to return a
    sized object here (presumably the ORF string -- confirm).

    returns: whatever gene_finder(dna, threshold) returns.
    """
    from load import load_seq

    sequence = load_seq("./data/X73525.fa")
    longest_noncoding = longest_ORF_noncoding(sequence, 1500)
    cutoff = len(longest_noncoding)
    return gene_finder(sequence, cutoff)
Esempio n. 14
0
def get_reverse_complement(dna):
    """ Computes the reverse complementary sequence of DNA for the specfied DNA
        sequence
        dna: a DNA sequence represented as a string
        returns: the reverse complementary DNA sequence represented as a string
    >>> get_reverse_complement("ATGCCCGCTTT")
    'AAAGCGGGCAT'
    >>> get_reverse_complement("CCGCGTTCA")
    'TGAACGCGG'
    >>> get_reverse_complement("ATCG")
    'CGAT'
    """
    # Walk the strand back to front and complement every base. The result
    # starts from an empty string and is returned only after the whole
    # strand has been processed.
    return ''.join(get_complement(letter) for letter in dna[::-1])


def divide_to_codons(dna):
    """Takes a DNA sequence and outputs a list of string triplets(codons) that makes up the sequence
       Last element might be incomplete codon with less then three letters
    >>> divide_to_codons("ATGTGAA")
    ['ATG', 'TGA', 'A']
    >>> divide_to_codons("ATGTGA")
    ['ATG', 'TGA']
    >>> divide_to_codons("ATGTGAAA")
    ['ATG', 'TGA', 'AA']
    """
    return [dna[index:index + 3] for index in range(0, len(dna), 3)]


def rest_of_ORF(dna):
    """ Takes a DNA sequence that is assumed to begin with a start
        codon and returns the sequence up to but not including the
        first in frame stop codon.  If there is no in frame stop codon,
        returns the whole string.
        dna: a DNA sequence
        returns: the open reading frame represented as a string
    >>> rest_of_ORF("ATGTGAA")
    'ATG'
    >>> rest_of_ORF("ATGAGATAGG")
    'ATGAGA'
    >>> rest_of_ORF("ATG")
    'ATG'
    >>> rest_of_ORF("AT")
    'AT'
    >>> rest_of_ORF("ATGASDASDWASDWADASDSAD")
    'ATGASDASDWASDWADASDSAD'
    >>> rest_of_ORF("ATGTGTTAAATGAAAAAATAGAA")
    'ATGTGT'
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    # The first codon is assumed to be the start codon, so scanning begins
    # at the second codon.
    for index in range(3, len(dna), 3):
        if dna[index:index + 3] in stop_codons:
            return dna[:index]
    return dna


def find_all_ORFs_oneframe(dna):
    """
    Finds all non-nested open reading frames in the given DNA
    sequence and returns them as a list.  This function should
    only find ORFs that are in the default frame of the sequence
    (i.e. they start on indices that are multiples of 3).
    By non-nested we mean that if an ORF occurs entirely within
    another ORF, it should not be included in the returned list of ORFs.
    dna: a DNA sequence
    returns: a list of non-nested ORFs
    >>> find_all_ORFs_oneframe("ATGCATGAATGTAGATAGATGTGCCC")
    ['ATGCATGAATGTAGA', 'ATGTGCCC']
    >>> find_all_ORFs_oneframe("ATGTGAA")
    ['ATG']
    >>> find_all_ORFs_oneframe('ASDASDAWSDSD')
    []
    >>> find_all_ORFs_oneframe('TATATGCATGAATGTAGATAGATGTGCTAAATAATAATGTTTTAAATT')
    ['ATGCATGAATGTAGA', 'ATGTGC', 'ATGTTT']
    """
    index = 0
    orf_list = []
    while index < len(dna):
        if dna[index:index + 3] == 'ATG':
            orf = rest_of_ORF(dna[index:])
            orf_list.append(orf)
            # Jump past the ORF so start codons nested inside it are ignored
            index = index + len(orf)
        else:
            index = index + 3
    return orf_list


def find_all_ORFs(dna):
    """
    Finds all non-nested open reading frames in the given DNA sequence in
    all 3 possible frames and returns them as a list.  By non-nested we
    mean that if an ORF occurs entirely within another ORF and they are
    both in the same frame, it should not be included in the returned list
    of ORFs.
    dna: a DNA sequence
    returns: a list of non-nested ORFs
    This unit testing would be enough because there isn't any special exceptions that needs to be tested. Also, this case tests this function's
    ability to grab orf from three different possible reading frames.
    >>> find_all_ORFs("ATGCATGAATGTAG")
    ['ATGCATGAATGTAG', 'ATGAATGTAG', 'ATG']
    """
    orf_list = []
    # Frames at offset 0, 1 and 2, in that order
    for offset in range(3):
        orf_list = orf_list + find_all_ORFs_oneframe(dna[offset:])
    return orf_list


def find_all_ORFs_both_strands(dna):
    """
    Finds all non-nested open reading frames in the given DNA sequence on both
    strands.
    dna: a DNA sequence
    returns: a list of non-nested ORFs
    >>> find_all_ORFs_both_strands("ATGCGAATGTAGCATCAAA")
    ['ATGCGAATG', 'ATGCTACATTCGCAT']
    """
    reverse_complement = get_reverse_complement(dna)
    # ORFs of the given strand first, then those of the reverse complement
    return find_all_ORFs(dna) + find_all_ORFs(reverse_complement)


def longest_ORF(dna):
    """
    Finds the longest ORF on both strands of the specified DNA and returns it
    as a string.  Returns an empty string when the DNA contains no ORF.
    >>> longest_ORF("ATGCGAATGTAGCATCAAA")
    'ATGCTACATTCGCAT'
    """
    orfs = find_all_ORFs_both_strands(dna)
    if not orfs:
        # No start codon on either strand, e.g. in a shuffled sequence
        return ''
    # max() keeps the first of several equally long ORFs
    return max(orfs, key=len)


def longest_ORF_noncoding(dna, num_trials):
    """
    Computes the maximum length of the longest ORF over num_trials shuffles
    of the specfied DNA sequence
    dna: a DNA sequence
    num_trials: the number of random shuffles
    returns: the maximum length longest ORF (0 when num_trials is not positive)
    """
    longest = 0
    for _ in range(num_trials):
        shuffled_dna = shuffle_string(dna)
        longest = max(longest, len(longest_ORF(shuffled_dna)))
    return longest


def coding_strand_to_AA(dna):
    """ Computes the Protein encoded by a sequence of DNA.  This function
        does not check for start and stop codons (it assumes that the input
        DNA sequence represents an protein coding region).
        dna: a DNA sequence represented as a string
        returns: a string containing the sequence of amino acids encoded by the
                 the input DNA fragment
        >>> coding_strand_to_AA("ATGCGA")
        'MR'
        >>> coding_strand_to_AA("ATGCCCGCTTT")
        'MPA'
        >>> coding_strand_to_AA("TTTATCATGTTAGTTA")
        'FIMLV'
    """
    # A trailing incomplete codon (fewer than three bases) is ignored
    return ''.join(
        aa_table[codon] for codon in divide_to_codons(dna) if len(codon) == 3
    )


def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna
        dna: a DNA sequence
        returns: a list of all amino acid sequences coded by the sequence dna.
    """
    threshold = longest_ORF_noncoding(dna, 1500)
    return [
        coding_strand_to_AA(orf)
        for orf in find_all_ORFs_both_strands(dna)
        if len(orf) > threshold
    ]


if __name__ == "__main__":
    import doctest
    doctest.testmod(verbose = True)
    doctest.run_docstring_examples(coding_strand_to_AA, globals(), verbose = True)
    dna_seq = load_seq('data/X73525.fa')
    print (gene_finder(dna_seq))
Esempio n. 15
0
    s = 0
    while s <= len(dna) - 3:
        m.append(aa_table[dna[s:s + 3]])
        s = s + 3
    k = ''
    k = k.join(m)
    return k


def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        An ORF (either strand) is kept when it is strictly longer than the
        threshold returned by longest_ORF_noncoding(dna, 1500).

        dna: a DNA sequence
        returns: a list of the amino acid sequences translated from the
                 ORFs that were kept.
    """
    min_length = longest_ORF_noncoding(dna, 1500)
    return [
        coding_strand_to_AA(orf)
        for orf in find_all_ORFs_both_strands(dna)
        if len(orf) > min_length
    ]


# Script entry point: print the gene_finder result for ./data/X73525.fa
# first, then run the module doctests in verbose mode.
if __name__ == "__main__":
    import doctest

    print(gene_finder(dna=load_seq("./data/X73525.fa")))
    doctest.testmod(verbose=True)
Esempio n. 16
0
        >>> coding_strand_to_AA("TTTATCATGTTAGTTA")
        'FIMLV'
    """
    codons = divide_to_codons(dna)
    amino_acid = ''
    for codon in codons:
        if len(codon) == 3:
            amino_acid += aa_table[codon]
    return amino_acid

def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        An ORF (either strand) qualifies when it is strictly longer than the
        threshold returned by longest_ORF_noncoding(dna, 1500).

        dna: a DNA sequence
        returns: a list with the translation of every qualifying ORF.
    """
    cutoff = longest_ORF_noncoding(dna, 1500)
    candidates = find_all_ORFs_both_strands(dna)
    return [
        coding_strand_to_AA(candidate)
        for candidate in candidates
        if len(candidate) > cutoff
    ]

# Script entry point: run the module doctests, then print the gene_finder
# result for the sequence stored in data/X73525.fa.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
    # Alternative: run only the coding_strand_to_AA examples.
    #doctest.run_docstring_examples(coding_strand_to_AA, globals())
    dna_seq = load_seq('data/X73525.fa')
    print gene_finder(dna_seq)
Esempio n. 17
0
def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        dna: a DNA sequence
        returns: a list of translated ORFs, longest translation first. An ORF
                 is kept when its length is at least
                 len(longest_ORF_noncoding(dna, 1500)).

        No doctests are provided: longest_ORF_noncoding relies on random
        shuffling, so the result can change from one call to the next.
    """
    # len() is applied to the helper's result, so it is expected to return a
    # sized object here (presumably the ORF string -- confirm).
    cutoff = len(longest_ORF_noncoding(dna, 1500))

    proteins = []
    for candidate in find_all_ORFs_both_strands(dna):
        if len(candidate) >= cutoff:
            proteins.append(coding_strand_to_AA(candidate))

    # Stable sort: equally long translations keep their discovery order
    proteins.sort(key=len, reverse=True)
    return proteins


# Script entry point: run the module doctests, then print every entry of the
# gene_finder result for ./data/X73525.fa on its own line.
if __name__ == "__main__":
    import doctest
    doctest.testmod()

    result = gene_finder(load_seq("./data/X73525.fa"))
    print "Here is the result:"
    for translated_protein in result:
        print translated_protein
Esempio n. 18
0
    for k in range(0,len(dna_three)):
        threes = dna_three[k]
        if len(threes)==3:
            amino_acid_1 = aa_table[threes]
            amino_acid += amino_acid.join(amino_acid_1)
        k = k + 1
    return amino_acid

def gene_finder(dna):
    """ Returns the amino acid sequences coded by all ORFs of dna that are
        longer than a computed threshold.

        The threshold is obtained from longest_ORF_noncoding(dna, 1500).

        dna: a DNA sequence
        returns: the result of mapping coding_strand_to_AA over every ORF
                 (both strands) strictly longer than the threshold.
    """
    cutoff = longest_ORF_noncoding(dna, 1500)

    # Filter first, then translate only the ORFs that passed
    long_orfs = [
        orf for orf in find_all_ORFs_both_strands(dna) if len(orf) > cutoff
    ]
    return map(coding_strand_to_AA, long_orfs)

# Module-level driver (no __main__ guard): loads ./data/X73525.fa and prints
# the gene_finder result every time this file is executed or imported.
dna = load_seq('./data/X73525.fa')
print gene_finder(dna)
Esempio n. 19
0
def salmonella_gene_finder():
    """Print each entry of gene_finder's result for ./data/X73525.fa.

    Every entry is printed on its own line; nothing is returned.
    """
    sequence = load_seq("./data/X73525.fa")
    for protein in gene_finder(sequence):
        print(protein)
Esempio n. 20
0
    for i in range(0, len(dna), 3):
        codon = dna[i:i+3]
        if len(codon) == 3:
            aa_sequence += aa_table[codon]

    return aa_sequence

def gene_finder(dna):
    """ Returns the amino acid sequences that are likely coded by the specified dna

        The threshold from longest_ORF_noncoding(dna, 1500) is printed, then
        every ORF (both strands) strictly longer than it is translated.

        dna: a DNA sequence
        returns: a list of the translated ORFs.
    """
    threshold = longest_ORF_noncoding(dna, 1500)
    print(threshold)
    return [
        coding_strand_to_AA(orf)
        for orf in find_all_ORFs_both_strands(dna)
        if len(orf) > threshold
    ]

# Script entry point: print the gene_finder result for ./data/X73525.fa.
if __name__ == "__main__":
    import doctest
    # Doctest runs are currently disabled; uncomment one of the lines below
    # to re-enable them.
    #doctest.testmod()
    #doctest.run_docstring_examples(coding_strand_to_AA, globals())
    from load import load_seq
    dna = load_seq("./data/X73525.fa")
    print gene_finder(dna)
Esempio n. 21
0
        shuffle(DNA)
    compare = []
    for index in range(len(longest_frame)):         #cycles through num_trials number of indexes comparing to find longest ORF
        if len(longest_frame[index]) > len(compare):
            compare = longest_frame[index]
        else:
            compare = compare
    return len(compare)
        
#print longest_ORF_noncoding('ATGCGAATGTAGCATCAAA', 30)

def gene_finder(dna, threshold):
    """ Returns the amino acid sequences coded by all genes that have an ORF
        larger than the specified threshold.

        dna: a DNA sequence
        threshold: the length an ORF must exceed to be considered a valid
                   gene.
        returns: a list of the amino acid sequences translated from every
                 ORF (both strands) strictly longer than threshold.
    """
    return [
        coding_strand_to_AA(frame)
        for frame in find_all_ORFs_both_strands(dna)
        if len(frame) > threshold
    ]
if __name__ == '__main__':
    # Load the sequence and print the genes whose ORFs are longer than 600.
    # gene_finder is called once; an earlier duplicate call whose result was
    # discarded has been dropped.
    dna = load_seq('./data/X73525.fa')
    print(gene_finder(dna, 600))