Esempio n. 1
0
def main():
    """Translate each record of a FASTA file and report on it in a text file.

    Prompts for an input file name and an output file name. For every record
    returned by ``readfasta`` the output file receives the record's field at
    index 2 (labelled "Gene"), its translation as produced by ``translate``
    (labelled "Protein Sequence"), and whatever ``findTMD`` writes for that
    translation.
    """
    # Read in the RNA sequences from a file specified by user input
    filename = input("Please enter the input file name: ")
    rnainfo = readfasta(filename)

    # Prepare to re-write the RNA sequences to an output file specified by user input
    outfilename = input("Please enter the output file name: ")

    # The context manager closes the file even when translate() or findTMD()
    # raises part-way through, so buffered output is not lost or leaked.
    with open(outfilename, mode="w") as handle:
        # Records are numbered from 1 in the report
        for number, record in enumerate(rnainfo, start=1):
            sequence = record[2]

            # Specify gene that is being evaluated
            handle.write("Gene " + str(number) + ": " + sequence + "\n\n")

            # Translate the sequence to its single-letter amino acid sequence
            # and write it to the output file
            translatedseq = translate(sequence)
            handle.write("Protein Sequence " + str(number) + ": " + translatedseq + "\n\n")

            # Scan the amino acid sequence for transmembrane helices;
            # findTMD writes its own results to the open handle
            findTMD(translatedseq, handle)
Esempio n. 2
0
def bootstrap():
    """Print the sequences of "mt_homo_dna.fasta" and one bootstrap resample.

    A resample is built column by column: a random column index is drawn
    (with replacement) and the character at that index of every original
    sequence is appended to the matching new sequence. The number of draws
    and the range of valid column indices are both the length of the
    shortest original sequence, so every drawn index is valid for every
    sequence.
    """
    sequences = readfasta("mt_homo_dna.fasta")

    # Convert (list of lists) to (list of strings); index 1 of each record is
    # the part that gets resampled.
    list_original_sequences = [record[1] for record in sequences]

    print("Original Sequences")
    for sequence in list_original_sequences:
        print(sequence)

    # Length of the shortest sequence; default=0 keeps an empty input from
    # raising ValueError in min().
    column_count = min(map(len, list_original_sequences), default=0)

    # One output string per input sequence. The previous "len(...) - 1"
    # silently dropped the last sequence from the resample.
    new_sequences = [""] * len(list_original_sequences)

    for _ in range(column_count):
        random_num = random.randint(0, column_count - 1)  # select random column

        # Append the same column of every original sequence
        for index, original in enumerate(list_original_sequences):
            new_sequences[index] += original[random_num]

    print("New Sequences")
    for sequence in new_sequences:
        print(sequence)
Esempio n. 3
0
def main():
    """Build the K2P table for the FASTA file named on the command line.

    The first command-line argument is passed to ``readfasta.readfasta``; the
    resulting records go to ``d.get_k2p_table``. The table is published in
    the module-level ``help_table`` and then handed to ``find_smallest``.
    """
    # sys.argv (not "sys.arv") holds the command-line arguments; the old
    # spelling raised AttributeError on every run.
    sequences = readfasta.readfasta(sys.argv[1])
    table = d.get_k2p_table(sequences)

    # Expose the table to code that reads the global help_table
    global help_table
    help_table = table

    find_smallest(table)
Esempio n. 4
0
def main():
    """List every FASTA file in ./genes/ with a preview of its records.

    Changes the working directory to the "genes" folder below the current
    one, then, for each "*.fasta" file there, prints the file name followed
    by the first 60 items of field 1 of every record ``readfasta`` returns.
    """
    print( "*****\nBioinformatics - Assignment 2 - Group 2\n*****\n" )

    # Work from inside the "genes" directory so the glob pattern is relative
    genes_dir = os.getcwd() + "/genes/"
    os.chdir( genes_dir )

    fasta_names = glob.glob( "*.fasta" )
    for fasta_name in fasta_names:
        print( fasta_name )

        # Preview each record of this file
        for record in readfasta( fasta_name ):
            preview = record[1][:60]
            print( preview )
Esempio n. 5
0
def main():
    """Build a tree from one FASTA file and bootstrap its clade confidences.

    Uses the first "*.fasta" file found in ./genes/. A tree is generated from
    its genes, the clades of that tree are registered in a counting dict, the
    genes are progressively aligned, and then a fixed number of bootstrap
    trees are generated and searched for matching clades. Finally the
    confidence computed for each clade is printed.
    """
    print("*****\nBioinformatics - Assignment 2 - Group 2\n*****\n")

    # Only the first fasta file in the "genes" directory is processed
    os.chdir(os.getcwd() + "/genes/")
    fasta_name = glob.glob("*.fasta")[0]

    # Read all the genes from the fasta file
    print(fasta_name)
    genes = readfasta(fasta_name)

    original_tree = generate_tree(genes)
    root = original_tree[0]
    print(root)

    # Register the clades of the original tree as keys of the counting dict
    clade_count_dict = {}
    build_clade_count_dict(root, clade_count_dict)

    BOOTSTRAP_TIMES = 20

    # Align all genes, guided by both parts of the generate_tree() result
    aligned = progressive_alignment(genes, root, original_tree[1])
    multi_aligned_sequences = reorder_alignments(aligned)

    for aligned_record in multi_aligned_sequences:
        print(aligned_record[0][:120])

    # Count each clade in bootstrap trees matching a clade from original tree
    for round_number in range(BOOTSTRAP_TIMES):
        resampled_genes = generate_bootstrap_genes(multi_aligned_sequences)
        bootstrap_tree = generate_boots_tree(resampled_genes)

        print("Bootstrap Tree ", round_number)
        print(bootstrap_tree)

        clade_search(bootstrap_tree, clade_count_dict)

    # Map every clade to its confidence over all bootstrap rounds
    clade_confidences = calculate_confidences(clade_count_dict, BOOTSTRAP_TIMES)

    for clade_key, clade_confidence in clade_confidences.items():
        print("clade: ", clade_key)
        print("confidence: ", clade_confidence)
Esempio n. 6
0
def get_k2p_table(sequence_list):
    """Return a symmetric nested dict of pairwise distances.

    Every pair ``(i, j)`` with ``i <= j`` (so each sequence is also paired
    with itself) is submitted to ``k2p_multiprocess`` on a two-worker pool.
    Each result is read as ``(name_1, name_2, distance)`` and stored under
    both ``table[name_1][name_2]`` and ``table[name_2][name_1]``.

    Parameters:
        sequence_list: indexable collection of records whose item 0 is used
            as the key of the outer dict.

    Returns:
        dict mapping each record's item 0 to a dict of distances.
    """
    worker_total = 2
    record_total = len(sequence_list)
    pool = mp.Pool(processes=worker_total)

    # One (initially empty) inner dict per sequence name
    table = {record[0]: dict() for record in sequence_list}

    # Queue every unordered pair, diagonal included, in row-major order
    pending = [
        pool.apply_async(k2p_multiprocess, (sequence_list[row], sequence_list[col],))
        for row in range(record_total)
        for col in range(row, record_total)
    ]

    # No more work will be submitted; wait for all queued jobs to finish
    pool.close()
    pool.join()

    for job in pending:
        outcome = job.get()
        first_name, second_name, distance = outcome[0], outcome[1], outcome[2]
        # Store both directions so lookups work in either order
        table[first_name][second_name] = distance
        table[second_name][first_name] = distance

    return table


if __name__ == '__main__':
    # Parse the sample file once and reuse the records; the file used to be
    # read and parsed separately for each of the two sequences.
    records = r.readfasta("sample.fasta.txt")
    sq1 = records[0][1]
    sq2 = records[1][1]
    k2p(sq1, sq2)
Esempio n. 7
0
'''
Zoe Moore 9/15/2019

A program that reads RNA sequences from a text file and translates
them to their corresponding amino acid sequences.
'''
from readfasta import readfasta
from RNATranslate import translate

# Read in the RNA sequences from a file specified by user input
filename = input("Please enter the input file name: ")
rnainfo = readfasta(filename)

# Separate out the first three RNA sequences. This happens before the output
# file is opened so that an input with fewer than three records fails without
# leaving a partially written file behind.
seqone = rnainfo[0][2]
seqtwo = rnainfo[1][2]
seqthree = rnainfo[2][2]

# Re-write the RNA sequences to an output file specified by user input, each
# followed by a blank line. The context manager closes the file even if a
# write fails.
outfilename = input("Please enter the rna output file name: ")
with open(outfilename, mode="w") as handle:
    for sequence in (seqone, seqtwo, seqthree):
        handle.write(sequence + "\n\n")

# Translate RNA sequences to their single-letter amino acid sequences
aaseqone = translate(seqone)
Esempio n. 8
0
def main():
    """Score our reading-frame picker against a random pick on every gene.

    For each "*.fsa" file in ./genes/ the first record's field 1 is expanded
    with ``get_all_reading_frames``; a random integer in 0..5 and the result
    of ``find_best_reading_frame`` are each compared with the known correct
    frame index. Hit rates, the per-file report cards and the p-value of an
    independent two-sample t-test between the report cards are printed.
    """
    print( "*****\nBioinformatics - Assignment 1 - Group 3\n*****\n" )

    # for each fsa tested, a 0 will be added to the report card
    # if random/our function picked the wrong reading frame, and
    # a 1 will be added if it picks the right reading frame
    randoms_report_card = []
    our_report_card = []

    random_was_right = 0
    we_were_right = 0
    number_of_files_scanned = 0

    # the actual reading frame of all fsa we input is always 2+
    # which is represented by the index 1
    ACTUAL_READING_FRAME = 1

    # scan in all fsa files in the "genes" directory
    os.chdir( os.getcwd() + "/genes/" )
    for file in glob.glob( "*.fsa" ):

        print( file )
        gene = readfasta( file )[0][1]
        rfs = get_all_reading_frames( gene )

        # 1 if random picks the correct reading frame, else 0
        random_hit = int( randint( 0, 5 ) == ACTUAL_READING_FRAME )
        random_was_right += random_hit
        randoms_report_card.append( random_hit )

        # 1 if our algorithm picks the correct reading frame, else 0
        our_hit = int( find_best_reading_frame( rfs ) == ACTUAL_READING_FRAME )
        we_were_right += our_hit
        our_report_card.append( our_hit )

        number_of_files_scanned += 1

    print( "*****\n" )
    print( number_of_files_scanned, "genes scanned" )

    # Without any input the percentages below would divide by zero and the
    # t-test would have no samples, so stop here.
    if number_of_files_scanned == 0:
        print( "No .fsa files were found, so there is nothing to compare." )
        return

    percent_we_were_right = we_were_right/number_of_files_scanned * 100
    percent_rand_was_right = random_was_right/number_of_files_scanned * 100

    percent_we_were_right = round( percent_we_were_right, 3 )
    percent_rand_was_right = round( percent_rand_was_right, 3 )

    print( "Our code was right ", percent_we_were_right, "% of the time" )
    print( "Random was right ", percent_rand_was_right, "% of the time\n" )

    print( "Our report card: ", our_report_card )
    print( "Random's report card: ", randoms_report_card )

    ourStats = stats.ttest_ind(randoms_report_card,our_report_card)
    print("The p-value is " + str(ourStats.pvalue))

    # The t-test is two-sided: a small p-value only says the two report cards
    # differ. Claim "better" only when our hit count is also the higher one.
    if ourStats.pvalue < 0.05 and we_were_right > random_was_right:
        print("Our program did statistically significantly better at"
              + " picking the correct RF than a randomly picked RF.")
    else:
        print("Our program did NOT do statistically significantly better at"
              + " picking the correct RF than a randomly picked RF.")
                    help="The path to the FASTA file "
                         + "containing all sequences to be compared.")
# Optional path to a precomputed pairwise-alignment file; defaults to the
# empty string when the option is not given.
parser.add_argument(
    "-p", "--pairwiseComparisonFile",
    default="",
    required=False,
    type=str,
    help=(
        "The path to the alignment file "
        "containing all pairwise alignments and their scores. "
        "If this is not present, we will generate all pairwise "
        "comparisons (may take a very long time!!)."
    ),
)

# Number of bootstrap rounds, parsed as an int; 1000 unless overridden.
parser.add_argument(
    "-b", "--bootstrap",
    default=1000,
    type=int,
    help=(
        "The number of bootstrapping iterations to perform. "
        "Reasonable values range from one to ten thousand. "
        "These are computationally cheap (seconds per round)."
    ),
)

# Parse the process's command line into the args namespace
args = parser.parse_args()

allTrees = {} # Simply counts the occurrence of each tree
canonicalFasta = readfasta(args.fastaFile)
canonicalComparisons = args.pairwiseComparisonFile

# Run all pairwise comparisons if necessary
# This will give us the "canonical comparison" file
if (canonicalComparisons == "") or (not os.path.exists(canonicalComparisons)):
    print("Pairwise comparison filing missing or absent. Running all comparisons...")
    if canonicalComparisons == "":
        # Derive the output name by swapping a trailing ".fasta" for ".txt".
        # str.rstrip(".fasta") is NOT a suffix removal: it strips any trailing
        # run of the characters '.', 'f', 'a', 's', 't', which turned
        # "data.fasta" into "d". Remove the exact suffix instead.
        fastaName = args.fastaFile
        if fastaName.endswith(".fasta"):
            fastaName = fastaName[:-len(".fasta")]
        canonicalComparisons = fastaName + ".txt"
    writePairwiseAlignmentFile( canonicalFasta, canonicalComparisons )

# Build the "canonical" phylogenetic tree and add it to the collection of all trees
canonicalFinalNode = doNeighborJoining(canonicalComparisons, canonicalFasta)
allTrees[canonicalFinalNode.getTreeFile()] = 1
'''
Runs a number of tests on the classes and functions in our phylogeny generator

Tyler Young
'''

from team_2_optimal_alignment_sensitive import *
from team_2_neighbor_joining import *
from readfasta import readfasta
from team_2_bootstrapping import getBootstrappedSequences
import time

# Load the FASTA records and build the distance matrix from a precomputed
# alignment file.
fastaData = readfasta("mtDNA.fasta")
distMatrix = constructMatrixFromFile("mtDNA_alignments_with_gorilla_original.txt", fastaData)
# Print each matrix row next to field 1 of the FASTA record at the same index
# NOTE(review): assumes distMatrix rows are in the same order as fastaData - confirm
for i in range(len(distMatrix)):
    print(fastaData[i][1],distMatrix[i])
# Print the bare matrix rows again, without the record field
for i in range(len(distMatrix)):
    print(distMatrix[i])

# Run neighbor joining on the loaded data and print the resulting tree file
finalNode = getNeighborJoiningPhylogeny(getNeighborJoiningSequences(fastaData), distMatrix)
print(finalNode.getTreeFile())

# Hard-coded test sequences; they share a long common stretch and differ in
# the first character of the first entry and in the length of the tail.
genomes = [ "AATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCG",
            "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGG",
            "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGG",
            "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGG",
            "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGG",
            "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG",
            "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGG"]
# Align the first two test sequences
print("Running optimal alignment . . .")
alignment = OptimalAlignment( genomes[0], genomes[1] )