def main():
    """Translate every RNA sequence in a FASTA file and scan it for TM helices.

    Prompts for an input file name and an output file name. For each record
    returned by ``readfasta`` the output file receives the RNA sequence
    (element 2 of the record), the result of ``translate`` on it, and
    whatever ``findTMD`` writes for that translation.
    """
    # Read in the RNA sequences from a file specified by user input
    filename = input("Please enter the input file name: ")
    rnainfo = readfasta(filename)

    # Prepare to re-write the RNA sequences to an output file specified by user input
    outfilename = input("Please enter the output file name: ")

    # The with-block closes the output file even if translate() or findTMD()
    # raises part-way through the records.
    with open(outfilename, mode="w") as handle:
        # Genes are numbered from 1 in the report
        for number, record in enumerate(rnainfo, start=1):
            rna = record[2]

            # Specify gene that is being evaluated
            handle.write("Gene " + str(number) + ": " + rna + "\n\n")

            # Translate the RNA sequence to its single-letter amino acid
            # sequence and write it to the output file
            translatedseq = translate(rna)
            handle.write("Protein Sequence " + str(number) + ": " + translatedseq + "\n\n")

            # Scan the amino acid sequence for transmembrane helices;
            # findTMD writes its own results to the handle
            findTMD(translatedseq, handle)
def bootstrap():
    """Print the sequences of ``mt_homo_dna.fasta`` and one bootstrap resample.

    Columns are drawn at random, with replacement, from the range covered by
    the shortest sequence. The same drawn columns are applied to every
    sequence, and as many columns are drawn as the shortest sequence is long.
    Both the original and the resampled sequences are printed; nothing is
    returned.
    """
    sequences = readfasta("mt_homo_dna.fasta")

    # Convert (list of records) to (list of strings); element 1 of each
    # record is the part that gets resampled.
    list_original_sequences = [record[1] for record in sequences]

    print("Original Sequences")
    for original in list_original_sequences:
        print(original)

    # Only columns present in every sequence can be sampled, so the shortest
    # sequence bounds both the column range and the number of draws.
    # default=0 keeps an empty FASTA file from crashing min().
    shortest = min((len(seq) for seq in list_original_sequences), default=0)

    # Draw all columns up front so each sequence is built with one join
    # instead of repeated string concatenation.
    columns = [random.randint(0, shortest - 1) for _ in range(shortest)]

    # Resample every sequence (the previous version stopped one short and
    # dropped the last sequence).
    new_sequences = [
        "".join(seq[column] for column in columns)
        for seq in list_original_sequences
    ]

    print("New Sequences")
    for resampled in new_sequences:
        print(resampled)
def main():
    """Build the K2P distance table for a FASTA file and process it.

    The FASTA path is taken from the first command-line argument. The table
    returned by ``d.get_k2p_table`` is stored in the module-level
    ``help_table`` and then passed to ``find_smallest``.

    Raises:
        SystemExit: if no FASTA path was given on the command line.
    """
    global help_table

    if len(sys.argv) < 2:
        raise SystemExit("usage: " + sys.argv[0] + " FASTA_FILE")

    # Was `sys.arv[1]`, which raised AttributeError before anything ran.
    sequences = readfasta.readfasta(sys.argv[1])

    table = d.get_k2p_table(sequences)
    help_table = table
    find_smallest(table)
def main():
    """Print the leading 60 items of element 1 of every record in ./genes/*.fasta."""
    banner = "*****\nBioinformatics - Assignment 2 - Group 2\n*****\n"
    print(banner)

    # Work from inside the "genes" directory so the globbed names resolve
    genes_dir = os.getcwd() + "/genes/"
    os.chdir(genes_dir)

    for fasta_name in glob.glob("*.fasta"):
        print(fasta_name)
        for record in readfasta(fasta_name):
            print(record[1][:60])
def main():
    """Build a tree from the first FASTA file in ./genes and bootstrap its clades.

    Prints the original tree, the first 120 items of each aligned sequence,
    every bootstrap tree, and finally each clade with its confidence.
    """
    print("*****\nBioinformatics - Assignment 2 - Group 2\n*****\n")

    # Number of bootstrap trees compared against the original tree
    bootstrap_rounds = 20

    # Scan the "genes" directory and take the first fasta file found
    os.chdir(os.getcwd() + "/genes/")
    fasta_files = glob.glob("*.fasta")
    file = fasta_files[0]

    # Read all the genes from the fasta file
    print(file)
    genes = readfasta(file)

    original_tree = generate_tree(genes)
    tree_root = original_tree[0]
    print(tree_root)

    # Count the clades of the original tree and add them as keys
    clade_count_dict = {}
    build_clade_count_dict(tree_root, clade_count_dict)

    aligned = progressive_alignment(genes, tree_root, original_tree[1])
    multi_aligned_sequences = reorder_alignments(aligned)
    for aligned_entry in multi_aligned_sequences:
        print(aligned_entry[0][:120])

    # Count each clade in the bootstrap trees that matches a clade from the
    # original tree
    for round_index in range(bootstrap_rounds):
        resampled_genes = generate_bootstrap_genes(multi_aligned_sequences)
        boots_tree = generate_boots_tree(resampled_genes)
        print("Bootstrap Tree ", round_index)
        print(boots_tree)
        clade_search(boots_tree, clade_count_dict)

    # Dict containing the clades as keys mapped to their confidence
    clade_confidences = calculate_confidences(clade_count_dict, bootstrap_rounds)
    for clade_key, clade_confidence in clade_confidences.items():
        print("clade: ", clade_key)
        print("confidence: ", clade_confidence)
def get_k2p_table(sequence_list, thread_count=2):
    """Build a symmetric table of pairwise distances between sequences.

    Every pair ``(sequence_list[i], sequence_list[j])`` with ``i <= j``
    (so each sequence is also paired with itself) is submitted to
    ``k2p_multiprocess`` on a worker pool.

    Args:
        sequence_list: indexable collection of records. Element 0 of each
            record is used as that record's key in the table; the whole
            record is passed to ``k2p_multiprocess``.
        thread_count: number of worker processes in the pool. Defaults to 2,
            the value that used to be hard-coded.

    Returns:
        A dict of dicts where ``table[a][b]`` and ``table[b][a]`` both hold
        the distance reported for that pair.
    """
    size = len(sequence_list)

    # Pre-create a row for every sequence so results can be filled in any order
    table = {seq[0]: {} for seq in sequence_list}

    # The context manager tears the pool down even if submitting or
    # collecting a job raises.
    with mp.Pool(processes=thread_count) as pool:
        pending = [
            pool.apply_async(k2p_multiprocess, (sequence_list[i], sequence_list[j]))
            for i in range(size)
            for j in range(i, size)
        ]
        pool.close()
        pool.join()
        results = [job.get() for job in pending]

    # Each result is indexed as (key of first, key of second, distance);
    # NOTE(review): the keys are assumed to match element 0 of the input
    # records -- confirm against k2p_multiprocess.
    for result in results:
        s1name, s2name, distance = result[0], result[1], result[2]
        table[s1name][s2name] = distance
        table[s2name][s1name] = distance

    return table


if __name__ == '__main__':
    # Parse the sample file once and reuse it for both sequences
    records = r.readfasta("sample.fasta.txt")
    sq1 = records[0][1]
    sq2 = records[1][1]
    k2p(sq1, sq2)
'''
Zoe Moore
9/15/2019
A program that reads RNA sequences from a text file and translates them to
their corresponding amino acid sequences.
'''
from readfasta import readfasta
from RNATranslate import translate

# Read in the RNA sequences from a file specified by user input
filename = input("Please enter the input file name: ")
rnainfo = readfasta(filename)

# Prepare to re-write the RNA sequences to an output file specified by user input
outfilename = input("Please enter the rna output file name: ")

# Separate out three RNA sequences and write them to separate lines of a
# .txt file. The with-block closes the file even when the input holds fewer
# than three records and the indexing below raises.
with open(outfilename, mode="w") as handle:
    seqone = rnainfo[0][2]
    handle.write(seqone + "\n\n")
    seqtwo = rnainfo[1][2]
    handle.write(seqtwo + "\n\n")
    seqthree = rnainfo[2][2]
    handle.write(seqthree + "\n\n")

# Translate RNA sequences to their single-letter amino acid sequences
aaseqone = translate(seqone)
def main():
    """Score our reading-frame picker against a random guess on ./genes/*.fsa.

    For every ``.fsa`` file the first record's sequence is split into its
    reading frames; a random index in 0..5 and the index chosen by
    ``find_best_reading_frame`` are each compared with the known correct
    index. The hit rates, both report cards, and the p-value of an
    independent t-test between the report cards are printed.

    If no ``.fsa`` files are found, the function reports that and returns
    instead of dividing by zero.
    """
    print("*****\nBioinformatics - Assignment 1 - Group 3\n*****\n")

    # For each fsa tested, a 0 is added to the report card if random/our
    # function picked the wrong reading frame, and a 1 if it picked the
    # right one.
    randoms_report_card = []
    our_report_card = []

    # The actual reading frame of all fsa we input is always 2+,
    # which is represented by the index 1
    ACTUAL_READING_FRAME = 1

    # Scan in all fsa files in the "genes" directory
    os.chdir(os.getcwd() + "/genes/")
    for fsa_name in glob.glob("*.fsa"):
        print(fsa_name)
        gene = readfasta(fsa_name)[0][1]
        rfs = get_all_reading_frames(gene)

        # Did the random pick hit the correct reading frame?
        random_hit = randint(0, 5) == ACTUAL_READING_FRAME
        randoms_report_card.append(1 if random_hit else 0)

        # Did our algorithm pick the correct reading frame?
        our_hit = find_best_reading_frame(rfs) == ACTUAL_READING_FRAME
        our_report_card.append(1 if our_hit else 0)

    # The report cards hold one 0/1 entry per file, so the totals follow
    # directly from them.
    number_of_files_scanned = len(our_report_card)
    we_were_right = sum(our_report_card)
    random_was_right = sum(randoms_report_card)

    print("*****\n")
    print(number_of_files_scanned, "genes scanned")

    if number_of_files_scanned == 0:
        # No data: percentages and the t-test are undefined
        print("No .fsa files found; nothing to compare.")
        return

    percent_we_were_right = round(we_were_right / number_of_files_scanned * 100, 3)
    percent_rand_was_right = round(random_was_right / number_of_files_scanned * 100, 3)

    print("Our code was right ", percent_we_were_right, "% of the time")
    print("Random was right ", percent_rand_was_right, "% of the time\n")

    print("Our report card: ", our_report_card)
    print("Random's report card: ", randoms_report_card)

    ourStats = stats.ttest_ind(randoms_report_card, our_report_card)
    print("The p-value is " + str(ourStats.pvalue))

    if ourStats.pvalue < 0.05:
        print("Our program did statistically significantly better at" +
              " picking the correct RF than a randomly picked RF.")
    else:
        print("Our program did NOT do statistically significantly better at" +
              " picking the correct RF than a randomly picked RF.")
help="The path to the FASTA file " + "containing all sequences to be compared.") parser.add_argument("-p", "--pairwiseComparisonFile", type=str, required=False, default="", help="The path to the alignment file " + "containing all pairwise alignments and their scores. " + "If this is not present, we will generate all pairwise " + "comparisons (may take a very long time!!).") parser.add_argument("-b", "--bootstrap", type=int, default=1000, help="The number of bootstrapping iterations to perform. " + "Reasonable values range from one to ten thousand. " "These are computationally cheap (seconds per round)." ) args = parser.parse_args() allTrees = {} # Simply counts the occurrence of each tree canonicalFasta = readfasta(args.fastaFile) canonicalComparisons = args.pairwiseComparisonFile # Run all pairwise comparisons if necessary # This will give us the "canonical comparison" file if (canonicalComparisons == "") or (not os.path.exists(canonicalComparisons)): print("Pairwise comparison filing missing or absent. Running all comparisons...") if canonicalComparisons == "": fastaName = (args.fastaFile).rstrip(".fasta") canonicalComparisons = fastaName + ".txt" writePairwiseAlignmentFile( canonicalFasta, canonicalComparisons ) # Build the "canonical" phylogenetic tree and add it to the collection of all trees canonicalFinalNode = doNeighborJoining(canonicalComparisons, canonicalFasta) allTrees[canonicalFinalNode.getTreeFile()] = 1
'''
Runs a number of tests on the classes and functions in our phylogeny generator

Tyler Young
'''
from team_2_optimal_alignment_sensitive import *
from team_2_neighbor_joining import *
from readfasta import readfasta
from team_2_bootstrapping import getBootstrappedSequences
import time

fastaData = readfasta("mtDNA.fasta")
distMatrix = constructMatrixFromFile(
    "mtDNA_alignments_with_gorilla_original.txt",
    fastaData,
)

# Each matrix row next to element 1 of the FASTA record at the same index
for i, distRow in enumerate(distMatrix):
    print(fastaData[i][1], distRow)

# The bare matrix rows on their own
for distRow in distMatrix:
    print(distRow)

# Build the phylogeny from the FASTA records and the matrix, then print
# what getTreeFile() returns for the final node
njInput = getNeighborJoiningSequences(fastaData)
finalNode = getNeighborJoiningPhylogeny(njInput, distMatrix)
print(finalNode.getTreeFile())

# Short hand-written sequences used as alignment input below
genomes = [
    "AATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCG",
    "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGG",
    "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGG",
    "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGG",
    "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGG",
    "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG",
    "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGG",
]

print("Running optimal alignment . . .")
firstGenome, secondGenome = genomes[0], genomes[1]
alignment = OptimalAlignment(firstGenome, secondGenome)