def predict_reference(weights_path, gap_id, fasta_path, embedding_dim, latent_dim, min_seed_length, beam_length, base_path=None):
    """Run ``predict`` on both flanks of a reference FASTA, in both directions.

    Only the first two records of ``fasta_path`` are used; they are labelled
    ``left_flank`` and ``right_flank`` in that order. For each flank, the
    first ``min_seed_length`` bases of the sequence and of its reverse
    complement are used as seeds, and ``predict`` is asked for the remaining
    ``len(sequence) - min_seed_length`` positions.

    Args:
        weights_path: Prefix to which ``'my_model_weights.h5'`` is appended
            verbatim (no separator is inserted) to locate the weights file.
        gap_id: Identifier used as the prefix of every flank id.
        fasta_path: Path of the FASTA file holding the flank sequences.
        embedding_dim: Forwarded to ``GapPredictModel``.
        latent_dim: Forwarded to ``GapPredictModel``.
        min_seed_length: Seed length; also forwarded to ``GapPredictModel``.
        beam_length: Forwarded to ``predict``.
        base_path: Forwarded to ``predict`` as ``base_path``.
    """
    importer = SequenceImporter()
    reverser = SequenceReverser()
    sequences = importer.import_fasta([fasta_path])[0:2]

    model = GapPredictModel(min_seed_length=min_seed_length,
                            stateful=False,
                            embedding_dim=embedding_dim,
                            latent_dim=latent_dim,
                            with_gpu=True)
    model.load_weights(weights_path + 'my_model_weights.h5')

    terminal_directory_character = UTILS.get_terminal_directory_character()
    first_directory = ("beam_search" + terminal_directory_character
                       + "regenerate_seq" + terminal_directory_character)

    # zip() pairs each record with its label and stops at the shorter input,
    # so a FASTA with a single record only produces the left flank.
    for flank, sequence in zip(("left_flank", "right_flank"), sequences):
        static_path = first_directory + flank + terminal_directory_character
        prediction_length = len(sequence) - min_seed_length
        seeds = (
            ("forward", sequence[:min_seed_length]),
            ("reverse_complement",
             reverser.reverse_complement(sequence)[:min_seed_length]),
        )
        for directionality, seed in seeds:
            flank_id = gap_id + "_" + flank + "_" + directionality
            predict(model, seed, prediction_length, beam_length, flank_id,
                    base_path=base_path,
                    directory=static_path + directionality)
def setUp(self):
    """Load the sample reads and build the generators and encoder under test."""
    clean_fastq = ['data/sample.fastq']
    noisy_fastq = ['data/sample_with_errors.fastq']
    self.reads = SequenceImporter().import_fastq(clean_fastq)
    self.error_reads = SequenceImporter().import_fastq(noisy_fastq)

    self.batch_size = 4
    # Options shared by both generators; the error generator adds spacing.
    generator_options = {'batch_size': self.batch_size, 'log_samples': False}
    error_options = dict(generator_options, spacing=10)
    self.generator = DataGenerator(self.reads, 26, **generator_options)
    self.error_generator = DataGenerator(self.error_reads, 26, **error_options)

    self.encoder = KmerLabelEncoder()
def predict_arbitrary_length(weights_path, gap_id, fasta_path, embedding_dim, latent_dim, length_to_predict, base_path=None):
    """Predict ``length_to_predict`` positions from each flank and save both results.

    The first FASTA record is used as-is (forward direction); the second
    record is reverse-complemented before prediction. Both results are handed
    to ``DataWriter.save_complements`` with a ``"_LD_<latent_dim>"`` postfix.

    Args:
        weights_path: Prefix to which ``'my_model_weights.h5'`` is appended
            verbatim to locate the weights file.
        gap_id: Identifier forwarded to ``save_complements``.
        fasta_path: FASTA file with at least two records (left, right flank).
        embedding_dim: Forwarded to ``GapPredictModel``.
        latent_dim: Forwarded to ``GapPredictModel``; also used in the postfix.
        length_to_predict: Forwarded to ``predict_gaps``.
        base_path: Root directory for ``predict_gaps`` and ``DataWriter``.
    """
    fasta_reader = SequenceImporter()
    complementer = SequenceReverser()
    records = fasta_reader.import_fasta([fasta_path])
    forward_seed = records[0]
    reverse_seed = complementer.reverse_complement(records[1])

    gap_model = GapPredictModel(
        min_seed_length=None,
        stateful=False,
        batch_size=1,
        embedding_dim=embedding_dim,
        latent_dim=latent_dim,
        with_gpu=True,
    )
    gap_model.load_weights(weights_path + 'my_model_weights.h5')

    # Forward run first, then the reverse-complement run.
    runs = (
        (forward_seed, "predict_gap/forward"),
        (reverse_seed, "predict_gap/reverse_complement"),
    )
    forward_result, reverse_result = [
        predict_gaps(gap_model, seed, length_to_predict,
                     base_path=base_path, directory=output_directory)
        for seed, output_directory in runs
    ]

    DataWriter(root_directory=base_path).save_complements(
        forward_result,
        reverse_result,
        gap_id,
        postfix="_LD_" + str(latent_dim),
        fasta_ref=fasta_path,
    )
def predict_arbitrary_length(weights_path, gap_id, fasta_path, embedding_dim, latent_dim, length_to_predict, beam_length, base_path=None):
    """Call ``predict`` for ``length_to_predict`` positions from each flank.

    The first FASTA record is used as-is and labelled ``left_forward``; the
    second record is reverse-complemented and labelled
    ``right_reverse_complement``. Output directories live under
    ``beam_search/predict_gap/`` (joined with the project's directory
    separator).

    Args:
        weights_path: Prefix to which ``'my_model_weights.h5'`` is appended
            verbatim to locate the weights file.
        gap_id: Prefix of both flank ids.
        fasta_path: FASTA file with at least two records (left, right flank).
        embedding_dim: Forwarded to ``GapPredictModel``.
        latent_dim: Forwarded to ``GapPredictModel``.
        length_to_predict: Forwarded to ``predict``.
        beam_length: Forwarded to ``predict``.
        base_path: Forwarded to ``predict`` as ``base_path``.
    """
    fasta_reader = SequenceImporter()
    complementer = SequenceReverser()
    records = fasta_reader.import_fasta([fasta_path])
    left_seed = records[0]
    right_seed = complementer.reverse_complement(records[1])

    gap_model = GapPredictModel(
        min_seed_length=None,
        stateful=False,
        embedding_dim=embedding_dim,
        latent_dim=latent_dim,
        with_gpu=True,
    )
    gap_model.load_weights(weights_path + 'my_model_weights.h5')

    separator = UTILS.get_terminal_directory_character()
    # The trailing empty element yields the trailing separator.
    output_root = separator.join(("beam_search", "predict_gap", ""))

    runs = (
        (left_seed, "left_forward", "forward"),
        (right_seed, "right_reverse_complement", "reverse_complement"),
    )
    for seed, label, subdirectory in runs:
        predict(gap_model, seed, length_to_predict, beam_length,
                gap_id + "_" + label,
                base_path=base_path,
                directory=output_root + subdirectory)
def train_model(base_directory, min_seed_length, reference, reads, epochs, batch_sizes, rnn_dims, embedding_dims, replicates, patience, seed_range_upper, log_samples=False):
    """Train a ``GapPredictModel`` for every hyper-parameter combination and plot each run.

    One model is fitted per element of
    ``batch_sizes x embedding_dims x rnn_dims x range(replicates)``. After
    each fit the weights are saved, the elapsed time is printed, and
    ``_plot_training_validation`` is called with an output directory named
    ``BS_<batch>_ED_<embedding>_LD_<latent>_E_<epochs>_R_<replicate>``.

    Args:
        base_directory: Output root; cleaned with
            ``dir_utils.clean_directory_string`` and also passed to the model
            as ``base_path``.
        min_seed_length: Forwarded to ``GapPredictModel``.
        reference: FASTA path; its first two records and their reverse
            complements are passed to the model as ``reference_sequences``.
        reads: FASTQ path; imported together with reverse complements.
        epochs: Forwarded to ``GapPredictModel`` and to the plot helper.
        batch_sizes: Iterable of batch sizes to try.
        rnn_dims: Iterable of ``latent_dim`` values to try.
        embedding_dims: Iterable of ``embedding_dim`` values to try.
        replicates: Number of repeated runs per combination.
        patience: Forwarded to ``GapPredictModel``.
        seed_range_upper: Forwarded to ``GapPredictModel``.
        log_samples: Forwarded to ``GapPredictModel``.
    """
    import itertools

    include_reverse_complement = True
    importer = SequenceImporter()
    reverse_complementer = SequenceReverser()

    reads = importer.import_fastq([reads], include_reverse_complement)
    reference_sequence = importer.import_fasta([reference])
    reference_sequences = [
        reference_sequence[0],
        reverse_complementer.reverse_complement(reference_sequence[0]),
        reference_sequence[1],
        reverse_complementer.reverse_complement(reference_sequence[1]),
    ]
    lengths = np.array([len(sequence) for sequence in reference_sequences])

    # Fixed run configuration.
    with_gpu = True
    log_training = False
    early_stopping = True
    legend = ['Mean Sequence']
    terminal_directory_character = dir_utils.get_terminal_directory_character()

    # product() iterates in the same order as the equivalent nested loops:
    # batch size outermost, replicate index innermost.
    grid = itertools.product(batch_sizes, embedding_dims, rnn_dims, range(replicates))
    for batch_size, embedding_dim, latent_dim, replicate in grid:
        # NOTE(review): this path is derived only from base_directory, so
        # every run appears to save to the same weights file -- confirm
        # that overwriting is intended.
        weights_path = dir_utils.clean_directory_string(base_directory)
        inner_directory = "BS_%s_ED_%s_LD_%s_E_%s_R_%s" % (
            batch_size, embedding_dim, latent_dim, epochs, replicate)
        writer_path = weights_path + inner_directory + terminal_directory_character
        dir_utils.mkdir(weights_path)
        dir_utils.mkdir(writer_path)

        model = GapPredictModel(
            min_seed_length=min_seed_length,
            stateful=False,
            batch_size=batch_size,
            epochs=epochs,
            embedding_dim=embedding_dim,
            latent_dim=latent_dim,
            with_gpu=with_gpu,
            log_samples=log_samples,
            reference_sequences=reference_sequences,
            log_training=log_training,
            early_stopping=early_stopping,
            patience=patience,
            seed_range_upper=seed_range_upper,
            base_path=base_directory)

        start_time = time.time()
        history = model.fit(reads)
        model.save_weights(weights_path + 'my_model_weights.h5')
        end_time = time.time()
        print("Fitting took " + str(end_time - start_time) + "s")

        accuracy, loss = model.validation_history()
        if log_training:
            training_accuracy = model.training_history()
        else:
            training_accuracy = history.history['acc']
        # Bound on every path so the plot call below cannot hit an unbound
        # name if log_training is ever switched on.
        training_loss = history.history['loss']

        # best_epoch is only passed when early stopping produced one, which
        # avoids referencing an unbound name if the flag is switched off.
        plot_options = {'legend': legend}
        if early_stopping:
            plot_options['best_epoch'] = model.get_best_epoch()
        _plot_training_validation(epochs, accuracy, loss, training_accuracy,
                                  training_loss, lengths, writer_path,
                                  **plot_options)
class TestSequenceImporter(TestCase):
    """Tests for ``SequenceImporter.import_fastq`` and ``import_fasta``."""

    # Input files shared by the FASTQ tests (one plain, one gzip-compressed).
    _FASTQ_PATHS = (
        '../data/mini_read_for_test.fastq',
        '../data/mini_read_for_test_2.fastq.gz',
    )

    # Reads expected from _FASTQ_PATHS, in import order.
    _FASTQ_READS = (
        "AATTGAGTCGTAGTATCCACACCAAGCCGGCGTTATCCGGTGAGGCGCAATGTTGCGGGGGCTTTATCCCTGGTGGCATTGGTTGCTGGAAAGAGAAA",
        "AATTGAGTCGTAGTATCCACACCAAGCCGGCGTTATCCGGTGAGGCGCAATGTTGCGGGGGCTTTATCCCTGGTGGCATTGGTTGCTGGAAAGAGAAA",
        "GTACAGCTCAGAAAGCGGAGTTGCGCCAAGATTGTTAACCAGCGCAATCACCCGATCGCCAGACTGGAGCGGTTGTTTGGTTTGTTGTTCTTCCTGCC",
        "TTACCTTGTGGAGCGACATCCAGAGGCACTTCACCGCTTGCCAGCGGCTTACCATCCAGCGCCACCATCCAGTGCAGGAGCTCGTTATCGCTATGACG",
    )

    def setUp(self):
        """Create fresh helper objects for every test."""
        self.importer = SequenceImporter()
        self.converter = BaseQualityConverter()
        self.reverser = SequenceReverser()

    def test_import_fastq(self):
        """Reads from both FASTQ files are returned in file order."""
        sequences = self.importer.import_fastq(list(self._FASTQ_PATHS))
        expected_sequences = np.array(list(self._FASTQ_READS))
        np.testing.assert_array_equal(expected_sequences, sequences)

    def test_import_fasta(self):
        """Records from both FASTA files are returned in file order."""
        sequences = self.importer.import_fasta([
            '../data/mini_read_for_test.fasta',
            '../data/mini_read_for_test_2.fasta.gz'
        ])
        expected_sequences = np.array([
            "CTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGAATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTG",
            "AATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTGCTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAG",
            "ATGATCGCCAGCGTATCGTTTATCTATTTGCTGAATTTCTCCGATAATTTATTGTTTATCGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGCTGTTGAGCTATACTGTGGGAAGTCT",
            "ATGATCGCCAGCGTATCGTTTATCTATTTGCTGAATTTCTCCGATAATTTATTGTTTATCGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGCTGTTGAGCTATACTGTGGGAAGTCT",
            "CTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGAATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTG",
            "AATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTGCTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAG",
            "ATGATCGCCAGCGTATCGTTTATCTATTTGCTGAATTTCTCCGATAATTTATTGTTTATCGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGCTGTTGAGCTATACTGTGGGAAGTCT"
        ])
        np.testing.assert_array_equal(expected_sequences, sequences)

    def test_import_fastq_with_reverse_complement(self):
        """With the flag set, each read is followed by its reverse complement."""
        sequences = self.importer.import_fastq(list(self._FASTQ_PATHS), True)
        expected = []
        for read in self._FASTQ_READS:
            expected.append(read)
            expected.append(self.reverser.reverse_complement(read))
        expected_sequences = np.array(expected)
        np.testing.assert_array_equal(expected_sequences, sequences)

    def test_no_files_fastq(self):
        """An empty path list yields no sequences."""
        sequences = self.importer.import_fastq([], True)
        np.testing.assert_array_equal(sequences, [])

    def test_bogus_file_fastq(self):
        """A missing FASTQ file raises FileNotFoundError."""
        with self.assertRaises(FileNotFoundError):
            self.importer.import_fastq(['data/blahblahblah.fastq'])

    def test_no_files_fasta(self):
        """An empty path list yields no sequences."""
        sequences = self.importer.import_fasta([])
        np.testing.assert_array_equal(sequences, [])

    def test_bogus_file_fasta(self):
        """A missing FASTA file raises FileNotFoundError."""
        with self.assertRaises(FileNotFoundError):
            self.importer.import_fasta(['data/blahblahblah.fasta'])
def setUp(self):
    """Attach a fresh importer, quality converter and reverser to the test."""
    fixtures = (
        ("importer", SequenceImporter),
        ("converter", BaseQualityConverter),
        ("reverser", SequenceReverser),
    )
    # Instantiate and assign one at a time, in declaration order.
    for attribute, factory in fixtures:
        setattr(self, attribute, factory())
def predict_reference(weights_path, gap_id, fasta_path, embedding_dim, latent_dim, min_seed_length, plots=False, base_path=None):
    """Run ``predict_flanks`` on both flanks in both directions and write the results.

    Only the first two records of ``fasta_path`` are used; they are treated
    as the left and right flank in that order. Each flank is predicted once
    as-is and once reverse-complemented, and the four results are passed to
    ``DataWriter.write_flank_predict_fasta``. A flank missing from the FASTA
    is written as ``None``.

    Args:
        weights_path: Prefix to which ``'my_model_weights.h5'`` is appended
            verbatim to locate the weights file (loaded into both models).
        gap_id: Forwarded to ``write_flank_predict_fasta``.
        fasta_path: Path of the FASTA file holding the flank sequences.
        embedding_dim: Forwarded to both ``GapPredictModel`` instances.
        latent_dim: Forwarded to both models and to the writer.
        min_seed_length: Forwarded to both models and to ``predict_flanks``.
        plots: Not referenced by this function; kept for interface
            compatibility.
        base_path: Root directory for ``predict_flanks`` and ``DataWriter``.
    """
    importer = SequenceImporter()
    reverser = SequenceReverser()
    sequences = importer.import_fasta([fasta_path])[0:2]

    stateful_model = GapPredictModel(min_seed_length=min_seed_length,
                                     stateful=True,
                                     batch_size=1,
                                     embedding_dim=embedding_dim,
                                     latent_dim=latent_dim,
                                     with_gpu=True)
    stateless_model = GapPredictModel(min_seed_length=min_seed_length,
                                      stateful=False,
                                      batch_size=1,
                                      embedding_dim=embedding_dim,
                                      latent_dim=latent_dim,
                                      with_gpu=True)
    weights_file = weights_path + 'my_model_weights.h5'
    stateful_model.load_weights(weights_file)
    stateless_model.load_weights(weights_file)

    terminal_directory_character = UTILS.get_terminal_directory_character()
    first_directory = "regenerate_seq" + terminal_directory_character

    # (forward, reverse-complement) result per flank; stays (None, None)
    # for a flank that has no record in the FASTA.
    predictions = {"left_flank": (None, None), "right_flank": (None, None)}

    # zip() pairs each record with its label and stops at the shorter input.
    for flank, sequence in zip(("left_flank", "right_flank"), sequences):
        static_path = first_directory + flank + terminal_directory_character
        forward_predict = predict_flanks(stateful_model,
                                         stateless_model,
                                         min_seed_length,
                                         sequence,
                                         base_path=base_path,
                                         directory=static_path + "forward")
        reverse_predict = predict_flanks(stateful_model,
                                         stateless_model,
                                         min_seed_length,
                                         reverser.reverse_complement(sequence),
                                         base_path=base_path,
                                         directory=static_path + "reverse_complement")
        predictions[flank] = (forward_predict, reverse_predict)

    forward_left_flank, rc_left_flank = predictions["left_flank"]
    forward_right_flank, rc_right_flank = predictions["right_flank"]

    writer = DataWriter(root_directory=base_path, directory=first_directory)
    writer.write_flank_predict_fasta(forward_left_flank, rc_left_flank,
                                     forward_right_flank, rc_right_flank,
                                     latent_dim, gap_id)