Beispiel #1
0
def predict_reference(weights_path,
                      gap_id,
                      fasta_path,
                      embedding_dim,
                      latent_dim,
                      min_seed_length,
                      beam_length,
                      base_path=None):
    """Re-predict each flank of a gap from its own seed using ``predict``.

    The first two sequences in ``fasta_path`` are treated as the left and the
    right flank, in that order. For every flank ``predict`` is called twice:
    once seeded with the first ``min_seed_length`` bases of the flank
    ("forward") and once seeded with the first ``min_seed_length`` bases of
    its reverse complement ("reverse_complement"). In both cases the requested
    prediction length is the rest of the flank
    (``len(sequence) - min_seed_length``).

    Args:
        weights_path: Prefix of the weights file. ``'my_model_weights.h5'`` is
            appended verbatim, so the prefix must already end with a path
            separator.
        gap_id: String used as the prefix of the id handed to ``predict``.
        fasta_path: FASTA file holding the flanks; only its first two
            sequences are used.
        embedding_dim: Forwarded to ``GapPredictModel``.
        latent_dim: Forwarded to ``GapPredictModel``.
        min_seed_length: Seed length, also forwarded to ``GapPredictModel``.
        beam_length: Forwarded to ``predict``.
        base_path: Forwarded to ``predict`` as ``base_path``.
    """
    importer = SequenceImporter()
    reverser = SequenceReverser()

    sequences = importer.import_fasta([fasta_path])[0:2]

    model = GapPredictModel(min_seed_length=min_seed_length,
                            stateful=False,
                            embedding_dim=embedding_dim,
                            latent_dim=latent_dim,
                            with_gpu=True)

    model.load_weights(weights_path + 'my_model_weights.h5')

    separator = UTILS.get_terminal_directory_character()
    first_directory = "beam_search" + separator + "regenerate_seq" + separator

    # Pairing names with sequences via zip keeps the "first is left, second
    # is right" convention in one place and cannot leave the flank name
    # unbound, unlike an index-based if/elif ladder.
    for flank, sequence in zip(("left_flank", "right_flank"), sequences):
        static_path = first_directory + flank + separator
        prediction_length = len(sequence) - min_seed_length

        seeds = (
            ("forward", sequence[:min_seed_length]),
            ("reverse_complement",
             reverser.reverse_complement(sequence)[:min_seed_length]),
        )
        for directionality, seed in seeds:
            flank_id = gap_id + "_" + flank + "_" + directionality
            predict(model,
                    seed,
                    prediction_length,
                    beam_length,
                    flank_id,
                    base_path=base_path,
                    directory=static_path + directionality)
Beispiel #2
0
def predict_arbitrary_length(weights_path,
                             gap_id,
                             fasta_path,
                             embedding_dim,
                             latent_dim,
                             length_to_predict,
                             base_path=None):
    """Predict ``length_to_predict`` bases from both sides of a gap and save them.

    The first sequence of ``fasta_path`` seeds the forward prediction; the
    reverse complement of the second sequence seeds the reverse-complement
    prediction. Both results are handed to ``DataWriter.save_complements``.

    Args:
        weights_path: Prefix of the weights file. ``'my_model_weights.h5'`` is
            appended verbatim, so the prefix must already end with a path
            separator.
        gap_id: Forwarded to ``DataWriter.save_complements``.
        fasta_path: FASTA file with at least two sequences (left flank first,
            right flank second); also passed on as ``fasta_ref``.
        embedding_dim: Forwarded to ``GapPredictModel``.
        latent_dim: Forwarded to ``GapPredictModel`` and used to build the
            ``"_LD_<latent_dim>"`` postfix.
        length_to_predict: Forwarded to ``predict_gaps``.
        base_path: Forwarded to ``predict_gaps`` and used as the writer's
            root directory.
    """
    fasta_reader = SequenceImporter()
    complementer = SequenceReverser()

    flanks = fasta_reader.import_fasta([fasta_path])
    left_seed = flanks[0]
    right_seed = complementer.reverse_complement(flanks[1])

    model = GapPredictModel(min_seed_length=None,
                            stateful=False,
                            batch_size=1,
                            embedding_dim=embedding_dim,
                            latent_dim=latent_dim,
                            with_gpu=True)
    model.load_weights(weights_path + 'my_model_weights.h5')

    # Forward first, then reverse complement -- same order as the outputs
    # are handed to the writer below.
    runs = (
        (left_seed, "predict_gap/forward"),
        (right_seed, "predict_gap/reverse_complement"),
    )
    forward_predict, reverse_predict = [
        predict_gaps(model,
                     seed,
                     length_to_predict,
                     base_path=base_path,
                     directory=output_directory)
        for seed, output_directory in runs
    ]

    DataWriter(root_directory=base_path).save_complements(
        forward_predict,
        reverse_predict,
        gap_id,
        postfix="_LD_" + str(latent_dim),
        fasta_ref=fasta_path)
Beispiel #3
0
def predict_arbitrary_length(weights_path,
                             gap_id,
                             fasta_path,
                             embedding_dim,
                             latent_dim,
                             length_to_predict,
                             beam_length,
                             base_path=None):
    """Run ``predict`` for ``length_to_predict`` bases from both sides of a gap.

    The first sequence of ``fasta_path`` seeds the "forward" run; the reverse
    complement of the second sequence seeds the "reverse_complement" run.

    Args:
        weights_path: Prefix of the weights file. ``'my_model_weights.h5'`` is
            appended verbatim, so the prefix must already end with a path
            separator.
        gap_id: String used as the prefix of the id handed to ``predict``.
        fasta_path: FASTA file with at least two sequences (left flank first,
            right flank second).
        embedding_dim: Forwarded to ``GapPredictModel``.
        latent_dim: Forwarded to ``GapPredictModel``.
        length_to_predict: Forwarded to ``predict``.
        beam_length: Forwarded to ``predict``.
        base_path: Forwarded to ``predict`` as ``base_path``.
    """
    fasta_reader = SequenceImporter()
    complementer = SequenceReverser()

    flanks = fasta_reader.import_fasta([fasta_path])
    left_seed = flanks[0]
    right_seed = complementer.reverse_complement(flanks[1])

    model = GapPredictModel(min_seed_length=None,
                            stateful=False,
                            embedding_dim=embedding_dim,
                            latent_dim=latent_dim,
                            with_gpu=True)
    model.load_weights(weights_path + 'my_model_weights.h5')

    separator = UTILS.get_terminal_directory_character()
    output_root = "beam_search" + separator + "predict_gap" + separator

    # (id suffix, seed sequence, output sub-directory) for each run, in the
    # order the runs are executed.
    runs = (
        ("left_forward", left_seed, "forward"),
        ("right_reverse_complement", right_seed, "reverse_complement"),
    )
    for id_suffix, seed, subdirectory in runs:
        predict(model,
                seed,
                length_to_predict,
                beam_length,
                gap_id + "_" + id_suffix,
                base_path=base_path,
                directory=output_root + subdirectory)
Beispiel #4
0
def train_model(base_directory,
                min_seed_length,
                reference,
                reads,
                epochs,
                batch_sizes,
                rnn_dims,
                embedding_dims,
                replicates,
                patience,
                seed_range_upper,
                log_samples=False):
    """Train a ``GapPredictModel`` for every hyper-parameter combination.

    For each combination of ``batch_sizes`` x ``embedding_dims`` x
    ``rnn_dims``, repeated ``replicates`` times, a fresh model is fitted on
    the reads, its weights are saved, and its training/validation history is
    passed to ``_plot_training_validation``.

    Args:
        base_directory: Output root; cleaned with
            ``dir_utils.clean_directory_string`` and also passed to the model
            as ``base_path``.
        min_seed_length: Forwarded to ``GapPredictModel``.
        reference: Path of a FASTA file with at least two sequences. The two
            sequences and their reverse complements are given to the model as
            ``reference_sequences``.
        reads: Path of the FASTQ file to train on (imported together with
            reverse complements).
        epochs: Forwarded to ``GapPredictModel`` and to the plotting helper.
        batch_sizes: Iterable of batch sizes to try.
        rnn_dims: Iterable of ``latent_dim`` values to try.
        embedding_dims: Iterable of ``embedding_dim`` values to try.
        replicates: Number of repetitions per combination.
        patience: Forwarded to ``GapPredictModel``.
        seed_range_upper: Forwarded to ``GapPredictModel``.
        log_samples: Forwarded to ``GapPredictModel``.
    """
    import itertools

    include_reverse_complement = True

    importer = SequenceImporter()
    reverse_complementer = SequenceReverser()
    # Kept under a separate name so the ``reads`` path argument is not
    # shadowed by the imported data.
    training_reads = importer.import_fastq([reads],
                                           include_reverse_complement)

    reference_sequence = importer.import_fasta([reference])
    reference_sequences = [
        reference_sequence[0],
        reverse_complementer.reverse_complement(reference_sequence[0]),
        reference_sequence[1],
        reverse_complementer.reverse_complement(reference_sequence[1])
    ]
    lengths = np.array([len(sequence) for sequence in reference_sequences])

    with_gpu = True
    log_training = False
    early_stopping = True
    legend = ['Mean Sequence']

    terminal_directory_character = dir_utils.get_terminal_directory_character()

    # Same iteration order as four nested loops: batch size outermost,
    # replicate index innermost.
    combinations = itertools.product(batch_sizes, embedding_dims, rnn_dims,
                                     range(replicates))
    for batch_size, embedding_dim, latent_dim, replicate in combinations:
        weights_path = dir_utils.clean_directory_string(base_directory)
        inner_directory = "_".join([
            "BS", str(batch_size),
            "ED", str(embedding_dim),
            "LD", str(latent_dim),
            "E", str(epochs),
            "R", str(replicate),
        ])

        writer_path = weights_path + inner_directory + terminal_directory_character

        dir_utils.mkdir(weights_path)
        dir_utils.mkdir(writer_path)

        model = GapPredictModel(
            min_seed_length=min_seed_length,
            stateful=False,
            batch_size=batch_size,
            epochs=epochs,
            embedding_dim=embedding_dim,
            latent_dim=latent_dim,
            with_gpu=with_gpu,
            log_samples=log_samples,
            reference_sequences=reference_sequences,
            log_training=log_training,
            early_stopping=early_stopping,
            patience=patience,
            seed_range_upper=seed_range_upper,
            base_path=base_directory)

        start_time = time.time()
        history = model.fit(training_reads)
        # NOTE(review): the weights file name does not depend on the
        # hyper-parameters, so each run appears to overwrite the previous
        # run's weights -- confirm this is intended.
        model.save_weights(weights_path + 'my_model_weights.h5')
        end_time = time.time()
        print("Fitting took " + str(end_time - start_time) + "s")

        accuracy, loss = model.validation_history()
        if log_training:
            training_accuracy = model.training_history()
        else:
            training_accuracy = history.history['acc']
        training_loss = history.history['loss']
        # Always bind best_epoch so the plotting call cannot hit an unbound
        # name if early stopping is ever switched off.
        best_epoch = model.get_best_epoch() if early_stopping else None
        _plot_training_validation(epochs,
                                  accuracy,
                                  loss,
                                  training_accuracy,
                                  training_loss,
                                  lengths,
                                  writer_path,
                                  legend=legend,
                                  best_epoch=best_epoch)
class TestSequenceImporter(TestCase):
    """Tests for ``SequenceImporter.import_fastq`` and ``import_fasta``.

    File paths are relative, so the expected working directory is whatever
    makes ``../data/`` resolve to the test fixtures.
    """

    def setUp(self):
        self.importer = SequenceImporter()
        self.converter = BaseQualityConverter()
        self.reverser = SequenceReverser()

    def test_import_fastq(self):
        """Reads from a plain and a gzipped FASTQ file are returned in order."""
        sequences = self.importer.import_fastq([
            '../data/mini_read_for_test.fastq',
            '../data/mini_read_for_test_2.fastq.gz'
        ])
        expected_sequences = np.array([
            "AATTGAGTCGTAGTATCCACACCAAGCCGGCGTTATCCGGTGAGGCGCAATGTTGCGGGGGCTTTATCCCTGGTGGCATTGGTTGCTGGAAAGAGAAA",
            "AATTGAGTCGTAGTATCCACACCAAGCCGGCGTTATCCGGTGAGGCGCAATGTTGCGGGGGCTTTATCCCTGGTGGCATTGGTTGCTGGAAAGAGAAA",
            "GTACAGCTCAGAAAGCGGAGTTGCGCCAAGATTGTTAACCAGCGCAATCACCCGATCGCCAGACTGGAGCGGTTGTTTGGTTTGTTGTTCTTCCTGCC",
            "TTACCTTGTGGAGCGACATCCAGAGGCACTTCACCGCTTGCCAGCGGCTTACCATCCAGCGCCACCATCCAGTGCAGGAGCTCGTTATCGCTATGACG"
        ])
        np.testing.assert_array_equal(expected_sequences, sequences)

    def test_import_fasta(self):
        """Sequences from a plain and a gzipped FASTA file are returned in order."""
        sequences = self.importer.import_fasta([
            '../data/mini_read_for_test.fasta',
            '../data/mini_read_for_test_2.fasta.gz'
        ])
        expected_sequences = np.array([
            "CTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGAATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTG",
            "AATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTGCTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAG",
            "ATGATCGCCAGCGTATCGTTTATCTATTTGCTGAATTTCTCCGATAATTTATTGTTTATCGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGCTGTTGAGCTATACTGTGGGAAGTCT",
            "ATGATCGCCAGCGTATCGTTTATCTATTTGCTGAATTTCTCCGATAATTTATTGTTTATCGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGCTGTTGAGCTATACTGTGGGAAGTCT",
            "CTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGAATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTG",
            "AATTTCTCCGATAATTTATTGTTTATCATGATCGCCAGCGTATCGTTTATCTATTTGCTGCTGTTGAGCTATACTGTGGGAAGTCTGCTTGGCCCGTCATTTACCGCTATGCTAATGCAG",
            "ATGATCGCCAGCGTATCGTTTATCTATTTGCTGAATTTCTCCGATAATTTATTGTTTATCGCTTGGCCCGTCATTTACCGCTATGCTAATGCAGCTGTTGAGCTATACTGTGGGAAGTCT"
        ])
        np.testing.assert_array_equal(expected_sequences, sequences)

    def test_import_fastq_with_reverse_complement(self):
        """Each read is immediately followed by its reverse complement."""
        sequences = self.importer.import_fastq([
            '../data/mini_read_for_test.fastq',
            '../data/mini_read_for_test_2.fastq.gz'
        ], True)
        forward_reads = [
            "AATTGAGTCGTAGTATCCACACCAAGCCGGCGTTATCCGGTGAGGCGCAATGTTGCGGGGGCTTTATCCCTGGTGGCATTGGTTGCTGGAAAGAGAAA",
            "AATTGAGTCGTAGTATCCACACCAAGCCGGCGTTATCCGGTGAGGCGCAATGTTGCGGGGGCTTTATCCCTGGTGGCATTGGTTGCTGGAAAGAGAAA",
            "GTACAGCTCAGAAAGCGGAGTTGCGCCAAGATTGTTAACCAGCGCAATCACCCGATCGCCAGACTGGAGCGGTTGTTTGGTTTGTTGTTCTTCCTGCC",
            "TTACCTTGTGGAGCGACATCCAGAGGCACTTCACCGCTTGCCAGCGGCTTACCATCCAGCGCCACCATCCAGTGCAGGAGCTCGTTATCGCTATGACG"
        ]
        # Interleave: read, reverse complement of that read, next read, ...
        expected_sequences = np.array([
            sequence
            for read in forward_reads
            for sequence in (read, self.reverser.reverse_complement(read))
        ])
        np.testing.assert_array_equal(expected_sequences, sequences)

    def test_no_files_fastq(self):
        """An empty file list yields an empty result."""
        sequences = self.importer.import_fastq([], True)
        np.testing.assert_array_equal(sequences, [])

    def test_bogus_file_fastq(self):
        """A missing FASTQ file raises ``FileNotFoundError``."""
        # assertRaises replaces a try/except that called self.fail() inside
        # the try block, where the resulting AssertionError was itself caught
        # by the broad ``except Exception`` handler.
        with self.assertRaises(FileNotFoundError):
            self.importer.import_fastq(['data/blahblahblah.fastq'])

    def test_no_files_fasta(self):
        """An empty file list yields an empty result."""
        sequences = self.importer.import_fasta([])
        np.testing.assert_array_equal(sequences, [])

    def test_bogus_file_fasta(self):
        """A missing FASTA file raises ``FileNotFoundError``."""
        with self.assertRaises(FileNotFoundError):
            self.importer.import_fasta(['data/blahblahblah.fasta'])
Beispiel #6
0
def predict_reference(weights_path,
                      gap_id,
                      fasta_path,
                      embedding_dim,
                      latent_dim,
                      min_seed_length,
                      plots=False,
                      base_path=None):
    """Re-predict both flanks of a gap with ``predict_flanks`` and write them out.

    The first two sequences in ``fasta_path`` are treated as the left and the
    right flank, in that order. Each flank is run through ``predict_flanks``
    once as-is and once reverse-complemented, and the four results are passed
    to ``DataWriter.write_flank_predict_fasta``. A flank that is missing from
    the FASTA file contributes ``None`` for both of its results.

    Args:
        weights_path: Prefix of the weights file. ``'my_model_weights.h5'`` is
            appended verbatim, so the prefix must already end with a path
            separator.
        gap_id: Forwarded to ``DataWriter.write_flank_predict_fasta``.
        fasta_path: FASTA file holding the flanks; only its first two
            sequences are used.
        embedding_dim: Forwarded to both ``GapPredictModel`` instances.
        latent_dim: Forwarded to both ``GapPredictModel`` instances and to
            the writer.
        min_seed_length: Forwarded to both models and to ``predict_flanks``.
        plots: Accepted but not used by this function.
        base_path: Forwarded to ``predict_flanks`` and used as the writer's
            root directory.
    """
    importer = SequenceImporter()
    reverser = SequenceReverser()

    sequences = importer.import_fasta([fasta_path])[0:2]

    stateful_model = GapPredictModel(min_seed_length=min_seed_length,
                                     stateful=True,
                                     batch_size=1,
                                     embedding_dim=embedding_dim,
                                     latent_dim=latent_dim,
                                     with_gpu=True)
    stateless_model = GapPredictModel(min_seed_length=min_seed_length,
                                      stateful=False,
                                      batch_size=1,
                                      embedding_dim=embedding_dim,
                                      latent_dim=latent_dim,
                                      with_gpu=True)

    stateful_model.load_weights(weights_path + 'my_model_weights.h5')
    stateless_model.load_weights(weights_path + 'my_model_weights.h5')

    separator = UTILS.get_terminal_directory_character()
    first_directory = "regenerate_seq" + separator

    flank_names = ("left_flank", "right_flank")
    # (forward, reverse-complement) result per flank; stays (None, None) for
    # a flank that has no sequence in the FASTA file.
    predictions = {name: (None, None) for name in flank_names}

    # Pairing names with sequences via zip keeps the "first is left, second
    # is right" convention in one place instead of two index-based ladders.
    for flank, sequence in zip(flank_names, sequences):
        static_path = first_directory + flank + separator

        forward_predict = predict_flanks(stateful_model,
                                         stateless_model,
                                         min_seed_length,
                                         sequence,
                                         base_path=base_path,
                                         directory=static_path + "forward")
        reverse_predict = predict_flanks(stateful_model,
                                         stateless_model,
                                         min_seed_length,
                                         reverser.reverse_complement(sequence),
                                         base_path=base_path,
                                         directory=static_path +
                                         "reverse_complement")
        predictions[flank] = (forward_predict, reverse_predict)

    forward_left_flank, rc_left_flank = predictions["left_flank"]
    forward_right_flank, rc_right_flank = predictions["right_flank"]

    writer = DataWriter(root_directory=base_path, directory=first_directory)
    writer.write_flank_predict_fasta(forward_left_flank, rc_left_flank,
                                     forward_right_flank, rc_right_flank,
                                     latent_dim, gap_id)