Example #1
 def test_CreateHdpTrainingData(self):
     with tempfile.TemporaryDirectory() as tempdir:
         # create fast5 dir
         test_fast5 = os.path.join(tempdir, "test.fast5")
         copyfile(self.fast5_paths[0], test_fast5)
         # create fofn
         test_out = os.path.join(tempdir, "test.hdp.tsv")
         test_args = create_sa_sample_args(
             fast5_dirs=[tempdir],
             name="some_name",
             fw_reference=self.ecoli_reference,
             bwa_reference=self.ecoli_reference,
             number_of_kmer_assignments=1,
             probability_threshold=0,
             kmers_from_reference=False)
         working_folder = FolderHandler()
         working_folder.open_folder(os.path.join(tempdir, "test_dir"))
         sample = SignalAlignSample(working_folder=working_folder,
                                    **test_args)
         sample.analysis_files = [
             self.assignment_file, self.assignment_file
         ]
         out_path = CreateHdpTrainingData(
             [sample],
             test_out,
             template=True,
             complement=False,
             verbose=False).write_hdp_training_file()
         n_lines = count_lines_in_file(out_path)
         self.assertEqual(n_lines, 3182)
         with open(out_path, 'r') as fh1, open(self.test_hdp_training_data,
                                               'r') as fh2:
             self.assertEqual(sorted(list(fh1)), sorted(list(fh2)))
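count_lines_in_file is called above but not defined in this snippet; a minimal sketch of a helper with that behavior (my assumption of what it does, not the project's actual implementation):

def count_lines_in_file(path):
    # Stream the file and count its lines without loading it into memory.
    with open(path) as fh:
        return sum(1 for _ in fh)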
 def test_trim_num_files_in_sample(self):
     with tempfile.TemporaryDirectory() as tempdir:
         working_folder = FolderHandler()
         working_folder.open_folder(os.path.join(tempdir, "test_dir"))
         test_args = create_sa_sample_args(
             fast5_dirs=[self.fast5_dir],
             name="some_name",
             fw_reference=self.ecoli_reference)
         sample = SignalAlignSample(working_folder=working_folder,
                                    **test_args)
         n_bases = 10000
         fast5_files = trim_num_files_in_sample(sample,
                                                n_bases,
                                                False,
                                                verbose=False)
         bases = 0
         for fast5_file in fast5_files:
             bases += get_1d_length(fast5_file)
         self.assertLessEqual(bases, n_bases)
         fast5_files = trim_num_files_in_sample(sample,
                                                n_bases,
                                                True,
                                                verbose=False)
         bases = 0
         for fast5_file in fast5_files:
             bases += get_2d_length(fast5_file)
         self.assertLessEqual(bases, n_bases)
         self.assertRaises(AssertionError,
                           trim_num_files_in_sample,
                           sample,
                           1,
                           False,
                           verbose=False)
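Judging from the assertions above, trim_num_files_in_sample keeps whole reads while their summed 1D (or 2D) length stays under n_bases and raises AssertionError when the budget is too small. A generic, hypothetical sketch of that greedy selection over (path, length) pairs, not the project function:

def trim_files_to_base_budget(files_with_lengths, max_bases):
    # Greedily keep whole files while the running base count stays within the budget.
    kept, total = [], 0
    for path, length in files_with_lengths:
        if total + length > max_bases:
            break
        kept.append(path)
        total += length
    assert len(kept) > 0, "Base budget too small to include even one file"
    return kept

# e.g. trim_files_to_base_budget([("a.fast5", 4000), ("b.fast5", 7000)], 10000) -> ["a.fast5"]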
    def test_processReferenceFasta_positions(self):
        with tempfile.TemporaryDirectory() as tempdir:
            work_folder = FolderHandler()
            work_folder.open_folder(os.path.join(tempdir, "test_outdir"))
            forward_ref, backward_ref = processReferenceFasta(
                self.reference,
                work_folder,
                motifs=None,
                positions_file=self.ambiguity_positions_file)
            title, comment, seq = read_fasta(forward_ref).__next__()
            self.assertEqual(
                seq,
                "ABTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATTCGAGCTCGGTACCCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAAT"
            )
            title, comment, seq = read_fasta(backward_ref).__next__()
            self.assertEqual(
                seq,
                "TBATAACTTCGTAAATAGTCCCAATAACAGAGTACTCGCCTATGTATAAACTTACATAAATCTTTTTATTTGTTTATCCCCAAGGCGCGTGTAAAGGGGCTTTTCACGGTGGACTGCAGATTCTTTGGTAATAATAGTACTGTAATTGGATATTTTTATCCGCATAGTGCTCCGGGAAAGCAGAGCGCGCAAAGCCACTACTGCCACTTTTGGAGACTGTGTACGTCGAGGGCCTCTGCCAGTGTCGAACAGACATTCGCCTACGGCCCTCGTCTGTTCGGGCAGTCCCGCGCAGTCGCCCACAACCGCCCACAGCCCCGACCGAATTGATACGCCGTAGTCTCGTCTAACATGACTCTCACGTGGTATACGCCACACTTTATGGCGTGTCTACGCATTCCTCTTTTATGGCGTAGTCCGCGGTAAGCGGTAAGTCCGACGCGTTGACAACCCTTCCCGCTAGCCACGCCCGGAGAAGCGATAATGCGGTCGACCGCTTTCCCCCTACACGACGTTCCGCTAATTCAACCCATTGCGGTCCCAAAAGGGTCAGTGCTGCAACATTTTGCTGCCGGTCACTTAAGCTCGAGCCATGGGCCCCTAGGAGATCTCAGCTGGACGTCCGTACGTTCGAACCGCATTAGTACCAGTATCGACAAAGGACACACTTTAACAATAGGCGAGTGTTAAGGTGTGTTGTATGCTCGGCCTTCGTATTTCACATTTCGGACCCCACGGATTACTCACTCGATTGAGTGTAATTAACGCAACGCGAGTGACGGGCGAAAGGTCAGCCCTTTGGACAGCACGGTCGACGTAATTACTTAGCCGGTTGCGCGCCCCTCTCCGCCAAACGCATAACCCGCGAGAAGGCGAAGGAGCGAGTGACTGAGCGACGCGAGCCAGCAAGCCGACGCCGCTCGCCATAGTCGAGTGAGTTTCCGCCATTATGCCAATAGGTGTCTTAGTCCCCTATTGCGTCCTTTCTTGTACACTCGTTTTCCGGTCGTTTTCCGGTCCTTGGCATTTTTCCGGCGCAACGACCGCAAAAAGGTATCCGAGGCGGGGGGACTGCTCGTAGTGTTTTTAGCTGCGAGTTCAGTCTCCACCGCTTTGGGCTGTCCTGATATTTCTATGGTCCGCAAAGGGGGACCTTCGAGGGAGCACGCGAGAGGACAAGGCTGGGACGGCGAATGGCCTATGGACAGGCGGAAAGAGGGAAGCCCTTCGCACCGCGAAAGAGTATCGAGTGCGACATCCATAGAGTCAAGCCACATCCAGCAAGCGAGGTTCGACCCGACACACGTGCTTGGGGGGCAAGTCGGGCTGGCGACGCGGAATAGGCCATTGATAGCAGAACTCAGGTTGGGCCATTCTGTGCTGAATAGCGGTGACCGTCGTCGGTGACCATTGTCCTAATCGTCTCGCTCCATACATCCGCCACGATGTCTCAAGAACTTCACCACCGGATTGATGCCGATGTGATCTTCTTGTCATAAACCATAGACGCGAGACGACTTCGGTCAATGGAAGCCTTTTTCTCAACCATCGAGAACTAGGCCGTTTGTTTGGTGGCGACCATCGCCACCAAAAAAACAAACGTTCGTCGTCTAATGCGCGTCTTTTTTTCCTAGAGTTCTTCTAGGAAACTAGAAAAGATGCCCCAGACTGCGAGTCACCTTGCTTTTGAGTGCAATTCCCTAAAACCAGTACTCTAATAGTTTTTCCTAGAAGTGGATCTAGGAAAATTTAATTTTTACTTCAAAATTTAGTTAGATTTCATATATACTCATTTGAACCAGACTGTCAATGGTTACGAATTAGTCACTCCGTGGATAGAGTCGCTAGACAGATAAAGCAAGTAGGTATCAACGGACTGAGGGGCAGCACATCTATTGATGCTATGCCCTCCCGAATGGTAGACCGGGGTCACGACGTTACTATGGCGCTCTGGGTGCGAGTGGCCGAGGTCTAAATAGTCGTTATTTGGTCGGTCGGCCTTCCCGGCTCGCGTCTTCACCAGGACGTTGAAATAGGCGGAGGTAGGTCAGATAATTAACAACGGCCCTTCGATCTCATTCATCAAGCGGTCAATTATCAAACGCGTTGCAACAACGGTAACGATGTCCGTAGCACCACAGTGCGAGCAGCAAACCATACCGAAGTAAGTCGAGGCCAAGGGTTGCTAGTTCCGCTCAATGTACTAGGGGGTACAACACGTTTTTTCGCCAATCGAGGAAGCCAGGAGGCTAGCAACAGTCTTCATTCAACCGGCGTCACAATAGTGAGTACCAATACCGTCGTGACGTATTAAGAGAATGACAGTACGGTAGGCATTCTACGAAAAGACACTGACCACTCATGAGTTGGTTCAGTAAGACTCTTATCACATACGCCGCTGGCTCAACGAGAACGGGCCGCAGTTATGCCCTATTATGGCGCGGTGTATCGTCTTGAAATTTTCACGAGTAGTAACCTTTTGCAAGAAGCCCCGCTTTTGAGAGTTCCTAGAATGGCGACAACTCTAGGTCAAGCTACATTGGGTGAGCACGTGGGTTGACTAGAAGTCGTAGAAAATGAAAGTGGTCGCAAAGACCCACTCGTTTTTGTCCTTCCGTTTTACGGCGTTTTTTCCCTTATTCCCGCTGTGCCTTTACAACTTATGAGTATGAGAAGGAAAAAGTTA"
            )
            forward_ref, backward_ref = processReferenceFasta(
                self.reference, work_folder, motifs=None, positions_file=None)
            title, comment, seq = read_fasta(forward_ref).__next__()
            self.assertEqual(
                seq,
                "ATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTCTCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCCGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGCGCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATTCGAGCTCGGTACCCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGCTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGAACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAGGAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAAT"
            )
            self.assertIsNone(backward_ref)
            self.assertEqual(forward_ref, self.reference)

            self.assertRaises(RuntimeError,
                              processReferenceFasta,
                              self.reference,
                              work_folder,
                              motifs="something",
                              positions_file=self.ambiguity_positions_file)
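read_fasta yields (title, comment, sequence) tuples in the test above; a minimal sketch of a FASTA generator with that interface (assumed behavior, the real helper lives in signalAlign's utilities):

def read_fasta(path):
    # Yield (title, comment, sequence) for each record in a FASTA file.
    title, comment, seq = None, "", []
    with open(path) as fh:
        for line in fh:
            line = line.strip()
            if line.startswith(">"):
                if title is not None:
                    yield title, comment, "".join(seq)
                parts = line[1:].split(None, 1)
                title = parts[0] if parts else ""
                comment = parts[1] if len(parts) > 1 else ""
                seq = []
            elif line:
                seq.append(line)
    if title is not None:
        yield title, comment, "".join(seq)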
Example #4
    def __init__(self,
                 in_fast5,
                 destination,
                 stateMachineType,
                 bwa_index,
                 in_templateHmm,
                 in_complementHmm,
                 in_templateHdp,
                 in_complementHdp,
                 threshold,
                 diagonal_expansion,
                 constraint_trim,
                 degenerate,
                 twoD_chemistry,
                 forward_reference,
                 backward_reference=None,
                 target_regions=None,
                 output_format="full",
                 embed=False,
                 event_table=False):
        self.in_fast5 = in_fast5  # fast5 file to align
        self.destination = destination  # place where the alignments go, should already exist
        self.stateMachineType = stateMachineType  # flag for signalMachine
        self.bwa_index = bwa_index  # index of reference sequence
        self.threshold = threshold  # min posterior probability to keep
        self.diagonal_expansion = diagonal_expansion  # alignment algorithm param
        self.constraint_trim = constraint_trim  # alignment algorithm param
        self.output_format = output_format  # smaller output files
        self.degenerate = degenerate  # set of nucleotides for degenerate characters
        self.twoD_chemistry = twoD_chemistry  # flag for 2D sequencing runs
        self.temp_folder = FolderHandler(
        )  # object for holding temporary files (non-toil)
        self.read_name = self.in_fast5.split(
            "/")[-1][:-6]  # get the name without the '.fast5'
        self.target_regions = target_regions
        self.output_formats = {"full": 0, "variantCaller": 1, "assignments": 2}
        self.embed = embed  # embed the output into the fast5 file
        self.event_table = event_table  # specify which event table to use to generate alignments
        self.backward_reference = backward_reference  # fasta path to backward reference if modified bases are used
        self.forward_reference = forward_reference  # fasta path to forward reference

        if (in_templateHmm is not None) and os.path.isfile(in_templateHmm):
            self.in_templateHmm = in_templateHmm
        else:
            self.in_templateHmm = None
        if (in_complementHmm is not None) and os.path.isfile(in_complementHmm):
            self.in_complementHmm = in_complementHmm
        else:
            self.in_complementHmm = None

        # similarly for HDPs
        if (in_templateHdp is not None) and os.path.isfile(in_templateHdp):
            self.in_templateHdp = in_templateHdp
        else:
            self.in_templateHdp = None
        if (in_complementHdp is not None) and os.path.isfile(in_complementHdp):
            self.in_complementHdp = in_complementHdp
        else:
            self.in_complementHdp = None
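The four nearly identical HMM/HDP checks above all reduce to "keep the path only if it names an existing file". A hedged sketch of a small helper expressing that pattern (the helper name is illustrative, not part of signalAlign):

import os

def existing_file_or_none(path):
    # Return the path only if it points to an existing file, otherwise None.
    return path if path is not None and os.path.isfile(path) else None

# e.g. self.in_templateHmm = existing_file_or_none(in_templateHmm)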
    def test_mea_alignment_close_to_guide(self):
        from signalalign.validateSignalAlignment import get_all_event_summaries, ABS_SA_ALIGNMENT_DIFF, MEA
        from signalalign.utils.fileHandlers import FolderHandler
        from signalalign.signalAlignment import create_signalAlignment_args
        import shutil
        import tempfile
        import glob

        ecoli_reference = os.path.join(MeaTest.HOME, "tests/test_sequences/E.coli_K12.fasta")
        fast5_dir = os.path.join(MeaTest.HOME, "tests/minion_test_reads/1D")
        template_hmm = os.path.join(MeaTest.HOME, "models/testModelR9_acgt_template.model")
        path_to_bin = os.path.join(MeaTest.HOME, 'bin')
        threshold = 11

        # make directory to put temporary files and output location
        output_root = tempfile.TemporaryDirectory()
        temp_root = FolderHandler()
        temp_fast5_dir = temp_root.open_folder(os.path.join(output_root.name, "temp_fast5"))
        temp_signal_align_dir = os.path.join(output_root.name, "temp_signalAlign")
        if os.path.isdir(temp_signal_align_dir):
            shutil.rmtree(temp_signal_align_dir)
            assert not os.path.isdir(temp_signal_align_dir)
        temp_signal_align = temp_root.open_folder(temp_signal_align_dir)

        # get input files
        orig_fast5s = glob.glob(os.path.join(fast5_dir, "*.fast5"))
        self.assertTrue(len(orig_fast5s) > 0, "Incorrect fast5 location: {}".format(fast5_dir))
        fast5s = list()
        for file in orig_fast5s:
            dest = os.path.join(temp_fast5_dir, os.path.basename(file))
            shutil.copy(file, dest)
            fast5s.append(dest)

        # get alignment args
        alignment_args = create_signalAlignment_args(bwa_reference=ecoli_reference,
                                                     in_templateHmm=template_hmm,
                                                     destination=temp_signal_align_dir,
                                                     forward_reference=ecoli_reference,
                                                     path_to_bin=path_to_bin,
                                                     constraint_trim=0,
                                                     traceBackDiagonals=100,
                                                     diagonal_expansion=0,
                                                     embed=True)

        # get summaries
        all_event_summaries = get_all_event_summaries(fast5s, alignment_args, aln_dist_threshold=threshold,
                                                      generate_plot=False, verbose=False)

        for fast5 in all_event_summaries.keys():
            f5_name = os.path.basename(fast5)
            event_summaries = all_event_summaries[fast5]
            max_mea_aln_diff = max(list(map(lambda x: x[ABS_SA_ALIGNMENT_DIFF],
                                        list(filter(lambda x: x[MEA], event_summaries)))))
            self.assertTrue(max_mea_aln_diff <= threshold,
                            "MEA alignment is {} positions from the guide alignment (threshold {}) for {}".format(
                                max_mea_aln_diff, threshold, f5_name))
    def test_embed_with_both(self):
        signal_file_reads = os.path.join(self.HOME,
                                         "tests/minion_test_reads/pUC/")
        template_model = os.path.join(
            self.HOME, "models/testModelR9_5mer_acegt_template.model")
        complement_model = os.path.join(
            self.HOME, "models/testModelR9_5mer_acegt_complement.model")

        puc_reference = os.path.join(self.HOME,
                                     "tests/test_sequences/pUC19_SspI.fa")
        signal_file_guide_alignment = os.path.join(
            self.HOME, "tests/minion_test_reads/pUC/puc.bam")
        with tempfile.TemporaryDirectory() as tempdir:
            new_dir = os.path.join(tempdir, "new_dir")
            if os.path.exists(new_dir):
                shutil.rmtree(new_dir)
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))

            shutil.copytree(signal_file_reads, new_dir)

            args = create_signalAlignment_args(
                alignment_file=signal_file_guide_alignment,
                bwa_reference=puc_reference,
                forward_reference=puc_reference,
                in_templateHmm=template_model,
                path_to_bin=self.path_to_bin,
                destination=working_folder.path,
                embed=True,
                output_format="both",
                filter_reads=0,
                twoD_chemistry=True,
                in_complementHmm=complement_model,
                delete_tmp=True)
            final_args = merge_dicts([
                args,
                dict(in_fast5=os.path.join(
                    new_dir,
                    "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
                ))
            ])
            handle = SignalAlignment(**final_args)
            handle.run()
            f5fh = Fast5(
                os.path.join(
                    new_dir,
                    "makeson_PC_20160807_FNFAD20242_MN17284_sequencing_run_MA_470_R9_pUC_g_PCR_BC_08_07_16_93165_ch1_read176_strand.fast5"
                ))
            mea = f5fh.get_signalalign_events(mea=True)
            sam = f5fh.get_signalalign_events(sam=True)
            self.assertEqual(mea[0]["raw_start"], 2879)
            self.assertEqual(sam[0], "0")
            self.assertEqual(len(os.listdir(working_folder.path)), 2)
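merge_dicts is used above but not defined in the snippet; a minimal sketch assuming later dictionaries override earlier keys:

def merge_dicts(dicts):
    # Fold the dictionaries left to right; keys from later dicts win.
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged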
 def test_processReferenceFasta_positions(self):
     with tempfile.TemporaryDirectory() as tempdir:
         work_folder = FolderHandler()
         work_folder.open_folder(os.path.join(tempdir, "test_outdir"))
         forward_ref, backward_ref = processReferenceFasta(self.reference, work_folder, motifs=[["AC", "EC"]],
                                                           positions_file=None, name="")
         title, comment, seq = read_fasta(forward_ref).__next__()
         self.assertEqual(seq.find("AC"), -1)
         self.assertEqual(seq.find("EC"), 42)
         title, comment, seq = read_fasta(backward_ref).__next__()
         self.assertEqual(seq.find("AC"), -1)
         self.assertEqual(seq.find("EC"), 5)
 def test_multithread_signal_alignment_samples(self):
     with tempfile.TemporaryDirectory() as tempdir:
         working_folder = FolderHandler()
         test_fast5 = os.path.join(
             tempdir,
             "miten_PC_20160820_FNFAD20259_MN17223_mux_scan_AMS_158_R9_WGA_Ecoli_08_20_16_83098_ch138_read23_strand.fast5"
         )
         num_files = 1
         copyfile(self.fast5_paths[0], test_fast5)
         working_folder.open_folder(os.path.join(tempdir, "test_dir"))
         # create signalalign args
         signal_align_arguments = create_signalAlignment_args(
             in_templateHmm=self.template_hmm,
             destination=working_folder.path,
             path_to_bin=self.path_to_bin)
         # create samples
         samples = []
         options = create_sa_sample_args(fast5_dirs=[tempdir],
                                         name="some_name",
                                         fw_reference=self.ecoli_reference,
                                         bwa_reference=self.ecoli_reference,
                                         readdb=self.fast5_readdb,
                                         alignment_file=self.fast5_bam)
         samples.append(
             SignalAlignSample(working_folder=working_folder, **options))
         options["name"] = "some_name2"
         samples.append(
             SignalAlignSample(working_folder=working_folder, **options))
         # with captured_output() as (out, err):
         samples = multithread_signal_alignment_samples(
             samples, signal_align_arguments, 2)
         self.assertSetEqual(set([sample.name for sample in samples]),
                             {'some_name', 'some_name2'})
         for sample in samples:
             if sample.name == "some_name":
                 self.assertEqual(len(sample.analysis_files), num_files)
             if sample.name == "some_name2":
                 self.assertEqual(len(sample.analysis_files), num_files)
             for file_path in sample.analysis_files:
                 self.assertTrue(os.path.isfile(file_path))
         options["name"] = "some_name"
         samples.append(
             SignalAlignSample(working_folder=working_folder, **options))
         self.assertRaises(AssertionError,
                           multithread_signal_alignment_samples, samples,
                           signal_align_arguments, 2)
    def test_variant_calling_with_multiple_paths_rna(self):
        with tempfile.TemporaryDirectory() as tempdir:
            new_dir = os.path.join(tempdir, "new_dir")
            if os.path.exists(new_dir):
                shutil.rmtree(new_dir)
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))

            shutil.copytree(self.test_dir_rna, new_dir)

            args = create_signalAlignment_args(
                alignment_file=self.rna_bam,
                bwa_reference=self.rna_reference,
                forward_reference=os.path.join(
                    self.HOME,
                    "tests/test_sequences/fake_rna_replace/forward.fake_rna_atg.fake_rna_ref.fa"
                ),
                backward_reference=os.path.join(
                    self.HOME,
                    "tests/test_sequences/fake_rna_replace/backward.fake_rna_atg.fake_rna_ref.fa"
                ),
                in_templateHmm=os.path.join(
                    self.HOME,
                    "models/fake_testModelR9p4_5mer_acfgt_RNA.model"),
                path_to_bin=self.path_to_bin,
                destination=working_folder.path,
                embed=False,
                output_format="full",
                filter_reads=0,
                twoD_chemistry=False,
                delete_tmp=True,
                degenerate="m6a",
                check_for_temp_file_existance=False)

            multithread_signal_alignment(args,
                                         list_dir(new_dir, ext="fast5"),
                                         worker_count=8,
                                         forward_reference=None,
                                         debug=True,
                                         filter_reads_to_string_wrapper=None)
            self.assertEqual(len(os.listdir(working_folder.path)), 2)
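list_dir is not shown in this snippet; a minimal sketch of a directory listing helper with an extension filter matching its use above (assumed signature):

import os

def list_dir(path, ext=""):
    # Return full paths of directory entries whose names end with the given extension.
    return [os.path.join(path, f) for f in os.listdir(path) if f.endswith(ext)]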
Example #10
    def __init__(self, args):
        # TODO Need to create docs here
        """Initialize all objects the training routine may need"""
        # executable
        self.buildHdpUtil = None
        # HDP type
        self.int_hdp_type = None
        # load json and create dot dictionary of all the parameters
        self.args = args
        # check output directory
        self.args.output_dir = os.path.abspath(self.args.output_dir)
        assert os.path.exists(self.args.output_dir), "Output directory does not exist. " \
                                                     "output_dir: {}".format(self.args.output_dir)
        self.working_folder = FolderHandler()
        self.working_path = self.working_folder.open_folder(
            os.path.join(self.args.output_dir, "tempFiles_trainModels"))

        # create samples from self.args.samples
        self.samples = self._create_samples()

        # Current model paths
        self.template_hmm_model_path = self.args.template_hmm_model
        self.template_hdp_model_path = self.args.template_hdp_model
        self.complement_hmm_model_path = self.args.complement_hmm_model
        self.complement_hdp_model_path = self.args.complement_hdp_model

        # Current SignalHmm model objects
        self.complement_model = None
        self.template_model = None

        # globals for experiments
        self.path_to_bin = self.args.path_to_bin
        self.debug = self.args.debug
        self.two_d = self.args.two_d
        self.job_count = self.args.job_count
        # state machine type changes for SignalAlignment so it can expect an HDP or not
        self.state_machine_type = "threeState"
        self.kmer_length = None
        self.alphabet = None
        # check config file
        self._check_config()
    def test_multithread_signal_alignment(self):
        with tempfile.TemporaryDirectory() as tempdir:
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))
            # create signalalign args
            assert os.path.isfile(self.template_hmm)
            signal_align_arguments = create_signalAlignment_args(
                bwa_reference=self.ecoli_reference,
                in_templateHmm=self.template_hmm,
                destination=working_folder.path,
                forward_reference=self.ecoli_reference,
                path_to_bin=self.path_to_bin)

            fast5_files = self.fast5_paths[:1]
            with captured_output() as (out, err):
                output_files = multithread_signal_alignment(
                    signal_align_arguments,
                    fast5_files,
                    2,
                    forward_reference=self.ecoli_reference)
            self.assertEqual(len(output_files), len(fast5_files))
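captured_output is a test utility not defined here; a common context-manager pattern that matches its use in the test (my assumption, not necessarily the project's version):

import sys
from io import StringIO
from contextlib import contextmanager

@contextmanager
def captured_output():
    # Temporarily swap stdout/stderr for in-memory buffers so tests can inspect output.
    new_out, new_err = StringIO(), StringIO()
    old_out, old_err = sys.stdout, sys.stderr
    try:
        sys.stdout, sys.stderr = new_out, new_err
        yield new_out, new_err
    finally:
        sys.stdout, sys.stderr = old_out, old_err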
    def test_SignalAlignSample(self):
        with tempfile.TemporaryDirectory() as tempdir:
            # create fast5 dir
            test_fast5 = os.path.join(tempdir, "test.fast5")
            copyfile(self.fast5_paths[0], test_fast5)
            # create fofn
            test_out = os.path.join(tempdir, "test.fofn")
            with open(test_out, 'w+') as fofn_file:
                print(test_fast5, file=fofn_file)

            test_args = create_sa_sample_args(
                fast5_dirs=[tempdir, tempdir],
                name="some_name",
                fofns=[test_out, test_out],
                fw_reference=self.ecoli_reference,
                bwa_reference=self.ecoli_reference)

            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))
            sample = SignalAlignSample(working_folder=working_folder,
                                       **test_args)
    def test_signal_file_and_alignment(self):
        signal_file_reads = os.path.join(
            self.HOME, "tests/minion_test_reads/no_event_data_1D_ecoli")
        template_model = os.path.join(
            self.HOME, "models/testModelR9p4_5mer_acegt_template.model")
        ecoli_reference = os.path.join(
            self.HOME, "tests/test_sequences/E.coli_K12.fasta")
        signal_file_guide_alignment = os.path.join(
            self.HOME, "tests/minion_test_reads/oneD_alignments.sam")

        with tempfile.TemporaryDirectory() as tempdir:
            new_dir = os.path.join(tempdir, "new_dir")
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))

            shutil.copytree(signal_file_reads, new_dir)

            args = create_signalAlignment_args(
                alignment_file=signal_file_guide_alignment,
                bwa_reference=ecoli_reference,
                forward_reference=ecoli_reference,
                in_templateHmm=template_model,
                path_to_bin=self.path_to_bin,
                destination=working_folder.path)
            final_args = merge_dicts([
                args,
                dict(in_fast5=os.path.join(
                    new_dir,
                    "LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5"
                ))
            ])
            handle = SignalAlignment(**final_args)
            handle.run()
            self.assertEqual(len(os.listdir(working_folder.path)), 1)
            self.assertEqual(
                sorted(os.listdir(working_folder.path))[0],
                "9e4d14b1-8167-44ef-9fdb-5c29dd0763fd.sm.backward.tsv")
def main(args):
    args = parse_args()

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "npParamEstimation")

    fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

    if len(fast5s) > args.nb_files:
        shuffle(fast5s)
        fast5s = fast5s[:args.nb_files]
    for fast5 in fast5s:
        print(fast5)
        # estimate_params(fast5=args.files_dir + fast5, working_folder=temp_folder, bwa_index=bwa_ref_index,
        #                forward_reference_path=plus_strand_sequence, backward_reference_path=minus_strand_sequence,
        #                threshold=args.threshold)
        try:
            params = estimate_params(fast5=args.files_dir + fast5, twoD=True)
            print(params)
        except Exception as e:
            print(e)
    temp_folder.remove_folder()
    return True
def estimate_params(fast5, binary_path="./estimateNanoporeParams",
                    template_lookup_table="../models/testModelR9p4_acegt_template.model",
                    complement_lookup_table="../models/testModelR9_complement_pop2.model",
                    twoD=False, verbose=False):
    temp_folder = FolderHandler()
    temp_folder.open_folder("npParamEstimation")

    read_name = fast5.split("/")[-1][:-6]  # get the name without the '.fast5'

    npRead_path = temp_folder.add_file_path(read_name + ".npRead")
    npRead_fasta = temp_folder.add_file_path(read_name + ".seq.fasta")

    if twoD:
        success, version, complement = get_npRead_2dseq_and_models(fast5=fast5,
                                                                   npRead_path=npRead_path,
                                                                   twod_read_path=npRead_fasta)
        # print(version, complement)
    else:
        success, version, complement = prepareOneD(fast5=fast5, npRead_path=npRead_path, oneD_read_path=npRead_fasta)
        # print(version, complement)

    if success is False:
        return False

    command = "{bin} -T {tLuT} -C {cLuT} -q {npRead}" \
              "".format(bin=binary_path, tLuT=template_lookup_table, cLuT=complement_lookup_table, npRead=npRead_path)

    if verbose:
        print("running command {command}".format(command=command), file=sys.stderr)

    # os.system(command)
    result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
    params = result.split()
    param_dict = dict(list(zip([bytes.decode(x) for x in params[::2]], [float(x) for x in params[1::2]])))
    # print(type(param_dict["scale"]))
    # clean up temp folder
    temp_folder.remove_file(npRead_path)
    temp_folder.remove_file(npRead_fasta)
    temp_folder.remove_folder()
    return param_dict
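The parameter parsing above assumes estimateNanoporeParams prints alternating names and values; a standalone illustration of that zip-based parsing with made-up output:

result = b"scale 1.02 shift -3.5 var 1.1 drift 0.001"
params = result.split()
param_dict = dict(zip((bytes.decode(x) for x in params[::2]),
                      (float(x) for x in params[1::2])))
print(param_dict)  # {'scale': 1.02, 'shift': -3.5, 'var': 1.1, 'drift': 0.001}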
Example #16
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)

    # get absolute paths to inputs
    args.files_dir = resolvePath(args.files_dir)
    args.forward_reference = resolvePath(args.forward_ref)
    args.backward_reference = resolvePath(args.backward_ref)
    args.out = resolvePath(args.out)
    args.bwa_reference = resolvePath(args.bwa_reference)
    args.in_T_Hmm = resolvePath(args.in_T_Hmm)
    args.in_C_Hmm = resolvePath(args.in_C_Hmm)
    args.templateHDP = resolvePath(args.templateHDP)
    args.complementHDP = resolvePath(args.complementHDP)
    args.fofn = resolvePath(args.fofn)
    args.target_regions = resolvePath(args.target_regions)
    args.ambiguity_positions = resolvePath(args.ambiguity_positions)
    start_message = """
#   Starting Signal Align
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: True
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir,
               reference=args.bwa_reference,
               nbFiles=args.nb_files,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if args.files_dir is None and args.fofn is None:
        print("Need to provide directory with .fast5 files of fofn",
              file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(args.bwa_reference):
        print("Did not find valid reference file, looked for it {here}".format(
            here=args.bwa_reference),
              file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "/tempFiles_alignment")
    #
    if not args.forward_reference or not args.backward_reference:
        args.forward_reference, args.backward_reference = processReferenceFasta(
            fasta=args.bwa_reference,
            motifs=args.motifs,
            work_folder=temp_folder,
            positions_file=args.ambiguity_positions)

    # list of read files
    if args.fofn is not None:
        fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
    else:
        fast5s = [
            "/".join([args.files_dir, x]) for x in os.listdir(args.files_dir)
            if x.endswith(".fast5")
        ]

    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    # change paths to the source directory
    os.chdir(signalAlignSourceDir())
    alignment_args = {
        "destination": temp_dir_path,
        "stateMachineType": args.stateMachineType,
        "bwa_reference": args.bwa_reference,
        "in_templateHmm": args.in_T_Hmm,
        "in_complementHmm": args.in_C_Hmm,
        "in_templateHdp": args.templateHDP,
        "in_complementHdp": args.complementHDP,
        "output_format": args.outFmt,
        "threshold": args.threshold,
        "diagonal_expansion": args.diag_expansion,
        "constraint_trim": args.constraint_trim,
        "degenerate": getDegenerateEnum(args.degenerate),
        "twoD_chemistry": args.twoD,
        "target_regions": args.target_regions,
        "embed": args.embed,
        "event_table": args.event_table,
        "backward_reference": args.backward_reference,
        "forward_reference": args.forward_reference,
        "alignment_file": None,
        "check_for_temp_file_existance": True,
        "track_memory_usage": False,
        "get_expectations": False,
    }
    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)),
          file=sys.stdout)
    # setup workers for multiprocessing
    multithread_signal_alignment(alignment_args, fast5s, args.nb_jobs)

    print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
    print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
Example #17
 def new_working_folder(self, append):
     """Create new working folder in order to keep track of each new run of analysis"""
     self.working_folder = FolderHandler()
     self.working_path = self.working_folder.open_folder(
         os.path.join(self.args.output_dir,
                      "tempFiles_trainModels_" + str(append)))
Example #18
class TrainSignalAlign(object):
    """A single class which takes in the only config file used for training and allows for users to train
    either the transitions or emissions of the HMM model
    """
    # global hdp types for specific alphabets and 1D read options
    HDP_TYPES_ACEGOT = [
        ("singleLevelFixed", 0),
        ("singleLevelPrior", 1),
        ("multisetFixed", 2),
        ("multisetPrior", 3),
        ("compFixed", 4),
        ("compPrior", 5),
        ("middleNtsFixed", 6),
        ("middleNtsPrior", 7),
        ("groupMultisetFixed", 8),
        ("groupMultisetPrior", 9),
    ]

    HDP_TYPES_1D = [("singleLevelPrior2", 10), ("multisetPrior2", 11),
                    ("singleLevelFixedCanonical", 14)]

    HDP_TYPES_ACEGT = [
        ("singleLevelPrior2", 10),
        ("multisetPrior2", 11),
    ]

    HDP_TYPES_ACGT = [("singleLevelFixedCanonical", 14)]

    HDP_TYPES_ACEGIT = [
        ("multisetPriorEcoli", 12),
        ("singleLevelPriorEcoli", 13),
    ]

    def __init__(self, args):
        # TODO Need to create docs here
        """Initialize all objects the training routine may need"""
        # executable
        self.buildHdpUtil = None
        # HDP type
        self.int_hdp_type = None
        # load json and create dot dictionary of all the parameters
        self.args = args
        # check output directory
        self.args.output_dir = os.path.abspath(self.args.output_dir)
        assert os.path.exists(self.args.output_dir), "Output directory does not exist. " \
                                                     "output_dir: {}".format(self.args.output_dir)
        self.working_folder = FolderHandler()
        self.working_path = self.working_folder.open_folder(
            os.path.join(self.args.output_dir, "tempFiles_trainModels"))

        # create samples from self.args.samples
        self.samples = self._create_samples()

        # Current model paths
        self.template_hmm_model_path = self.args.template_hmm_model
        self.template_hdp_model_path = self.args.template_hdp_model
        self.complement_hmm_model_path = self.args.complement_hmm_model
        self.complement_hdp_model_path = self.args.complement_hdp_model

        # Current SignalHmm model objects
        self.complement_model = None
        self.template_model = None

        # globals for experiments
        self.path_to_bin = self.args.path_to_bin
        self.debug = self.args.debug
        self.two_d = self.args.two_d
        self.job_count = self.args.job_count
        # state machine type changes for SignalAlignment so it can expect an HDP or not
        self.state_machine_type = "threeState"
        self.kmer_length = None
        self.alphabet = None
        # check config file
        self._check_config()

    def _create_samples(self):
        """Create SignalAlignSample for each sample"""
        return [
            SignalAlignSample(working_folder=self.working_folder, **s)
            for s in self.args.samples
        ]

    def new_working_folder(self, append):
        """Create new working folder in order to keep track of each new run of analysis"""
        self.working_folder = FolderHandler()
        self.working_path = self.working_folder.open_folder(
            os.path.join(self.args.output_dir,
                         "tempFiles_trainModels_" + str(append)))

    def train_hdp(self):
        """Train hdp.... duh?
        :param outpath: output file path
        :param number_of_assignments: total number of assignments to collect FOR EACH GROUP
        :param build_alignment: path to alignment file
        :param num_alignments: number of alignments in alignment file
        :param threshold:
        :param verbose:
        :param path_to_bin
        :param twoD:
        :param hdp_type: Build Hdp, specify type, options: "Prior, Fixed, twoWay. twoWay is a Prior-type model (recommended)"
        # initial HDP
        :param template_model: Input template lookup table
        :param complement_model: Input complement lookup table
        # fixed concentration models
        :param base_gamma:
        :param middle_gamma:
        :param leaf_gamma:
        # gamma prior models
        :param base_alpha:
        :param base_beta:
        :param middle_alpha:
        :param middle_beta:
        :param leaf_alpha:
        :param leaf_beta:
        # gibbs sampling
        :param gibbs_samples: number of gibbs samples
        :param thinning: how many thinning draws?
        # sample grid
        :param grid_start:
        :param grid_end:
        :param grid_length:
        :param kmer_length: length of kmer
        :return: paths to the trained template and complement HDP models
        """
        if self.args.hdp_args.built_alignments:
            assert os.path.isfile(self.args.hdp_args.built_alignments), \
                "Build alignment file does not exist. {}".format(self.args.hdp_args.built_alignments)
            build_alignment_path = self.args.hdp_args.built_alignments
            num_alignments = count_lines_in_file(build_alignment_path)
        else:
            # set which strands will be built
            template = True
            complement = False
            if self.two_d:
                complement = True
            # create instance
            hdp_data = CreateHdpTrainingData(self.samples,
                                             os.path.join(
                                                 self.working_path,
                                                 "buildAlignment.tsv"),
                                             template=template,
                                             complement=complement,
                                             verbose=self.debug)
            # write an hdp training file to path
            build_alignment_path = hdp_data.write_hdp_training_file()
            num_alignments = hdp_data.n_assignments

        verbose_flag = "--verbose " if self.debug is True else ""
        # create the output paths for the models
        template_hdp_location = os.path.join(
            self.working_path,
            "template." + self.args.hdp_args.hdp_type + ".nhdp")
        complement_hdp_location = None
        if self.two_d:
            one_d = None
            complement_hdp_location = os.path.join(
                self.working_path,
                "complement." + self.args.hdp_args.hdp_type + ".nhdp")
        else:
            one_d = '--oneD'

        # if we're making a HDP with fixed concentration parameters
        build_initial_hdp_command = "{buildHdpUtil} {verbose}-p {hdpType} -v {tHdpLoc} -w {cHdpLoc} -l {buildAln} " \
                                    "-a {kmerLength} -n {gibbs_samples} -I {burnIn} -t {thin} -s {start} -e {end} " \
                                    "-k {len} {oneD} -C {cL} -T {tL} " \
                                    "-g {Ba} -r {Bb} -j {Ma} -y {Mb} -i {La} -u {Lb} -B {base} -M {middle} -L {leaf} " \
                                    "".format(buildHdpUtil=self.buildHdpUtil,
                                              hdpType=self.int_hdp_type,
                                              tHdpLoc=template_hdp_location,
                                              cHdpLoc=complement_hdp_location,
                                              buildAln=build_alignment_path,
                                              gibbs_samples=self.args.hdp_args.gibbs_samples,
                                              burnIn=int(self.args.hdp_args.burnin_multiplier * num_alignments),
                                              thin=self.args.hdp_args.thinning,
                                              start=self.args.hdp_args.grid_start,
                                              end=self.args.hdp_args.grid_end,
                                              len=self.args.hdp_args.grid_length,
                                              verbose=verbose_flag,
                                              tL=self.template_hmm_model_path,
                                              cL=self.complement_hmm_model_path,
                                              kmerLength=self.kmer_length,
                                              oneD=one_d,
                                              Ba=self.args.hdp_args.base_alpha,
                                              Bb=self.args.hdp_args.base_beta,
                                              Ma=self.args.hdp_args.middle_alpha,
                                              Mb=self.args.hdp_args.middle_beta,
                                              La=self.args.hdp_args.leaf_alpha,
                                              Lb=self.args.hdp_args.leaf_beta,
                                              base=self.args.hdp_args.base_gamma,
                                              middle=self.args.hdp_args.middle_gamma,
                                              leaf=self.args.hdp_args.leaf_gamma)

        print("[[trainModels_buildHdpUtil] Command: {}\n".format(
            build_initial_hdp_command))
        procs = Popen(build_initial_hdp_command.split(),
                      stdout=sys.stdout,
                      stderr=sys.stderr)
        procs.wait()
        print(
            "[trainModels_buildHdpUtil] - finished training HDP emissions routine"
        )

        # check if the HDP created models
        assert os.path.exists(
            template_hdp_location
        ), "HDP training did not create template hdp model. {}".format(
            template_hdp_location)
        if complement_hdp_location:
            assert os.path.exists(
                complement_hdp_location
            ), "HDP training did not create complement hdp model. {}".format(
                complement_hdp_location)
        # set class parameters
        self.template_hdp_model_path = template_hdp_location
        self.complement_hdp_model_path = complement_hdp_location
        self.state_machine_type = "threeStateHdp"
        return self.template_hdp_model_path, self.complement_hdp_model_path

    def train_normal_hmm(self, transitions=True, emissions=False):
        """Train model transitions"""
        i = 0
        # start iterating
        while i < self.args.transitions_args.iterations:
            # align all the samples
            self.run_signal_align(
                get_expectations=True,
                trim=self.args.transitions_args.training_bases)
            all_sample_files = merge_lists(
                [sample.analysis_files for sample in self.samples])
            assert len(
                all_sample_files
            ) > 0, "Something failed in multithread signal alignment. We got no sample files"
            # load then normalize the expectations
            template_expectations_files = [
                x for x in all_sample_files
                if x.endswith(".template.expectations.tsv")
            ]

            if len(template_expectations_files) > 0:
                self.template_model.add_and_normalize_expectations(
                    files=template_expectations_files,
                    hmm_file=self.template_hmm_model_path,
                    update_transitions=transitions,
                    update_emissions=emissions)
            if self.two_d:
                complement_expectations_files = [
                    x for x in all_sample_files
                    if x.endswith(".complement.expectations.tsv")
                ]
                if len(complement_expectations_files) > 0:
                    self.complement_model.add_and_normalize_expectations(
                        files=complement_expectations_files,
                        hmm_file=self.complement_hmm_model_path,
                        update_transitions=transitions,
                        update_emissions=emissions)

            # log the running likelihood
            if len(self.template_model.running_likelihoods) > 0 and \
                    (self.two_d and len(self.complement_model.running_likelihoods) > 0):
                print(
                    "[trainModels_transitions] {i}| {t_likelihood}\t{c_likelihood}"
                    .format(t_likelihood=self.template_model.
                            running_likelihoods[-1],
                            c_likelihood=self.complement_model.
                            running_likelihoods[-1],
                            i=i))
                if self.args.transitions_args.test and (len(self.template_model.running_likelihoods) >= 2) and \
                        (self.two_d and len(self.complement_model.running_likelihoods) >= 2):
                    assert (self.template_model.running_likelihoods[-2] < self.template_model.running_likelihoods[
                        -1]) and \
                           (self.complement_model.running_likelihoods[-2] < self.complement_model.running_likelihoods[
                               -1]), "Testing: Likelihood error, went up"
            elif len(self.template_model.running_likelihoods) > 0:
                print("[trainModels_transitions] {i}| {t_likelihood}".format(
                    t_likelihood=self.template_model.running_likelihoods[-1],
                    i=i))
                if self.args.transitions_args.test and (len(
                        self.template_model.running_likelihoods) >= 2):
                    assert (self.template_model.running_likelihoods[-2] <
                            self.template_model.running_likelihoods[-1]
                            ), "Testing: Likelihood error, went up"

            i += 1

        print(
            "[trainModels_transitions] - finished training transitions routine"
        )
        return self.template_hmm_model_path, self.complement_hmm_model_path

    def expectation_maximization_training(self):
        """Complete the entire pipeline of training a new HMM-HDP model

        Note: If expectation_maximization is set to true, both the transitions and hdp/hmm_emissions will be trained
        """
        start = timer()

        if self.args.training.normal_emissions:
            print(
                "[trainModels] Training HMM emission distributions is not currently available."
            )
        if self.args.training.expectation_maximization:
            for i in range(1, self.args.training.em_iterations + 1):
                print(
                    "[trainModels] Training HMM transition distributions. iteration: {}"
                    .format(i))
                # first train the model transitions
                self.train_normal_hmm()
                print(
                    "[trainModels] Running Assignment with new HMM transition distributions. "
                    "iteration: {}".format(i))
                # next get assignments
                self.run_signal_align()
                print(
                    "[trainModels] Training HDP emission distributions. iteration: {}"
                    .format(i))
                # make new hdp
                self.train_hdp()
                print([sample.analysis_files for sample in self.samples])
                print(self.template_hdp_model_path)
                print(self.template_hmm_model_path)
                print(self.complement_hmm_model_path)
                print(self.complement_hdp_model_path)
                # self.new_working_folder(append=str(i))
        elif self.args.training.transitions or self.args.training.hdp_emissions:
            if self.args.training.transitions:
                print("[trainModels] Training HMM transition distributions.")
                # self.train_transitions()
                self.train_normal_hmm()
            if self.args.training.hdp_emissions:
                print("[trainModels] Training HDP emission distributions.")
                if not self.args.hdp_args.built_alignments:
                    self.run_signal_align()
                self.train_hdp()
        else:
            raise AssertionError(
                "Must set one of the following to True. "
                "training.transitions: {}, training.hdp_emissions: {}, "
                "training.expectation_maximization: "
                "{}.".format(self.args.training.transitions,
                             self.args.training.hdp_emissions,
                             self.args.training.expectation_maximization))

        stop = timer()
        print("[trainModels] Complete")
        print("Training Time = {} seconds".format(stop - start))
        print(self.template_hmm_model_path, self.complement_hmm_model_path,
              self.template_hdp_model_path, self.complement_hdp_model_path)

        return self.template_hmm_model_path, self.complement_hmm_model_path, \
               self.template_hdp_model_path, self.complement_hdp_model_path

    def load_hmm_models(self):
        """Load in the correct models depending on what is going to be trained. """
        # load template model
        assert self.template_hmm_model_path, "Missing template model %s" % (
            self.template_hmm_model_path)
        self.template_hmm_model_path = os.path.abspath(
            self.template_hmm_model_path)
        self.template_model = HmmModel(self.template_hmm_model_path)
        new_template_hmm = self.working_folder.add_file_path(
            "template_trained.hmm")
        copyfile(self.template_hmm_model_path, new_template_hmm)
        assert os.path.exists(
            new_template_hmm), "Problem copying default model to {}".format(
                new_template_hmm)
        self.template_hmm_model_path = new_template_hmm
        # set alphabet and kmer_length
        self.kmer_length = self.template_model.kmer_length
        self.alphabet = self.template_model.alphabet
        # load complement model if 2D
        if self.two_d:
            assert self.complement_hmm_model_path, "Missing complement model: {}".format(
                self.complement_hmm_model_path)
            self.complement_hmm_model_path = os.path.abspath(
                self.complement_hmm_model_path)
            self.complement_model = HmmModel(self.complement_hmm_model_path)
            new_complement_hmm = self.working_folder.add_file_path(
                "complement_trained.hmm")
            copyfile(self.complement_hmm_model_path, new_complement_hmm)
            assert os.path.exists(
                new_complement_hmm
            ), "Problem copying default model to {}".format(new_complement_hmm)
            self.complement_hmm_model_path = new_complement_hmm
            # make sure models match
            assert self.complement_model.kmer_length == self.template_model.kmer_length, \
                "Template model and complement model kmer lengths do not match." \
                " template: {} != complement: {}".format(self.template_model.kmer_length,
                                                         self.complement_model.kmer_length)
            assert self.complement_model.alphabet == self.template_model.alphabet, \
                "Template model and complement model alphabets do not match." \
                " template: {} != complement: {}".format(self.template_model.alphabet,
                                                         self.complement_model.alphabet)
        # get the input HDP models, if they can be found
        if self.template_hdp_model_path:
            self.state_machine_type = "threeStateHdp"
            assert os.path.exists(self.template_hdp_model_path), \
                "Template HDP path not found {}".format(self.template_hdp_model_path)
            self.template_hdp_model_path = os.path.abspath(
                self.template_hdp_model_path)
            new_template_hdp = self.working_folder.add_file_path("{}".format(
                os.path.basename(self.template_hdp_model_path)))
            copyfile(self.template_hdp_model_path, new_template_hdp)
            self.template_hdp_model_path = new_template_hdp
        # same for complement hdp
        if self.complement_hdp_model_path and self.two_d:
            assert os.path.exists(self.complement_hdp_model_path), \
                "Complement HDP path not found {}".format(self.complement_hdp_model_path)
            self.complement_hdp_model_path = os.path.abspath(
                self.complement_hdp_model_path)
            new_complement_hdp = \
                self.working_folder.add_file_path("{}".format(os.path.basename(self.complement_hdp_model_path)))
            copyfile(self.complement_hdp_model_path, new_complement_hdp)
            self.complement_hdp_model_path = new_complement_hdp

    def _check_train_transitions_config(self):
        assert isinstance(self.args.transitions_args.iterations, int), \
            "args.transitions_args.iterations must be an integer. {}".format(self.args.transitions_args.iterations)
        assert isinstance(self.args.job_count, int), \
            "args.job_count must be an integer. {}".format(self.args.job_count)

    def _check_train_hdp_config(self):
        """Check if the input parameters will for training the HDP."""
        # make sure hdp type works with alphabet and 1D
        self.int_hdp_type = get_hdp_type(self.args.hdp_args.hdp_type)
        if not self.args.two_d:
            assert (self.args.hdp_args.hdp_type, self.int_hdp_type) in set(self.HDP_TYPES_1D), \
                "HDP type is not compatible with 1D. {}: 1D types {}".format(self.args.hdp_type,
                                                                             self.HDP_TYPES_1D)
        if self.alphabet == "ACEGOT":
            assert (self.args.hdp_args.hdp_type, self.int_hdp_type) in set(self.HDP_TYPES_ACEGOT), \
                "HDP type is not compatible with alphabet=ACEGOT." \
                "Hdp_type: {}, ACEGOT HDP types:  {}".format(self.args.hdp_type, self.HDP_TYPES_ACEGOT)

        elif self.alphabet == "ACEGIT":
            assert (self.args.hdp_args.hdp_type, self.int_hdp_type) in set(self.HDP_TYPES_ACEGIT), \
                "HDP type is not compatible with alphabet=ACEGIT." \
                "Hdp_type: {}, ACEGIT HDP types:  {}".format(self.args.hdp_type, self.HDP_TYPES_ACEGIT)

        elif self.alphabet == "ACEGT":
            assert (self.args.hdp_args.hdp_type, self.int_hdp_type) in set(self.HDP_TYPES_ACEGT), \
                "HDP type is not compatible with alphabet=ACEGT." \
                "Hdp_type: {}, ACEGT HDP types:  {}".format(self.args.hdp_type, self.HDP_TYPES_ACEGT)

        elif self.alphabet == "ACGT":
            assert (self.args.hdp_args.hdp_type, self.int_hdp_type) in set(self.HDP_TYPES_ACGT), \
                "HDP type is not compatible with alphabet=ACGT." \
                "Hdp_type: {}, ACGT HDP types:  {}".format(self.args.hdp_type, self.HDP_TYPES_ACGT)
        else:
            raise AssertionError("Cannot create a HDP with proved alphabet")

        # check buildHdpUtil executable
        self.buildHdpUtil = os.path.join(self.args.path_to_bin,
                                         "./buildHdpUtil")
        assert (os.path.exists(
            self.buildHdpUtil)), "ERROR: Didn't find buildHdpUtil. {}".format(
                self.buildHdpUtil)
        # check other parameter inconsistencies
        if self.args.hdp_args.built_alignments:
            assert self.args.training.expectation_maximization is not True, "Cannot use 'built_alignments' file for " \
                                                                            "EM training. Either set " \
                                                                            "training.expectation_maximization to " \
                                                                            "false or change " \
                                                                            "hdp_args.built_alignments to null"
            assert os.path.isfile(self.args.hdp_args.built_alignments), \
                "Build alignment file does not exist. {}".format(self.args.hdp_args.built_alignments)

    def _check_config(self):
        """Make sure training configuration file is correctly filled out"""
        # check model files and load HMM models into memory for training transitions
        self.load_hmm_models()
        # check path to bin
        assert os.path.isdir(self.path_to_bin), "path_to_bin does not exist. " \
                                                "path_to_bin: {}".format(self.path_to_bin)

        # check if signalMachine is found
        assert os.path.exists(os.path.join(self.args.path_to_bin, "./signalMachine")), \
            "ERROR: Didn't find signalMachine executable. {}".format(os.path.join(self.args.path_to_bin,
                                                                                  "./signalMachine"))

        if self.args.training.transitions or self.args.training.expectation_maximization:
            self._check_train_transitions_config()

        if self.args.training.hdp_emissions or self.args.training.expectation_maximization:
            self._check_train_hdp_config()

        return self.args

    def run_signal_align(self,
                         output_format="assignments",
                         get_expectations=False,
                         trim=False):
        """Run signal align with specified arguments"""
        alignment_args = create_signalAlignment_args(
            destination=self.working_path,
            stateMachineType=self.state_machine_type,
            in_templateHmm=self.template_hmm_model_path,
            in_complementHmm=self.complement_hmm_model_path,
            in_templateHdp=self.template_hdp_model_path,
            in_complementHdp=self.complement_hdp_model_path,
            diagonal_expansion=self.args.diagonal_expansion,
            constraint_trim=self.args.constraint_trim,
            twoD_chemistry=self.two_d,
            get_expectations=get_expectations,
            path_to_bin=self.path_to_bin,
            check_for_temp_file_existance=True,
            threshold=self.args.signal_alignment_args.threshold,
            track_memory_usage=self.args.signal_alignment_args.
            track_memory_usage,
            embed=self.args.signal_alignment_args.embed,
            event_table=self.args.signal_alignment_args.event_table,
            output_format=output_format)

        self.samples = multithread_signal_alignment_samples(self.samples,
                                                            alignment_args,
                                                            self.job_count,
                                                            trim=trim)
        return self.samples
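The run_signal_align method above simply packages the trainer's current model paths with create_signalAlignment_args and fans the samples out through multithread_signal_alignment_samples. A minimal sketch of driving those two helpers directly is shown below; the import path and every value are illustrative assumptions rather than part of the original listing, and samples stands for a list of already-prepared SignalAlignSample objects.

# Illustrative sketch only: import path and argument values are assumptions.
from signalalign.signalAlignment import (create_signalAlignment_args,
                                         multithread_signal_alignment_samples)

alignment_args = create_signalAlignment_args(
    destination="/tmp/sa_out/",            # output directory, must already exist
    stateMachineType="threeState",
    in_templateHmm="models/template.hmm",
    path_to_bin="signalAlign/bin",
    output_format="assignments")
samples = []  # SignalAlignSample objects prepared elsewhere
samples = multithread_signal_alignment_samples(samples, alignment_args, 4)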
Example #19
0
    def test_get_sample_kmers(self):
        with tempfile.TemporaryDirectory() as tempdir:
            # create fast5 dir
            test_fast5 = os.path.join(tempdir, "test.fast5")
            copyfile(self.fast5_paths[0], test_fast5)
            # create fofn
            test_out = os.path.join(tempdir, "test.hdp.tsv")
            test_args = create_sa_sample_args(
                fast5_dirs=[tempdir],
                name="some_name",
                fw_reference=self.ecoli_reference,
                bwa_reference=self.ecoli_reference,
                number_of_kmer_assignments=1,
                probability_threshold=0,
                kmers_from_reference=False)
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))
            sample = SignalAlignSample(working_folder=working_folder,
                                       **test_args)
            sample.analysis_files = [
                self.assignment_file, self.assignment_file
            ]
            hdp_data_handle = CreateHdpTrainingData([sample],
                                                    test_out,
                                                    template=True,
                                                    complement=False,
                                                    verbose=False)
            kmers = hdp_data_handle.get_sample_kmers(sample)
            self.assertEqual(
                kmers, {x
                        for x in all_string_permutations("ATGC", length=6)})
            test_args = create_sa_sample_args(
                fast5_dirs=[tempdir],
                name="some_name",
                fw_reference=self.ecoli_reference,
                bwa_reference=self.ecoli_reference,
                number_of_kmer_assignments=1,
                probability_threshold=0,
                kmers_from_reference=False,
                motifs=[["ATGC", "ETGC"]])
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))
            sample = SignalAlignSample(working_folder=working_folder,
                                       **test_args)
            sample.analysis_files = [
                self.assignment_file, self.assignment_file
            ]
            hdp_data_handle = CreateHdpTrainingData([sample],
                                                    test_out,
                                                    template=True,
                                                    complement=False,
                                                    verbose=False)
            kmers = hdp_data_handle.get_sample_kmers(sample)
            self.assertEqual(
                kmers,
                get_motif_kmers(["ATGC", "ETGC"], 6, alphabet="ATGC")
                | {x
                   for x in all_string_permutations("ATGC", length=6)})
            test_args = create_sa_sample_args(
                fast5_dirs=[tempdir],
                name="some_name",
                fw_reference=self.ecoli_reference,
                bwa_reference=self.ecoli_reference,
                number_of_kmer_assignments=1,
                probability_threshold=0,
                kmers_from_reference=True,
                motifs=[["ATGC", "ETGC"]])
            working_folder = FolderHandler()
            working_folder.open_folder(os.path.join(tempdir, "test_dir"))
            sample = SignalAlignSample(working_folder=working_folder,
                                       **test_args)
            sample.analysis_files = [
                self.assignment_file, self.assignment_file
            ]
            hdp_data_handle = CreateHdpTrainingData([sample],
                                                    test_out,
                                                    template=True,
                                                    complement=False,
                                                    verbose=False)
            kmers = hdp_data_handle.get_sample_kmers(sample)
            expected_kmers = set()
            for _, _, sequence in read_fasta(self.ecoli_reference):
                expected_kmers |= get_sequence_kmers(sequence,
                                                     k=6,
                                                     rev_comp=True)

            self.assertEqual(
                kmers,
                get_motif_kmers(["ATGC", "ETGC"], 6, alphabet="ATGC")
                | expected_kmers)
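The assertions above compare the returned set against every 6-mer over the canonical alphabet (4^6 = 4096 k-mers), optionally unioned with the modified-motif k-mers or the k-mers present in the reference. The stand-alone sketch below shows the size of that canonical set, using itertools.product as a stand-in for all_string_permutations.

import itertools

# Stand-in for all_string_permutations("ATGC", length=6): every 6-mer over ATGC.
canonical_kmers = {"".join(p) for p in itertools.product("ATGC", repeat=6)}
print(len(canonical_kmers))  # 4**6 == 4096
# With a motif pair like ["ATGC", "ETGC"], the expected set additionally contains
# the E-substituted 6-mers returned by get_motif_kmers, so it is strictly larger.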
Example #20
0
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    start_message = """
#   Starting Zayante Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files, banding=args.banded,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType, regions=args.target_regions,
               tHdp=args.templateHDP, cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "tempFiles_errorCorrection")

    reference_sequence = args.ref

    STEP = 10
    for cycle in range(0, 8):
        for it in range(0, STEP):
            # make paths for reference files
            forward_reference = temp_folder.add_file_path("forward_reference.{cycle}.{iter}.txt".format(cycle=cycle,
                                                                                                        iter=it))
            backward_reference = temp_folder.add_file_path("backward_reference.{cycle}.{iter}.txt".format(cycle=cycle,
                                                                                                          iter=it))

            # make N-ed reference sequence for this iteration
            deg, reference_sequence_length = make_degenerate_reference(reference_sequence, it,
                                                                       forward_reference, backward_reference,
                                                                       step=STEP)
            assert deg, "Problem making degenerate reference for cycle {cycle} iteration {iter}" \
                        "".format(cycle=cycle, iter=it)

            # index the reference for bwa
            print("signalAlign - indexing reference", file=sys.stderr)
            bwa_ref_index = get_bwa_index(args.ref, temp_dir_path)
            print("signalAlign - indexing reference, done", file=sys.stderr)

            # setup workers for multiprocessing
            workers = args.nb_jobs
            work_queue = Manager().Queue()
            done_queue = Manager().Queue()
            jobs = []

            # list of alignment files
            fast5s = [x for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

            # take only some
            if args.nb_files < len(fast5s):
                shuffle(fast5s)
                fast5s = fast5s[:args.nb_files]

            for fast5 in fast5s:
                alignment_args = {
                    "forward_reference": forward_reference,
                    "backward_reference": backward_reference,
                    "path_to_EC_refs": None,
                    "destination": temp_dir_path,
                    "stateMachineType": args.stateMachineType,
                    "bwa_index": bwa_ref_index,
                    "in_templateHmm": args.in_T_Hmm,
                    "in_complementHmm": args.in_C_Hmm,
                    "in_templateHdp": args.templateHDP,
                    "in_complementHdp": args.complementHDP,
                    "banded": args.banded,
                    "sparse_output": True,
                    "in_fast5": args.files_dir + fast5,
                    "threshold": args.threshold,
                    "diagonal_expansion": args.diag_expansion,
                    "constraint_trim": args.constraint_trim,
                    "target_regions": None,
                    "degenerate": degenerate_enum(args.degenerate),
                }
                #alignment = SignalAlignment(**alignment_args)
                #alignment.run()
                work_queue.put(alignment_args)

            for w in range(workers):
                p = Process(target=aligner, args=(work_queue, done_queue))
                p.start()
                jobs.append(p)
                work_queue.put('STOP')

            for p in jobs:
                p.join()

            done_queue.put('STOP')

            print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
            print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

            print("\n#  Starting Variant Calling\n", file=sys.stdout)
            print("\n#  Starting Variant Calling\n", file=sys.stderr)

            # cull the alignment files
            alns, forward_mask = get_alignments_labels_and_mask(temp_dir_path + "*.tsv", args.nb_files)

            degenerate_positions = {
                'forward': list(range(it, reference_sequence_length, STEP)),
                'backward': list(range(it, reference_sequence_length, STEP)),
            }

            variant_call_file = temp_folder.add_file_path("variants.{cycle}.{iter}.calls".format(cycle=cycle, iter=it))

            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                #c = CallMethylation(**call_methyl_args)
                #c.write()
                work_queue.put(call_methyl_args)

            for w in range(workers):
                p = Process(target=run_methyl_caller, args=(work_queue, done_queue))
                p.start()
                jobs.append(p)
                work_queue.put('STOP')

            for p in jobs:
                p.join()

            done_queue.put('STOP')

            print("\n#  Finished Variant Calling\n", file=sys.stdout)
            print("\n#  Finished Variant Calling\n", file=sys.stderr)

            new_ref = update_reference(variant_call_file, reference_sequence, 0)

            ref_path = temp_folder.add_file_path("iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it))

            write_fasta("iteration.{cycle}.{iter}.fa".format(cycle=cycle, iter=it), new_ref, open(ref_path, 'w'))

            reference_sequence = ref_path

            # remove old alignments
            for f in glob.glob(temp_dir_path + "*.tsv"):
                os.remove(f)
        STEP -= 1
    return
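Each inner iteration above N's out every STEP-th reference position starting at offset it and then calls variants only at those positions; the positions handed to the caller are simply a strided range. A small worked example with illustrative numbers:

# Illustrative values: a 50 bp reference, STEP of 10, iteration offset 3.
reference_sequence_length = 50
STEP = 10
it = 3
degenerate_positions = {
    'forward': list(range(it, reference_sequence_length, STEP)),
    'backward': list(range(it, reference_sequence_length, STEP)),
}
print(degenerate_positions['forward'])  # [3, 13, 23, 33, 43]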
Example #21
0
class SignalAlignment(object):
    def __init__(self,
                 in_fast5,
                 reference_map,
                 destination,
                 stateMachineType,
                 bwa_index,
                 in_templateHmm,
                 in_complementHmm,
                 in_templateHdp,
                 in_complementHdp,
                 threshold,
                 diagonal_expansion,
                 constraint_trim,
                 degenerate,
                 twoD_chemistry,
                 target_regions=None,
                 output_format="full"):
        self.in_fast5 = in_fast5  # fast5 file to align
        self.reference_map = reference_map  # map with paths to reference sequences
        self.destination = destination  # place where the alignments go, should already exist
        self.stateMachineType = stateMachineType  # flag for signalMachine
        self.bwa_index = bwa_index  # index of reference sequence
        self.threshold = threshold  # min posterior probability to keep
        self.diagonal_expansion = diagonal_expansion  # alignment algorithm param
        self.constraint_trim = constraint_trim  # alignment algorithm param
        self.output_format = output_format  # smaller output files
        self.degenerate = degenerate  # set of nucleotides for degenerate characters
        self.twoD_chemistry = twoD_chemistry  # flag for 2D sequencing runs
        self.temp_folder = FolderHandler(
        )  # object for holding temporary files (non-toil)
        self.read_name = self.in_fast5.split(
            "/")[-1][:-6]  # get the name without the '.fast5'
        self.target_regions = target_regions
        self.output_formats = {"full": 0, "variantCaller": 1, "assignments": 2}

        if (in_templateHmm is not None) and os.path.isfile(in_templateHmm):
            self.in_templateHmm = in_templateHmm
        else:
            self.in_templateHmm = None
        if (in_complementHmm is not None) and os.path.isfile(in_complementHmm):
            self.in_complementHmm = in_complementHmm
        else:
            self.in_complementHmm = None

        # similarly for HDPs
        if (in_templateHdp is not None) and os.path.isfile(in_templateHdp):
            self.in_templateHdp = in_templateHdp
        else:
            self.in_templateHdp = None
        if (in_complementHdp is not None) and os.path.isfile(in_complementHdp):
            self.in_complementHdp = in_complementHdp
        else:
            self.in_complementHdp = None

    def run(self, get_expectations=False):
        print("[SignalAlignment.run]INFO: Starting on {read}".format(
            read=self.in_fast5),
              file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None,\
                "Need HMM files for model training"
        # file checks
        if os.path.isfile(self.in_fast5) is False:
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry)
        fH = open(npRead_, "w")
        ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        fH.close()
        if not ok:
            self.failStop(
                "[SignalAlignment.run]File: %s did not pass initial checks" %
                self.read_name, npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(
                nanopore_read=npRead, twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                               oned_read_path=read_fasta_)
            pop1_complement = None

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(
            bwa_index=self.bwa_index,
            query=read_fasta_,
            temp_sam_path=temp_samfile_,
            target_regions=self.target_regions)
        ok = guide_alignment.validate(self.reference_map.keys())
        if not ok:
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                          npRead)
            return False

        cig_handle = open(cigar_file_, "w")
        cig_handle.write(guide_alignment.cigar + "\n")
        cig_handle.close()

        # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
        posteriors_file_path = ''
        # forward strand
        if guide_alignment.strand == "+":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".forward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # backward strand
        if guide_alignment.strand == "-":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".backward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # flags

        # input (match) models
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                          version=version)
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.in_complementHmm = defaultModelFromVersion(
                    strand="complement",
                    version=version,
                    pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalALignment.run]NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm),
            file=sys.stderr)

        # reference sequences
        assert self.reference_map[
            guide_alignment.reference_name]["forward"] is not None
        assert self.reference_map[
            guide_alignment.reference_name]["backward"] is not None
        forward_reference = self.reference_map[
            guide_alignment.reference_name]["forward"]
        backward_reference = self.reference_map[
            guide_alignment.reference_name]["backward"]
        assert os.path.isfile(forward_reference)
        assert os.path.isfile(backward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=forward_reference)
        backward_ref_flag = "-b {b_ref} ".format(b_ref=backward_reference)

        # input HDPs
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats.keys():
            self.failStop(
                "[SignalAlignment.run]ERROR illegal output format selected %s"
                % self.output_format, npRead)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""
        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations}"\
                .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                        f_ref=forward_ref_flag, b_ref=backward_ref_flag, cigarFile=cigar_file_,
                        npRead=npRead_, readLabel=read_label, td=twoD_flag,
                        templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                        complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                        c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                        trim=trim_flag, degen=degenerate_flag, sparse=out_fmt)
        else:
            print("read_label", read_label)
            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel}"\
                .format(vA=path_to_signalAlign, model=stateMachineType_flag, sparse=out_fmt,
                        f_ref=forward_ref_flag, b_ref=backward_ref_flag, cigarFile=cigar_file_,
                        readLabel=read_label, npRead=npRead_, td=twoD_flag,
                        t_model=template_model_flag, c_model=complement_model_flag,
                        posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                        trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag)

        # run
        print("signalAlign - running command: ",
              command,
              end="\n",
              file=sys.stderr)
        os.system(command)
        self.temp_folder.remove_folder()
        return True

    def prepare_oned(self, nanopore_read, oned_read_path):
        try:
            read_file = open(oned_read_path, "w")
            fastaWrite(fileHandleOrFile=read_file,
                       name=nanopore_read.read_label,
                       seq=nanopore_read.template_read)
            version = nanopore_read.version
            read_file.close()
            nanopore_read.close()
            return True, version, False
        except Exception:
            return False, None, False

    def prepare_twod(self, nanopore_read, twod_read_path):
        # check for table to make 'assembled' 2D alignment table fasta with
        if nanopore_read.has2D_alignment_table is False:
            nanopore_read.close()
            return False, None, False
        fasta_handle = open(twod_read_path, "w")
        fastaWrite(fileHandleOrFile=fasta_handle,
                   name=nanopore_read.read_label,
                   seq=nanopore_read.alignment_table_sequence)
        if nanopore_read.complement_model_id == "complement_median68pA_pop1.model":
            pop1_complement = True
        else:
            pop1_complement = False
        version = nanopore_read.version
        fasta_handle.close()
        nanopore_read.close()
        return True, version, pop1_complement

    def openTempFolder(self, temp_dir):
        self.temp_folder.open_folder("%s%s" % (self.destination, temp_dir))

    def addTempFilePath(self, path_to_add):
        return self.temp_folder.add_file_path(path_to_add)

    def failStop(self, message, nanopore_read=None):
        self.temp_folder.remove_folder()
        if nanopore_read is not None:
            nanopore_read.close()
        print(message, file=sys.stderr)

    def __init__(
            self,
            in_fast5,
            destination,
            stateMachineType,
            in_templateHmm,
            in_complementHmm,
            in_templateHdp,
            in_complementHdp,
            threshold,
            diagonal_expansion,
            constraint_trim,
            degenerate,
            forward_reference,
            backward_reference=None,
            # one of these needs to be set
            alignment_file=None,
            bwa_reference=None,
            # reasonable defaults
            twoD_chemistry=False,
            target_regions=None,
            output_format="full",
            embed=False,
            event_table=False,
            check_for_temp_file_existance=True,
            track_memory_usage=False,
            get_expectations=False,
            path_to_bin=''):
        self.in_fast5 = in_fast5  # fast5 file to align
        self.destination = destination  # place where the alignments go, should already exist
        self.stateMachineType = stateMachineType  # flag for signalMachine
        self.bwa_reference = bwa_reference  # path to reference sequence to generate guide alignment
        self.threshold = threshold  # min posterior probability to keep
        self.diagonal_expansion = diagonal_expansion  # alignment algorithm param
        self.constraint_trim = constraint_trim  # alignment algorithm param
        self.output_format = output_format  # smaller output files
        self.degenerate = degenerate  # set of nucleotides for degenerate characters
        self.twoD_chemistry = twoD_chemistry  # flag for 2D sequencing runs
        self.temp_folder = FolderHandler(
        )  # object for holding temporary files (non-toil)
        self.read_name = self.in_fast5.split(
            "/")[-1][:-6]  # get the name without the '.fast5'
        self.target_regions = target_regions
        self.output_formats = {"full": 0, "variantCaller": 1, "assignments": 2}
        self.embed = embed  # embed the output into the fast5 file
        self.event_table = event_table  # specify which event table to use to generate alignments
        self.backward_reference = backward_reference  # fasta path to backward reference if modified bases are used
        self.forward_reference = forward_reference  # fasta path to forward reference
        self.alignment_file = alignment_file  # guide aligments will be gotten from here if set
        self.check_for_temp_file_existance = check_for_temp_file_existance  # don't recreate if files exist
        self.track_memory_usage = track_memory_usage  # has the 'time' program append mem usage stats to output
        self.max_memory_usage_kb = None
        self.read_label = None
        self.get_expectations = get_expectations  # option to gather expectations of transitions and emissions
        self.path_to_signalMachine = os.path.join(
            path_to_bin, "signalMachine")  # path to signalMachine

        assert os.path.exists(
            self.path_to_signalMachine), "Path to signalMachine does not exist"
        assert self.bwa_reference is not None or self.alignment_file is not None, \
            "either 'bwa_reference' or 'alignment_file' argument is needed to generate cigar strings"

        if (in_templateHmm is not None) and os.path.isfile(in_templateHmm):
            self.in_templateHmm = in_templateHmm
        else:
            self.in_templateHmm = None
        if (in_complementHmm is not None) and os.path.isfile(in_complementHmm):
            self.in_complementHmm = in_complementHmm
        else:
            self.in_complementHmm = None

        # similarly for HDPs
        if (in_templateHdp is not None) and os.path.isfile(in_templateHdp):
            self.in_templateHdp = in_templateHdp
        else:
            self.in_templateHdp = None
        if (in_complementHdp is not None) and os.path.isfile(in_complementHdp):
            self.in_complementHdp = in_complementHdp
        else:
            self.in_complementHdp = None
        assert os.path.exists(self.destination), \
            "Destination path does not exist: {}".format(self.destination)
Example #23
0
def trainModelTransitions(config):
    def process_sample(sample):
        options = dict(**DEFAULT_TRAINMODELS_OPTIONS)
        options.update(sample)
        if options["fast5_dir"] is None and options["fofn"] is None:
            raise RuntimeError(
                "Need to provide path to .fast5 files or file with filenames (fofn)"
            )
        reference_map = processReferenceFasta(
            fasta=config["reference"],
            work_folder=working_folder,
            motif_key=options["motif"],
            sub_char=options["label"],
            positions_file=options["positions_file"])
        if options["fast5_dir"] is not None:
            if options["fofn"] is not None:
                print(
                    "WARNING: Only using files in directory %s, ignoring fofn %s"
                    % (options["fast5_dir"], options["fofn"]))
            sample = Fast5Directory(options["fast5_dir"], reference_map)
        else:
            sample = FileOfFilenames(options["fofn"], reference_map)
        return sample

    # make directory to put the files we're using
    working_folder = FolderHandler()
    working_folder_path = working_folder.open_folder(config["output_dir"] +
                                                     "temp_trainModels")
    samples = [process_sample(s) for s in config["samples"]]

    if config["bwt"] is not None:
        print("[trainModels]Using provided BWT")
        bwa_ref_index = config["bwt"]
    else:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(config["reference"], working_folder_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

    template_model_path = config["in_T_Hmm"]
    complement_model_path = config["in_C_Hmm"]
    assert os.path.exists(template_model_path) and os.path.exists(complement_model_path), \
        "Missing input models %s and %s" % (template_model_path, complement_model_path)
    template_model = get_model(config["stateMachineType"], template_model_path)
    complement_model = get_model(
        config["stateMachineType"],
        complement_model_path) if config["twoD"] else None

    # get the input HDP, if we're using it
    if config["stateMachineType"] == "threeStateHdp":
        template_hdp = working_folder.add_file_path(
            "%s" % config["templateHdp"].split("/")[-1])
        copyfile(config["templateHdp"], template_hdp)
        if config["twoD"]:
            complement_hdp = working_folder.add_file_path(
                "%s" % config["complementHdp"].split("/")[-1])
            copyfile(config["complementHdp"], complement_hdp)
        else:
            complement_hdp = None
    else:
        template_hdp = None
        complement_hdp = None

    # make some paths to files to hold the HMMs
    template_hmm = working_folder.add_file_path("template_trained.hmm")
    complement_hmm = working_folder.add_file_path("complement_trained.hmm")
    trained_models = [template_hmm, complement_hmm]
    untrained_models = [template_model_path, complement_model_path]

    for default_model, trained_model in zip(untrained_models, trained_models):
        assert os.path.exists(
            default_model), "Didn't find default model {}".format(
                default_model)
        copyfile(default_model, trained_model)
        assert os.path.exists(
            trained_model), "Problem copying default model to {}".format(
                trained_model)

    # start iterating
    i = 0
    while i < config["iterations"]:
        # first cull a set of files to get expectations on
        training_files = cull_training_files(
            samples=samples,
            training_amount=config["training_bases"],
            twoD=config["twoD"])
        # setup
        workers = config["job_count"]
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()
        jobs = []

        # get expectations for all the files in the queue
        # file_ref_tuple should be (fast5, (plus_ref_seq, minus_ref_seq))
        for fast5, ref_map in training_files:
            alignment_args = {
                "reference_map": ref_map,
                "destination": working_folder_path,
                "stateMachineType": config["stateMachineType"],
                "bwa_index": bwa_ref_index,
                "in_templateHmm": template_hmm,
                "in_complementHmm": complement_hmm,
                "in_templateHdp": template_hdp,
                "in_complementHdp": complement_hdp,
                "in_fast5": fast5,
                "threshold": 0.01,
                "diagonal_expansion": config["diagonal_expansion"],
                "constraint_trim": config["constraint_trim"],
                "target_regions": None,
                "degenerate": None,
                "twoD_chemistry": config["twoD"],
            }
            if config["DEBUG"]:
                alignment = SignalAlignment(**alignment_args)
                alignment.run(get_expectations=True)
            else:
                work_queue.put(alignment_args)

        for w in range(workers):
            p = Process(target=get_expectations, args=(work_queue, done_queue))
            p.start()
            jobs.append(p)
            work_queue.put('STOP')

        for p in jobs:
            p.join()

        done_queue.put('STOP')

        # load then normalize the expectations
        template_expectations_files = [
            x for x in os.listdir(working_folder_path)
            if x.endswith(".template.expectations")
        ]

        complement_expectations_files = [
            x for x in os.listdir(working_folder_path)
            if x.endswith(".complement.expectations")
        ]

        if len(template_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=template_expectations_files,
                                      model=template_model,
                                      hmm_file=template_hmm,
                                      update_transitions=True)

        if config["twoD"] and len(complement_expectations_files) > 0:
            add_and_norm_expectations(path=working_folder_path,
                                      files=complement_expectations_files,
                                      model=complement_model,
                                      hmm_file=complement_hmm,
                                      update_transitions=True)

        # log the running likelihood
        if len(template_model.running_likelihoods) > 0 and \
                (config["twoD"] and len(complement_model.running_likelihoods) > 0):
            print("{i}| {t_likelihood}\t{c_likelihood}".format(
                t_likelihood=template_model.running_likelihoods[-1],
                c_likelihood=complement_model.running_likelihoods[-1],
                i=i))
            if config["TEST"] and (len(template_model.running_likelihoods) >= 2) and \
                    (config["twoD"] and len(complement_model.running_likelihoods) >= 2):
                print("TESTING")
                assert (template_model.running_likelihoods[-2] < template_model.running_likelihoods[-1]) and \
                       (complement_model.running_likelihoods[-2] < complement_model.running_likelihoods[-1]), \
                    "Testing: Likelihood error, likelihood decreased between iterations"
        i += 1

    # if we're using HDP, trim the final Hmm (remove assignments)

    print("trainModels - finished training routine", file=sys.stdout)
    print("trainModels - finished training routine", file=sys.stderr)
Example #24
0
def main(args):
    # parse args
    args = parse_args(args)

    command_line = " ".join(sys.argv[:])
    print("[singleNucleotideProbabilities] Command Line: {cmdLine}\n".format(
        cmdLine=command_line),
          file=sys.stderr)

    # first: see if we want to validate and return
    if args.validation_file is not None:
        if os.path.isfile(args.validation_file):
            validate_snp_file(args.validation_file,
                              args.ref,
                              print_sequences=True,
                              print_summary=True)
        elif os.path.isdir(args.validation_file):
            validate_snp_directory(args.validation_file,
                                   args.ref,
                                   print_summary=False,
                                   move_files=False,
                                   make_plots=True,
                                   alignment_file_location=args.alignment_file)
        else:
            print("[error] got invalid validation location: {}".format(
                args.validation_file))
        return 0

    # get absolute paths to inputs
    args.files_dir = resolvePath(args.files_dir)
    args.fast5_glob = resolvePath(args.fast5_glob)
    args.ref = resolvePath(args.ref)
    args.out = resolvePath(args.out)
    args.in_T_Hmm = resolvePath(args.in_T_Hmm)
    args.in_C_Hmm = resolvePath(args.in_C_Hmm)
    args.templateHDP = resolvePath(args.templateHDP)
    args.complementHDP = resolvePath(args.complementHDP)
    args.target_regions = resolvePath(args.target_regions)
    args.alignment_file = resolvePath(args.alignment_file)

    # assert integers
    args.step_size = int(args.step_size)
    args.kmer_size = int(args.kmer_size)

    # get input glob
    input_glob = args.fast5_glob if args.fast5_glob is not None else os.path.join(
        args.files_dir, "*.fast5")

    start_message = """
#   Single Nucleotide Probabilities
#
#   Aligning files matching: {inputGlob}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
#   Kmer size: {kmerSize}
#   Step size: {stepSize}
#   Alignment File: {alignmentFile}
    """.format(inputGlob=input_glob,
               reference=args.ref,
               banding=args.banded,
               nbFiles=args.nb_files,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP,
               kmerSize=args.kmer_size,
               stepSize=args.step_size,
               alignmentFile=args.alignment_file)
    print(start_message, file=sys.stdout)

    # prep
    if not os.path.isdir(args.out): os.mkdir(args.out)

    # get fast5 locations and prune
    fast5s = glob.glob(input_glob)
    if args.nb_files is not None and args.nb_files < len(fast5s):
        print(
            "[singleNucleotideProbabilities] pruning {} fast5 files down to configured max {}"
            .format(len(fast5s), args.nb_files))
        shuffle(fast5s)
        fast5s = fast5s[:args.nb_files]

    # get the (input) reference sequence
    if not os.path.isfile(args.ref):
        print(
            "[singleNucleotideProbabilities] Did not find valid reference file",
            file=sys.stderr)
        sys.exit(1)

    # make a working folder in the specified directory
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(
        os.path.join(args.out, "tempFiles_errorCorrection"))

    # alignment args are the parameters to the HMM/HDP model, and don't change
    alignment_args = {
        # "path_to_EC_refs": None,
        "destination": temp_dir_path,
        "stateMachineType": args.stateMachineType,
        "bwa_reference": args.ref,
        "in_templateHmm": args.in_T_Hmm,
        "in_complementHmm": args.in_C_Hmm,
        "in_templateHdp": args.templateHDP,
        "in_complementHdp": args.complementHDP,
        "threshold": args.threshold,
        "diagonal_expansion": args.diag_expansion,
        "constraint_trim": args.constraint_trim,
        "target_regions": None,
        "degenerate": getDegenerateEnum("variant"),
        "alignment_file": args.alignment_file,
        'track_memory_usage': False,
        'get_expectations': False
    }

    # get the sites that have proposed edits
    print(
        "\n\n[singleNucleotideProbabilities] scanning for proposals with %d fast5s"
        % len(fast5s))
    output_files = discover_single_nucleotide_probabilities(
        args,
        temp_folder,
        args.kmer_size,
        args.ref,
        fast5s,
        alignment_args,
        args.nb_jobs,
        args.step_size,
        output_directory=args.out)
    print("\n[singleNucleotideProbabilities] got {} output files:".format(
        len(output_files)))
    i = 0
    for output_file in output_files:
        print("\t{}".format(output_file))
        i += 1
        if i > 10 and len(output_files) > 10:
            print("\t...")
            break

    #validation
    if len(output_files) != 0:
        validate_snp_directory(os.path.dirname(output_files[0]),
                               args.ref,
                               print_summary=True)

    print("\n\n[singleNucleotideProbabilities] fin\n")

    return 0
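Because main re-parses its argument list through parse_args, the usual entry-point wrapper is all that is needed to run this script; the actual flag names are defined by parse_args and are not shown in this listing.

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))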
Example #25
0
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)

    start_message = """
#   Starting Jamison Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
#   Performing {cycles} cycles
    """.format(fileDir=args.files_dir,
               reference=args.ref,
               nbFiles=args.nb_files,
               banding=args.banded,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP,
               cycles=args.cycles)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out +
                                            "tempFiles_errorCorrection")

    # initialize to input fasta
    reference_sequence_path = args.ref

    # list of alignment files
    fast5s = cull_fast5_files(args.files_dir, args.nb_files)

    for cycle in range(0, args.cycles):
        # index the reference for bwa this is a string with the path to the index
        bwa_ref_index = get_bwa_index(reference_sequence_path, temp_dir_path)

        # unpack the reference sequence
        reference_sequence_string = get_first_sequence(reference_sequence_path)

        alignment_args = {
            "path_to_EC_refs": None,
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "banded": args.banded,
            "sparse_output": True,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "target_regions": None,
            "degenerate": degenerate_enum(args.degenerate),
        }

        proposals = scan_for_proposals(temp_folder, STEP,
                                       reference_sequence_string, fast5s,
                                       alignment_args, args.nb_jobs)

        proposals = group_sites_in_window(proposals, 6)

        print("Cycle {cycle} - Got {nb} sites to check: {sites}".format(
            nb=len(proposals), sites=proposals, cycle=cycle))

        updated_reference_string = update_reference_with_marginal_probs(
            temp_folder, proposals, reference_sequence_string, fast5s,
            alignment_args, args.nb_jobs)

        updated_reference_path = temp_folder.add_file_path(
            "cycle_snapshot.{cycle}.fa".format(cycle=cycle))

        write_fasta("jamison{}".format(cycle), updated_reference_string,
                    open(updated_reference_path, 'w'))

        reference_sequence_path = updated_reference_path

    # copy final file
    copyfile(reference_sequence_path, temp_dir_path + args.corrected)

    return
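group_sites_in_window itself is not shown here; conceptually it merges proposed sites that fall within a fixed-width window so each group can be re-evaluated together. A hypothetical stand-in with that behavior (not the package's implementation) could look like:

def group_sites_in_window_sketch(sites, window=6):
    """Hypothetical stand-in: bucket sorted site positions into runs no wider than window."""
    groups, current = [], []
    for site in sorted(sites):
        if current and site - current[0] >= window:
            groups.append(current)
            current = []
        current.append(site)
    if current:
        groups.append(current)
    return groups

print(group_sites_in_window_sketch([1, 2, 9, 10, 11, 30]))  # [[1, 2], [9, 10, 11], [30]]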
Example #26
0
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)

    start_message = """
#   Starting BonnyDoon Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir,
               reference=args.ref,
               nbFiles=args.nb_files,
               banding=args.banded,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)
    # cull the MinION files
    fast5s = cull_fast5_files(args.files_dir, args.nb_files)

    # get the (input) reference sequence
    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)
    reference_sequence_path = args.ref

    # unpack the reference sequence
    reference_sequence_string = get_first_sequence(reference_sequence_path)

    # make a working folder in the specified directory
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out +
                                            "tempFiles_errorCorrection")

    # index the reference for bwa; the result is a string with the path to the index
    bwa_ref_index = get_bwa_index(reference_sequence_path, temp_dir_path)

    # alignment args are the parameters to the HMM/HDP model, and don't change
    alignment_args = {
        "path_to_EC_refs": None,
        "destination": temp_dir_path,
        "stateMachineType": args.stateMachineType,
        "bwa_index": bwa_ref_index,
        "in_templateHmm": args.in_T_Hmm,
        "in_complementHmm": args.in_C_Hmm,
        "in_templateHdp": args.templateHDP,
        "in_complementHdp": args.complementHDP,
        "banded": args.banded,
        "sparse_output": True,
        "threshold": args.threshold,
        "diagonal_expansion": args.diag_expansion,
        "constraint_trim": args.constraint_trim,
        "target_regions": None,
        "degenerate": degenerate_enum(args.degenerate),
    }

    # get the sites that have proposed edits
    proposals = scan_for_proposals(temp_folder, STEP,
                                   reference_sequence_string, fast5s,
                                   alignment_args, args.nb_jobs)
    proposals = group_sites_in_window2([x[0] for x in proposals], 6)

    return
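# A purely illustrative guess at what a site-grouping helper such as
# group_sites_in_window2 might do: collapse proposal positions that fall
# within a fixed-size window into one group. This is an assumption about its
# behaviour, not the actual implementation.
def group_sites_in_window_sketch(positions, window=6):
    grouped, current = [], []
    for pos in sorted(positions):
        if not current or pos - current[0] < window:
            current.append(pos)
        else:
            grouped.append(current)
            current = [pos]
    if current:
        grouped.append(current)
    return grouped


assert group_sites_in_window_sketch([1, 3, 9, 10, 20], window=6) == \
    [[1, 3], [9, 10], [20]]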
class SignalAlignment(object):
    def __init__(
            self,
            in_fast5,
            destination,
            stateMachineType,
            in_templateHmm,
            in_complementHmm,
            in_templateHdp,
            in_complementHdp,
            threshold,
            diagonal_expansion,
            constraint_trim,
            degenerate,
            forward_reference,
            backward_reference=None,
            # one of these needs to be set
            alignment_file=None,
            bwa_reference=None,
            # reasonable defaults
            twoD_chemistry=False,
            target_regions=None,
            output_format="full",
            embed=False,
            event_table=False,
            check_for_temp_file_existance=True,
            track_memory_usage=False,
            get_expectations=False,
            path_to_bin=''):
        self.in_fast5 = in_fast5  # fast5 file to align
        self.destination = destination  # place where the alignments go, should already exist
        self.stateMachineType = stateMachineType  # flag for signalMachine
        self.bwa_reference = bwa_reference  # path to reference sequence to generate guide alignment
        self.threshold = threshold  # min posterior probability to keep
        self.diagonal_expansion = diagonal_expansion  # alignment algorithm param
        self.constraint_trim = constraint_trim  # alignment algorithm param
        self.output_format = output_format  # smaller output files
        self.degenerate = degenerate  # set of nucleotides for degenerate characters
        self.twoD_chemistry = twoD_chemistry  # flag for 2D sequencing runs
        self.temp_folder = FolderHandler()  # object for holding temporary files (non-toil)
        self.read_name = self.in_fast5.split(
            "/")[-1][:-6]  # get the name without the '.fast5'
        self.target_regions = target_regions
        self.output_formats = {"full": 0, "variantCaller": 1, "assignments": 2}
        self.embed = embed  # embed the output into the fast5 file
        self.event_table = event_table  # specify which event table to use to generate alignments
        self.backward_reference = backward_reference  # fasta path to backward reference if modified bases are used
        self.forward_reference = forward_reference  # fasta path to forward reference
        self.alignment_file = alignment_file  # guide alignments are read from this file, if set
        self.check_for_temp_file_existance = check_for_temp_file_existance  # don't recreate if files exist
        self.track_memory_usage = track_memory_usage  # has the 'time' program append mem usage stats to output
        self.max_memory_usage_kb = None
        self.read_label = None
        self.get_expectations = get_expectations  # option to gather expectations of transitions and emissions
        self.path_to_signalMachine = os.path.join(
            path_to_bin, "signalMachine")  # path to signalMachine

        assert os.path.exists(
            self.path_to_signalMachine), "Path to signalMachine does not exist"
        assert self.bwa_reference is not None or self.alignment_file is not None, \
            "either 'bwa_reference' or 'alignment_file' argument is needed to generate cigar strings"

        if (in_templateHmm is not None) and os.path.isfile(in_templateHmm):
            self.in_templateHmm = in_templateHmm
        else:
            self.in_templateHmm = None
        if (in_complementHmm is not None) and os.path.isfile(in_complementHmm):
            self.in_complementHmm = in_complementHmm
        else:
            self.in_complementHmm = None

        # similarly for HDPs
        if (in_templateHdp is not None) and os.path.isfile(in_templateHdp):
            self.in_templateHdp = in_templateHdp
        else:
            self.in_templateHdp = None
        if (in_complementHdp is not None) and os.path.isfile(in_complementHdp):
            self.in_complementHdp = in_complementHdp
        else:
            self.in_complementHdp = None
        assert os.path.exists(self.destination), \
            "Destination path does not exist: {}".format(self.destination)

    def run(self):
        print("[SignalAlignment.run] INFO: Starting on {read}".format(
            read=self.in_fast5))
        if self.get_expectations:
            assert self.in_templateHmm is not None, "Need template HMM files for model training"
            if self.twoD_chemistry:
                assert self.in_complementHmm is not None, "Need complement HMM files for model training"
        if not os.path.isfile(self.in_fast5):
            print("[SignalAlignment.run] ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        # prep
        self.openTempFolder("tempFiles_%s" % self.read_name)
        if self.twoD_chemistry:
            npRead = NanoporeRead2D(fast_five_file=self.in_fast5,
                                    event_table=self.event_table,
                                    initialize=True)
        else:
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  event_table=self.event_table,
                                  initialize=True)
        # TODO: need to validate / generate events and nucleotide read

        # read label
        read_label = npRead.read_label  # use this to identify the read throughout
        self.read_label = read_label

        # nanopore read (event table, etc)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        if not (self.check_for_temp_file_existance
                and os.path.isfile(npRead_)):
            # TODO: does this need special handling for RNA because of the 3'-5' mapping?
            fH = open(npRead_, "w")
            ok = npRead.Write(out_file=fH, initialize=True)
            fH.close()
            if not ok:
                self.failStop(
                    "[SignalAlignment.run] File: %s did not pass initial checks"
                    % self.read_name, npRead)
                return False

        # nucleotide read
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        ok = self.write_nucleotide_read(npRead, read_fasta_)
        if not ok:
            print(
                "[SignalAlignment.run] Failed to write nucleotide read.  Continuing execution."
            )

        # alignment info
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        strand = None
        reference_name = None
        if not (self.check_for_temp_file_existance
                and os.path.isfile(cigar_file_)):

            # need guide alignment to generate cigar file
            guide_alignment = None

            # get from alignment file
            if self.alignment_file is not None:
                guide_alignment = getGuideAlignmentFromAlignmentFile(
                    self.alignment_file, read_name=read_label)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} not found in {}".format(
                            read_label, self.alignment_file))

            # get from bwa
            if guide_alignment is None and self.bwa_reference is not None:
                guide_alignment = generateGuideAlignment(
                    reference_fasta=self.bwa_reference,
                    query=read_fasta_,
                    temp_sam_path=temp_samfile_,
                    target_regions=self.target_regions)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} could not be aligned with BWA"
                        .format(read_label))

            # could not map
            if guide_alignment is None:
                self.failStop(
                    "[SignalAlignment.run] ERROR getting guide alignment",
                    npRead)
                return False

            # ensure valid
            if not guide_alignment.validate():
                self.failStop(
                    "[SignalAlignment.run] ERROR invalid guide alignment",
                    npRead)
                return False
            strand = guide_alignment.strand
            reference_name = guide_alignment.reference_name

            # write cigar to file
            cig_handle = open(cigar_file_, "w")
            cig_handle.write(guide_alignment.cigar + "\n")
            cig_handle.close()

        # otherwise, get strand from file
        else:
            strand, reference_name = getInfoFromCigarFile(cigar_file_)

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp is not None) and \
                    (self.in_complementHdp is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
        # forward strand
        if strand == "+":
            if self.output_format == "full":
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".forward.tsv")
            elif self.output_format == "variantCaller":
                posteriors_file_path = os.path.join(
                    self.destination, read_label + model_label + ".tsv")
            else:
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".assignments.tsv")

        # backward strand
        elif strand == "-":
            if self.output_format == "full":
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".backward.tsv")
            elif self.output_format == "variantCaller":
                posteriors_file_path = os.path.join(
                    self.destination, read_label + model_label + ".tsv")
            else:
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".assignments.tsv")

        # sanity check
        else:
            self.failStop(
                "[SignalAlignment.run] ERROR Unexpected strand {}".format(
                    strand), npRead)
            return False

        # flags

        # input (match) models
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(
                strand="template", version=npRead.version)
        if self.twoD_chemistry and self.in_complementHmm is None:
            pop1_complement = npRead.complement_model_id == "complement_median68pA_pop1.model"
            self.in_complementHmm = defaultModelFromVersion(
                strand="complement",
                version=npRead.version,
                pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run] ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalAlignment.run] NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm))

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(
                b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats:
            self.failStop(
                "[SignalAlignment.run] ERROR illegal output format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        # twoD flag
        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""

        # commands
        if self.get_expectations:
            template_expectations_file_path = os.path.join(
                self.destination, read_label + ".template.expectations.tsv")
            complement_expectations_file_path = os.path.join(
                self.destination, read_label + ".complement.expectations.tsv")
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=self.path_to_signalMachine, model=stateMachineType_flag,
                            cigarFile=cigar_file_,
                            npRead=npRead_, readLabel=read_label, td=twoD_flag,
                            templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                            complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                            c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, degen=degenerate_flag, sparse=out_fmt, seq_name=reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
        else:
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=self.path_to_signalMachine, model=stateMachineType_flag, sparse=out_fmt,
                            cigarFile=cigar_file_,
                            readLabel=read_label, npRead=npRead_, td=twoD_flag,
                            t_model=template_model_flag, c_model=complement_model_flag,
                            posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag, seq_name=reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

        # run
        print("[SignalAlignment.run] running command: ", command, end="\n")
        try:
            command = command.split()

            if self.track_memory_usage:
                mem_command = ['/usr/bin/time', '-f', '\\nDEBUG_MAX_MEM:%M\\n']
                print(
                    "[SignalAlignment.run] Prepending command to track mem usage: {}"
                    .format(mem_command))
                mem_command.extend(command)
                command = mem_command

            output = subprocess.check_output(command, stderr=subprocess.STDOUT)
            output = str(output).split("\\n")
            for line in output:
                print("[SignalAlignment.run]    {}: {}".format(
                    read_label, line))
                if line.startswith("DEBUG_MAX_MEM"):
                    self.max_memory_usage_kb = int(line.split(":")[1])

        except Exception as e:
            print(
                "[SignalAlignment.run] exception ({}) running signalAlign: {}".
                format(type(e), e))
            raise e

        # save to fast5 file (if appropriate)
        if self.embed:
            print("[SignalAlignment.run] embedding into Fast5 ")

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, "There is no path in Fast5 file: {}".format(
                "/Analyses/SignalAlign_00{}")
            output_path = npRead._join_path(signal_align_path,
                                            self.output_format)
            npRead.write_data(data, output_path)

            # Todo add attributes to signalalign output
            if self.output_format == "full":
                print(
                    "[SignalAlignment.run] writing maximum expected accuracy (MEA) alignment"
                )
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path,
                                             "MEA_alignment_labels")
                events = npRead.get_template_events()
                if events:
                    if strand == "-":
                        minus = True
                    else:
                        minus = False
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=minus,
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                    sam_string = str()
                    if os.path.isfile(temp_samfile_):
                        with open(temp_samfile_, 'r') as test:
                            for line in test:
                                sam_string += line
                    sam_path = npRead._join_path(signal_align_path, "sam")
                    # print(sam_string)
                    npRead.write_data(data=sam_string,
                                      location=sam_path,
                                      compression=None)

        # self.temp_folder.remove_folder()
        return True

    def write_nucleotide_read(self, nanopore_read, file_path):
        try:
            with open(file_path, "w") as read_file:
                # get appropriate read
                if self.twoD_chemistry:
                    # check for table to make 'assembled' 2D alignment table fasta with
                    if not nanopore_read.has2D_alignment_table:
                        nanopore_read.close()
                        return False
                    nucleotide_read = nanopore_read.alignment_table_sequence
                else:
                    nucleotide_read = nanopore_read.template_read

                # write read
                fastaWrite(fileHandleOrFile=read_file,
                           name=nanopore_read.read_label,
                           seq=nucleotide_read)

            return True
        except Exception as e:
            print('[SignalAlignment.write_nucleotide_read] {} exception: {}'.
                  format(type(e), str(e)),
                  file=sys.stderr)
            return False

    def openTempFolder(self, temp_dir):
        self.temp_folder.open_folder(os.path.join(self.destination, temp_dir))

    def addTempFilePath(self, path_to_add):
        return self.temp_folder.add_file_path(path_to_add)

    def failStop(self, message, nanopore_read=None):
        self.temp_folder.remove_folder()
        if nanopore_read is not None:
            nanopore_read.close()
        print(message)

    def read_in_signal_align_tsv(self, tsv_path, file_type):
        """Read in tsv file"""
        assert file_type in ("full", "assignments", "variantCaller")
        with open(tsv_path, 'r') as tsvin:
            if file_type == "full":
                dtype = [('contig', 'S10'), ('reference_index', int),
                         ('reference_kmer', 'S5'), ('read_file', 'S57'),
                         ('strand', 'S1'), ('event_index', int),
                         ('event_mean', float), ('event_noise', float),
                         ('event_duration', float), ('aligned_kmer', 'S5'),
                         ('scaled_mean_current', float),
                         ('scaled_noise', float),
                         ('posterior_probability', float),
                         ('descaled_event_mean', float),
                         ('ont_model_mean', float), ('path_kmer', 'S5')]
            elif file_type == "assignments":
                dtype = [('k-mer', 'S10'), ('read_file', 'S57'),
                         ('descaled_event_mean', float),
                         ('posterior_probability', float)]

            else:
                dtype = [('event_index', int), ('reference_position', int),
                         ('base', 'S6'), ('posterior_probability', float),
                         ('strand', 'S1'), ('forward_mapped', int),
                         ('read_file', 'S57')]

            event_table = np.loadtxt(tsvin, dtype=dtype)

            def remove_field_name(a, name):
                names = list(a.dtype.names)
                if name in names:
                    names.remove(name)
                b = a[names]
                return b

            event_table = remove_field_name(event_table, "read_file")

        return event_table
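A small, self-contained illustration of the numpy structured-array loading that read_in_signal_align_tsv relies on. The toy two-column layout below is not the real signalAlign output format; it only demonstrates the same np.loadtxt-with-dtype mechanics.

import io

import numpy as np

toy_tsv = io.StringIO("10\t0.91\n11\t0.42\n")
dtype = [('reference_index', int), ('posterior_probability', float)]
table = np.loadtxt(toy_tsv, dtype=dtype)
print(table['reference_index'], table['posterior_probability'])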
Exemple #28
0
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line), file=sys.stderr)

    # get absolute paths to inputs
    args.files_dir           = resolvePath(args.files_dir)
    args.ref                 = resolvePath(args.ref)
    args.out                 = resolvePath(args.out)
    args.bwt                 = resolvePath(args.bwt)
    args.in_T_Hmm            = resolvePath(args.in_T_Hmm)
    args.in_C_Hmm            = resolvePath(args.in_C_Hmm)
    args.templateHDP         = resolvePath(args.templateHDP)
    args.complementHDP       = resolvePath(args.complementHDP)
    args.fofn                = resolvePath(args.fofn)
    args.target_regions      = resolvePath(args.target_regions)
    args.ambiguity_positions = resolvePath(args.ambiguity_positions)
    start_message = """
#   Starting Signal Align
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: True
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir, reference=args.ref, nbFiles=args.nb_files,
               inThmm=args.in_T_Hmm, inChmm=args.in_C_Hmm, model=args.stateMachineType, regions=args.target_regions,
               tHdp=args.templateHDP, cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if args.files_dir is None and args.fofn is None:
        print("Need to provide directory with .fast5 files of fofn", file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file, looked for it {here}".format(here=args.ref), file=sys.stderr)
        sys.exit(1)

    # make directory to put temporary files
    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out + "/tempFiles_alignment")
    forward_reference, backward_reference = processReferenceFasta(
        fasta=args.ref,
        motif_key=args.motif_key,
        work_folder=temp_folder,
        sub_char=args.ambig_char,
        positions_file=args.ambiguity_positions)

    # index the reference for bwa
    if args.bwt is not None:
        print("[RunSignalAlign]NOTICE - using provided BWT %s" % args.bwt)
        bwa_ref_index = args.bwt
    else:
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = getBwaIndex(args.ref, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

    # setup workers for multiprocessing
    workers = args.nb_jobs
    work_queue = Manager().Queue()
    done_queue = Manager().Queue()
    jobs = []

    # list of read files
    if args.fofn is not None:
        fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
    else:
        fast5s = ["/".join([args.files_dir, x]) for x in os.listdir(args.files_dir) if x.endswith(".fast5")]

    nb_files = args.nb_files
    if nb_files < len(fast5s):
        shuffle(fast5s)
        fast5s = fast5s[:nb_files]

    # change paths to the source directory
    os.chdir(signalAlignSourceDir())

    print("[runSignalAlign]:NOTICE: Got {} files to align".format(len(fast5s)), file=sys.stdout)
    for fast5 in fast5s:
        print(fast5)
        alignment_args = {
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_index": bwa_ref_index,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "output_format": args.outFmt,
            "in_fast5": fast5,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "degenerate": getDegenerateEnum(args.degenerate),
            "twoD_chemistry": args.twoD,
            "target_regions": args.target_regions,
            "embed": args.embed,
            "event_table": args.event_table,
            "backward_reference": backward_reference,
            "forward_reference": forward_reference
        }
        if args.DEBUG:
            alignment = SignalAlignment(**alignment_args)
            alignment.run()
        else:
            work_queue.put(alignment_args)

    for w in range(workers):
        p = Process(target=aligner, args=(work_queue, done_queue))
        p.start()
        jobs.append(p)
        work_queue.put('STOP')

    for p in jobs:
        p.join()

    done_queue.put('STOP')
    print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
    print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
Exemple #29
0
def main(args):
    # parse args
    start = timer()

    args = parse_args()
    if args.command == "run":
        if not os.path.exists(args.config):
            print("{config} not found".format(config=args.config))
            exit(1)
        # run training
        config_args = create_dot_dict(load_json(args.config))

        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(config_args.output_dir),
                         "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(config_args.output_dir)
        print(temp_dir_path)

        sa_args = [
            merge_dicts([
                s, {
                    "quality_threshold": config_args.filter_reads,
                    "workers": config_args.job_count
                }
            ]) for s in config_args.samples
        ]

        samples = [
            SignalAlignSample(working_folder=temp_folder, **s) for s in sa_args
        ]
        copyfile(args.config,
                 os.path.join(temp_dir_path, os.path.basename(args.config)))

        state_machine_type = "threeState"
        if config_args.template_hdp_model_path is not None:
            state_machine_type = "threeStateHdp"

        alignment_args = create_signalAlignment_args(
            destination=temp_dir_path,
            stateMachineType=state_machine_type,
            in_templateHmm=resolvePath(config_args.template_hmm_model),
            in_complementHmm=resolvePath(config_args.complement_hmm_model),
            in_templateHdp=resolvePath(config_args.template_hdp_model),
            in_complementHdp=resolvePath(config_args.complement_hdp_model),
            diagonal_expansion=config_args.diagonal_expansion,
            constraint_trim=config_args.constraint_trim,
            traceBackDiagonals=config_args.traceBackDiagonals,
            twoD_chemistry=config_args.two_d,
            get_expectations=False,
            path_to_bin=resolvePath(config_args.path_to_bin),
            check_for_temp_file_existance=True,
            threshold=config_args.signal_alignment_args.threshold,
            track_memory_usage=config_args.signal_alignment_args.track_memory_usage,
            embed=config_args.signal_alignment_args.embed,
            event_table=config_args.signal_alignment_args.event_table,
            output_format=config_args.signal_alignment_args.output_format,
            filter_reads=config_args.filter_reads,
            delete_tmp=config_args.signal_alignment_args.delete_tmp)

        multithread_signal_alignment_samples(samples,
                                             alignment_args,
                                             config_args.job_count,
                                             trim=None,
                                             debug=config_args.debug)

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)
        stop = timer()
    else:
        command_line = " ".join(sys.argv[:])
        print(os.getcwd())

        print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
              file=sys.stderr)
        # get absolute paths to inputs
        args.files_dir = resolvePath(args.files_dir)
        args.forward_reference = resolvePath(args.forward_ref)
        args.backward_reference = resolvePath(args.backward_ref)
        args.out = resolvePath(args.out)
        args.bwa_reference = resolvePath(args.bwa_reference)
        args.in_T_Hmm = resolvePath(args.in_T_Hmm)
        args.in_C_Hmm = resolvePath(args.in_C_Hmm)
        args.templateHDP = resolvePath(args.templateHDP)
        args.complementHDP = resolvePath(args.complementHDP)
        args.fofn = resolvePath(args.fofn)
        args.target_regions = resolvePath(args.target_regions)
        args.ambiguity_positions = resolvePath(args.ambiguity_positions)
        args.alignment_file = resolvePath(args.alignment_file)
        start_message = """
    #   Starting Signal Align
    #   Aligning files from: {fileDir}
    #   Aligning to reference: {reference}
    #   Aligning maximum of {nbFiles} files
    #   Using model: {model}
    #   Using banding: True
    #   Aligning to regions in: {regions}
    #   Non-default template HMM: {inThmm}
    #   Non-default complement HMM: {inChmm}
    #   Template HDP: {tHdp}
    #   Complement HDP: {cHdp}
        """.format(fileDir=args.files_dir,
                   reference=args.bwa_reference,
                   nbFiles=args.nb_files,
                   inThmm=args.in_T_Hmm,
                   inChmm=args.in_C_Hmm,
                   model=args.stateMachineType,
                   regions=args.target_regions,
                   tHdp=args.templateHDP,
                   cHdp=args.complementHDP)

        print(start_message, file=sys.stdout)

        if args.files_dir is None and args.fofn is None:
            print("Need to provide directory with .fast5 files of fofn",
                  file=sys.stderr)
            sys.exit(1)

        if not os.path.isfile(args.bwa_reference):
            print("Did not find valid reference file, looked for it {here}".
                  format(here=args.bwa_reference),
                  file=sys.stderr)
            sys.exit(1)

        # make directory to put temporary files
        if not os.path.isdir(args.out):
            print("Creating output directory: {}".format(args.out),
                  file=sys.stdout)
            os.mkdir(args.out)
        temp_folder = FolderHandler()
        temp_dir_path = temp_folder.open_folder(
            os.path.join(os.path.abspath(args.out), "tempFiles_alignment"))
        temp_dir_path = resolvePath(temp_dir_path)
        print(args.out)
        print(temp_dir_path)

        # generate reference sequence if not specified
        if not args.forward_reference or not args.backward_reference:
            args.forward_reference, args.backward_reference = processReferenceFasta(
                fasta=args.bwa_reference,
                work_folder=temp_folder,
                positions_file=args.ambiguity_positions,
                name="")

        # list of read files
        if args.fofn is not None:
            fast5s = [x for x in parseFofn(args.fofn) if x.endswith(".fast5")]
        else:
            fast5s = [
                "/".join([args.files_dir, x])
                for x in os.listdir(args.files_dir) if x.endswith(".fast5")
            ]

        nb_files = args.nb_files
        if nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:nb_files]

        # arguments passed to each SignalAlignment
        alignment_args = {
            "destination": temp_dir_path,
            "stateMachineType": args.stateMachineType,
            "bwa_reference": args.bwa_reference,
            "in_templateHmm": args.in_T_Hmm,
            "in_complementHmm": args.in_C_Hmm,
            "in_templateHdp": args.templateHDP,
            "in_complementHdp": args.complementHDP,
            "output_format": args.outFmt,
            "threshold": args.threshold,
            "diagonal_expansion": args.diag_expansion,
            "constraint_trim": args.constraint_trim,
            "degenerate": getDegenerateEnum(args.degenerate),
            "twoD_chemistry": args.twoD,
            "target_regions": args.target_regions,
            "embed": args.embed,
            "event_table": args.event_table,
            "backward_reference": args.backward_reference,
            "forward_reference": args.forward_reference,
            "alignment_file": args.alignment_file,
            "check_for_temp_file_existance": True,
            "track_memory_usage": False,
            "get_expectations": False,
            "perform_kmer_event_alignment": args.perform_kmer_event_alignment,
            "enforce_supported_versions": args.enforce_supported_versions,
            "filter_reads": 7 if args.filter_reads else None,
            "path_to_bin": args.path_to_bin,
            "delete_tmp": args.delete_tmp
        }
        filter_read_generator = None
        if args.filter_reads is not None and args.alignment_file and args.readdb and args.files_dir:
            print("[runSignalAlign]:NOTICE: Filtering out low quality reads",
                  file=sys.stdout)

            filter_read_generator = filter_reads_to_string_wrapper(
                filter_reads(args.alignment_file,
                             args.readdb, [args.files_dir],
                             quality_threshold=7,
                             recursive=args.recursive))

        print("[runSignalAlign]:NOTICE: Got {} files to align".format(
            len(fast5s)),
              file=sys.stdout)
        # setup workers for multiprocessing
        multithread_signal_alignment(
            alignment_args,
            fast5s,
            args.nb_jobs,
            debug=args.DEBUG,
            filter_reads_to_string_wrapper=filter_read_generator)
        stop = timer()

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

    print("[signalAlign] Complete")
    print("Running Time = {} seconds".format(stop - start))
Exemple #30
0
def main(args):
    # parse args
    args = parse_args()

    command_line = " ".join(sys.argv[:])
    print("Command Line: {cmdLine}\n".format(cmdLine=command_line),
          file=sys.stderr)

    start_message = """
#   Starting Empire Error-Correction
#   Aligning files from: {fileDir}
#   Aligning to reference: {reference}
#   Aligning maximum of {nbFiles} files
#   Using model: {model}
#   Using banding: {banding}
#   Aligning to regions in: {regions}
#   Non-default template HMM: {inThmm}
#   Non-default complement HMM: {inChmm}
#   Template HDP: {tHdp}
#   Complement HDP: {cHdp}
    """.format(fileDir=args.files_dir,
               reference=args.ref,
               nbFiles=args.nb_files,
               banding=args.banded,
               inThmm=args.in_T_Hmm,
               inChmm=args.in_C_Hmm,
               model=args.stateMachineType,
               regions=args.target_regions,
               tHdp=args.templateHDP,
               cHdp=args.complementHDP)

    print(start_message, file=sys.stdout)

    if not os.path.isfile(args.ref):
        print("Did not find valid reference file", file=sys.stderr)
        sys.exit(1)

    temp_folder = FolderHandler()
    temp_dir_path = temp_folder.open_folder(args.out +
                                            "tempFiles_errorCorrection")

    reference_sequence = args.ref

    for cycle in range(0, args.cycles):
        check, reference_sequence_length = write_degenerate_reference_set(
            input_fasta=reference_sequence, out_path=temp_dir_path, step=STEP)
        assert check, "Problem making degenerate reference sequence set"

        # index the reference for bwa
        print("signalAlign - indexing reference", file=sys.stderr)
        bwa_ref_index = get_bwa_index(reference_sequence, temp_dir_path)
        print("signalAlign - indexing reference, done", file=sys.stderr)

        # setup workers for multiprocessing
        workers = args.nb_jobs
        work_queue = Manager().Queue()
        done_queue = Manager().Queue()
        jobs = []

        # list of fast5 files to align
        fast5s = [
            x for x in os.listdir(args.files_dir) if x.endswith(".fast5")
        ]

        # take only some
        if args.nb_files < len(fast5s):
            shuffle(fast5s)
            fast5s = fast5s[:args.nb_files]

        for fast5 in fast5s:
            alignment_args = {
                "forward_reference": None,
                "backward_reference": None,
                "path_to_EC_refs": temp_dir_path,
                "destination": temp_dir_path,
                "stateMachineType": args.stateMachineType,
                "bwa_index": bwa_ref_index,
                "in_templateHmm": args.in_T_Hmm,
                "in_complementHmm": args.in_C_Hmm,
                "in_templateHdp": args.templateHDP,
                "in_complementHdp": args.complementHDP,
                "banded": args.banded,
                "sparse_output": True,
                "in_fast5": args.files_dir + fast5,
                "threshold": args.threshold,
                "diagonal_expansion": args.diag_expansion,
                "constraint_trim": args.constraint_trim,
                "target_regions": None,
                "degenerate": degenerate_enum(args.degenerate),
            }
            #alignment = SignalAlignment(**alignment_args)
            #alignment.run()
            work_queue.put(alignment_args)

        for w in range(workers):
            p = Process(target=aligner, args=(work_queue, done_queue))
            p.start()
            jobs.append(p)
            work_queue.put('STOP')

        for p in jobs:
            p.join()

        done_queue.put('STOP')

        print("\n#  signalAlign - finished alignments\n", file=sys.stderr)
        print("\n#  signalAlign - finished alignments\n", file=sys.stdout)

        # working_sequence is a string holding the reference we're going to update this cycle
        working_sequence = get_first_sequence(reference_sequence)

        # register is the relative position that is being N-ed:
        # ACGTAGACAATA --> NCGTAGNCAATA = register 0
        # ACGTAGACAATA --> ANGTAGANAATA = register 1 ...
        # (a standalone sketch of this masking follows this example)
        for register in range(0, STEP):
            print("#  Starting Variant Calling, register: {}...".format(
                register),
                  file=sys.stdout,
                  end='\n')
            print("#  Starting Variant Calling, register: {}...".format(
                register),
                  file=sys.stderr,
                  end='')
            # cull the alignment files for this register
            alns, forward_mask = get_alignments_labels_and_mask(
                path_to_alignments=temp_dir_path + "*.tsv.{}".format(register),
                max=args.nb_files,
                suffix=".{}".format(register))
            # this is the list of positions that we're going to look at, based on this register
            degenerate_positions = {
                'forward':
                list(range(register, reference_sequence_length, STEP)),
                'backward':
                list(range(register, reference_sequence_length, STEP))
            }

            # place to put the marginal probs
            variant_call_file = temp_folder.add_file_path(
                "variants.{cycle}.{reg}.calls".format(cycle=cycle,
                                                      reg=register))
            # arguments for multiprocessing
            for aln, forward_bool in zip(alns, forward_mask):
                call_methyl_args = {
                    "sequence": None,
                    "alignment_file": aln,
                    "forward": forward_bool,
                    "out_file": variant_call_file,
                    "positions": degenerate_positions,
                    "degenerate_type": degenerate_enum(args.degenerate),
                }
                #c = CallMethylation(**call_methyl_args)
                #c.write()
                work_queue.put(call_methyl_args)

            for w in range(workers):
                p = Process(target=run_methyl_caller,
                            args=(work_queue, done_queue))
                p.start()
                jobs.append(p)
                work_queue.put('STOP')

            for p in jobs:
                p.join()

            done_queue.put('STOP')

            # this is where the per-register update happens
            working_sequence = update_reference(variant_call_file,
                                                working_sequence,
                                                register,
                                                min_depth=0,
                                                get_sites=False)

            # remove alignments for this register
            for f in glob.glob(temp_dir_path + "*.tsv.{}".format(register)):
                os.remove(f)
            print("done", file=sys.stdout, end="\n")
            print("done", file=sys.stderr, end="\n")

        # add a file for this cycle
        ref_path = temp_folder.add_file_path(
            "iteration.{cycle}.fa".format(cycle=cycle))
        # write it to a file
        write_fasta("iteration.{cycle}.fa".format(cycle=cycle),
                    working_sequence, open(ref_path, 'w'))
        # update the path to the reference for the next cycle
        reference_sequence = ref_path
    return
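A standalone illustration of the register masking described in the comments above: every STEP-th base, starting at offset register, is replaced by N. This mirrors the degenerate-reference idea behind write_degenerate_reference_set, which may differ in detail.

def mask_register(sequence, register, step):
    # replace every step-th base, starting at `register`, with N
    masked = list(sequence)
    for i in range(register, len(masked), step):
        masked[i] = "N"
    return "".join(masked)


assert mask_register("ACGTAGACAATA", register=0, step=6) == "NCGTAGNCAATA"
assert mask_register("ACGTAGACAATA", register=1, step=6) == "ANGTAGANAATA"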