Ejemplo n.º 1
0
    def run(self):
        """Run signalMachine on this read and optionally embed the result in the fast5.

        The method prepares the temporary inputs (npRead file, nucleotide
        fasta, cigar file derived from a guide alignment), assembles the
        signalMachine command line from the instance settings, executes it and,
        when ``self.embed`` is set, writes the resulting table back into the
        fast5 file.

        :return: ``False`` when the fast5 file is missing, the read does not
            pass its initial checks, no valid guide alignment can be obtained,
            the strand is neither "+" nor "-", the complement HMM is missing
            for a 2D read, or the output format is unknown; ``True`` otherwise.
        :raises AssertionError: if required HMM/HDP models are not set or a
            reference file does not exist.
        :raises Exception: anything raised while executing signalMachine is
            logged and re-raised unchanged (``subprocess.check_output`` raises
            ``CalledProcessError`` on a non-zero exit status).
        """
        print("[SignalAlignment.run] INFO: Starting on {read}".format(
            read=self.in_fast5))
        if self.get_expectations:
            assert self.in_templateHmm is not None, "Need template HMM files for model training"
            if self.twoD_chemistry:
                assert self.in_complementHmm is not None, "Need compement HMM files for model training"
        if not os.path.isfile(self.in_fast5):
            print("[SignalAlignment.run] ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        # prep: open the temp folder and load the read with the class matching the chemistry
        self.openTempFolder("tempFiles_%s" % self.read_name)
        read_class = NanoporeRead2D if self.twoD_chemistry else NanoporeRead
        npRead = read_class(fast_five_file=self.in_fast5,
                            event_table=self.event_table,
                            initialize=True)
        # TODO: need to validate / generate events and nucleotide read

        # read label: used to identify the read throughout
        read_label = npRead.read_label
        self.read_label = read_label

        # nanopore read (event table, etc); an existing temp file is reused
        # only when check_for_temp_file_existance is set
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        if not (self.check_for_temp_file_existance
                and os.path.isfile(npRead_)):
            # TODO: check whether this is broken for RNA because of 3'-5' mapping
            # the context manager closes the handle even if Write raises
            with open(npRead_, "w") as fH:
                ok = npRead.Write(out_file=fH, initialize=True)
            if not ok:
                self.failStop(
                    "[SignalAlignment.run] File: %s did not pass initial checks"
                    % self.read_name, npRead)
                return False

        # nucleotide read; a failure here is reported but is not fatal
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        ok = self.write_nucleotide_read(npRead, read_fasta_)
        if not ok:
            print(
                "[SignalAlignment.run] Failed to write nucleotide read.  Continuing execution."
            )

        # alignment info
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        strand = None
        reference_name = None
        if not (self.check_for_temp_file_existance
                and os.path.isfile(cigar_file_)):
            # need a guide alignment to generate the cigar file
            guide_alignment = None

            # first choice: look the read up in the supplied alignment file
            if self.alignment_file is not None:
                guide_alignment = getGuideAlignmentFromAlignmentFile(
                    self.alignment_file, read_name=read_label)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} not found in {}".format(
                            read_label, self.alignment_file))

            # fallback: align with BWA
            if guide_alignment is None and self.bwa_reference is not None:
                guide_alignment = generateGuideAlignment(
                    reference_fasta=self.bwa_reference,
                    query=read_fasta_,
                    temp_sam_path=temp_samfile_,
                    target_regions=self.target_regions)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} could not be aligned with BWA"
                        .format(read_label))

            # could not map
            if guide_alignment is None:
                self.failStop(
                    "[SignalAlignment.run] ERROR getting guide alignment",
                    npRead)
                return False

            # ensure valid
            if not guide_alignment.validate():
                self.failStop(
                    "[SignalAlignment.run] ERROR invalid guide alignment",
                    npRead)
                return False
            strand = guide_alignment.strand
            reference_name = guide_alignment.reference_name

            # write cigar to file
            with open(cigar_file_, "w") as cig_handle:
                cig_handle.write(guide_alignment.cigar + "\n")
        else:
            # reuse the existing cigar file and recover strand/reference from it
            strand, reference_name = getInfoFromCigarFile(cigar_file_)

        # add an indicator for the model being used
        if self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:
            # "threeState" and any unrecognised state machine type share these defaults
            model_label = ".sm"
            stateMachineType_flag = ""

        # sanity check on the strand before it is used to name the output
        if strand not in ("+", "-"):
            self.failStop(
                "[SignalAlignment.run] ERROR Unexpected strand {}".format(
                    strand), npRead)
            return False

        # output file name format: /directory/for/files/file.model.orientation.tsv
        # only the "full" format encodes the orientation in the name
        if self.output_format == "full":
            output_suffix = ".forward.tsv" if strand == "+" else ".backward.tsv"
        elif self.output_format == "variantCaller":
            output_suffix = ".tsv"
        else:
            output_suffix = ".assignments.tsv"
        posteriors_file_path = os.path.join(
            self.destination, read_label + model_label + output_suffix)

        # flags

        # input (match) models: fall back to version-specific defaults
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(
                strand="template", version=npRead.version)
        if self.twoD_chemistry and self.in_complementHmm is None:
            pop1_complement = npRead.complement_model_id == "complement_median68pA_pop1.model"
            self.in_complementHmm = defaultModelFromVersion(
                strand="complement",
                version=npRead.version,
                pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry and self.in_complementHmm is None:
            self.failStop(
                "[SignalAlignment.run] ERROR Need to have complement HMM for 2D analysis",
                npRead)
            return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalAlignment.run] NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm))

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(
                b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs: each flag is emitted only when its own file is set, so a
        # missing template HDP never ends up on the command line as "-v None"
        hdp_flags = ""
        if self.in_templateHdp is not None:
            hdp_flags += "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
        if self.twoD_chemistry and self.in_complementHdp is not None:
            hdp_flags += "-w {cHdp_loc} ".format(
                cHdp_loc=self.in_complementHdp)

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats:
            self.failStop(
                "[SignalAlignment.run] ERROR illegal output format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        # twoD flag
        twoD_flag = "--twoD" if self.twoD_chemistry else ""

        # fields shared by both command templates
        shared_fields = dict(
            vA=self.path_to_signalMachine,
            td=twoD_flag,
            degen=degenerate_flag,
            sparse=out_fmt,
            model=stateMachineType_flag,
            npRead=npRead_,
            t_model=template_model_flag,
            c_model=complement_model_flag,
            thresh=threshold_flag,
            expansion=diag_expansion_flag,
            trim=trim_flag,
            hdp=hdp_flags,
            readLabel=read_label,
            cigarFile=cigar_file_,
            seq_name=reference_name,
            f_ref_fa=forward_ref_flag,
            b_ref_fa=backward_ref_flag)

        # commands
        if self.get_expectations:
            template_expectations_file_path = os.path.join(
                self.destination, read_label + ".template.expectations.tsv")
            complement_expectations_file_path = os.path.join(
                self.destination, read_label + ".complement.expectations.tsv")
            command = (
                "{vA} {td} {degen}{sparse}{model} -q {npRead} "
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} "
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}"
            ).format(templateExpectations=template_expectations_file_path,
                     complementExpectations=complement_expectations_file_path,
                     **shared_fields)
        else:
            command = (
                "{vA} {td} {degen}{sparse}{model} -q {npRead} "
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} "
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}"
            ).format(posteriors=posteriors_file_path, **shared_fields)

        # run
        print("[SignalAlignment.run] running command: ", command, end="\n")
        try:
            # NOTE: whitespace splitting means paths containing spaces are not supported
            command = command.split()

            if self.track_memory_usage:
                mem_command = ['/usr/bin/time', '-f', '\\nDEBUG_MAX_MEM:%M\\n']
                print(
                    "[SignalAlignment.run] Prepending command to track mem usage: {}"
                    .format(mem_command))
                mem_command.extend(command)
                command = mem_command

            output = subprocess.check_output(command, stderr=subprocess.STDOUT)
            # decode the captured bytes instead of parsing their repr(), so the
            # logged lines carry no b'...' wrapper or escaped characters
            if isinstance(output, bytes):
                output = output.decode("utf-8", errors="replace")
            for line in output.splitlines():
                print("[SignalAlignment.run]    {}: {}".format(
                    read_label, line))
                if line.startswith("DEBUG_MAX_MEM"):
                    self.max_memory_usage_kb = int(line.split(":")[1])

        except Exception as e:
            print(
                "[SignalAlignment.run] exception ({}) running signalAlign: {}".
                format(type(e), e))
            raise

        # save to fast5 file (if appropriate)
        if self.embed:
            print("[SignalAlignment.run] embedding into Fast5 ")

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, "There is no path in Fast5 file: {}".format(
                "/Analyses/SignalAlign_00{}")
            output_path = npRead._join_path(signal_align_path,
                                            self.output_format)
            npRead.write_data(data, output_path)

            # TODO: add attributes to signalalign output
            if self.output_format == "full":
                print(
                    "[SignalAlignment.run] writing maximum expected alignment "
                )
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path,
                                             "MEA_alignment_labels")
                events = npRead.get_template_events()
                if events:
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=(strand == "-"),
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                    # the SAM file only exists when BWA produced the guide alignment
                    sam_string = str()
                    if os.path.isfile(temp_samfile_):
                        with open(temp_samfile_, 'r') as sam_handle:
                            sam_string = sam_handle.read()
                    sam_path = npRead._join_path(signal_align_path, "sam")
                    npRead.write_data(data=sam_string,
                                      location=sam_path,
                                      compression=None)

        # the temp folder is intentionally left in place (removal is disabled)
        return True
Ejemplo n.º 2
0
    def run(self, get_expectations=False):
        """Run ./signalMachine on this read and optionally embed the result in the fast5.

        The method writes the temporary inputs (npRead file, nucleotide fasta,
        cigar file from a BWA guide alignment), assembles the signalMachine
        command line from the instance settings, executes it through the shell
        and, when ``self.embed`` is set, writes the resulting table back into
        the fast5 file.

        :param get_expectations: when true, build the expectations command
            (``-t``/``-c`` outputs) instead of the posteriors command (``-u``);
            both HMM files must then already be set.
        :return: ``False`` when the fast5 file is missing, the read does not
            pass its initial checks, no valid guide alignment is obtained, the
            strand is neither "+" nor "-", the complement HMM is missing for a
            2D read, or the output format is unknown; ``True`` otherwise.
        :raises AssertionError: if required HMM/HDP models are not set or a
            reference file does not exist.
        """
        print("[SignalAlignment.run]INFO: Starting on {read}".format(
            read=self.in_fast5),
              file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None, \
                "Need HMM files for model training"
        # file checks
        if not os.path.isfile(self.in_fast5):
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        # TODO: check whether this is broken for RNA because of 3'-5' mapping
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry,
                              event_table=self.event_table)
        # the context manager closes the handle even if Write raises
        with open(npRead_, "w") as fH:
            ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        if not ok:
            self.failStop(
                "[SignalAlignment.run]File: %s did not pass initial checks" %
                self.read_name, npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(
                nanopore_read=npRead, twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                               oned_read_path=read_fasta_)
            pop1_complement = None
        if not ok:
            # previously this status was silently discarded; report it, then
            # let the guide-alignment step decide whether the read is usable
            print(
                "[SignalAlignment.run]WARNING: failed to prepare nucleotide read for {}. "
                "Continuing execution.".format(read_label),
                file=sys.stderr)

        # add an indicator for the model being used
        if self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:
            # "threeState" and any unrecognised state machine type share these defaults
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(
            bwa_index=self.bwa_index,
            query=read_fasta_,
            temp_sam_path=temp_samfile_,
            target_regions=self.target_regions)

        # a missing alignment is treated like an invalid one instead of
        # crashing with AttributeError on None
        if guide_alignment is None or not guide_alignment.validate():
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                          npRead)
            return False

        with open(cigar_file_, "w") as cig_handle:
            cig_handle.write(guide_alignment.cigar + "\n")

        # an unexpected strand would otherwise leave the output path empty and
        # produce a malformed "-u" argument
        strand = guide_alignment.strand
        if strand not in ("+", "-"):
            self.failStop(
                "[SignalAlignment.run]ERROR Unexpected strand {}".format(
                    strand), npRead)
            return False

        # output file name format: /directory/for/files/file.model.orientation.tsv
        # only the "full" format encodes the orientation in the name;
        # self.destination is concatenated as-is (no separator is inserted)
        if self.output_format == "full":
            output_suffix = ".forward.tsv" if strand == "+" else ".backward.tsv"
        elif self.output_format == "variantCaller":
            output_suffix = ".tsv"
        else:
            output_suffix = ".assignments"
        posteriors_file_path = self.destination + read_label + model_label + output_suffix

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # flags

        # input (match) models: fall back to version-specific defaults
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                          version=version)
        if self.twoD_chemistry and self.in_complementHmm is None:
            self.in_complementHmm = defaultModelFromVersion(
                strand="complement",
                version=version,
                pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry and self.in_complementHmm is None:
            self.failStop(
                "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                npRead)
            return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalALignment.run]NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm),
            file=sys.stderr)

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(
                b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats:
            self.failStop(
                "[SignalAlignment.run]ERROR illegal output format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        twoD_flag = "--twoD" if self.twoD_chemistry else ""

        # fields shared by both command templates
        shared_fields = dict(
            vA=path_to_signalAlign,
            td=twoD_flag,
            degen=degenerate_flag,
            sparse=out_fmt,
            model=stateMachineType_flag,
            npRead=npRead_,
            t_model=template_model_flag,
            c_model=complement_model_flag,
            thresh=threshold_flag,
            expansion=diag_expansion_flag,
            trim=trim_flag,
            hdp=hdp_flags,
            readLabel=read_label,
            cigarFile=cigar_file_,
            seq_name=guide_alignment.reference_name,
            f_ref_fa=forward_ref_flag,
            b_ref_fa=backward_ref_flag)

        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"
            command = (
                "{vA} {td} {degen}{sparse}{model} -q {npRead} "
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} "
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}"
            ).format(templateExpectations=template_expectations_file_path,
                     complementExpectations=complement_expectations_file_path,
                     **shared_fields)
        else:
            command = (
                "{vA} {td} {degen}{sparse}{model} -q {npRead} "
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} "
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}"
            ).format(posteriors=posteriors_file_path, **shared_fields)

        # run
        print("signalAlign - running command: ",
              command,
              end="\n",
              file=sys.stderr)
        # NOTE(review): the command string is handed to the shell unquoted, so
        # paths with spaces or shell metacharacters will break it
        exit_status = os.system(command)
        if exit_status != 0:
            # surface a failed run instead of discarding the status
            print(
                "[SignalAlignment.run]ERROR: signalMachine exited with status {}"
                .format(exit_status),
                file=sys.stderr)

        if self.embed:
            print("signalAlign - embedding into Fast5 ", file=sys.stderr)

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, "There is no path in Fast5 file: {}".format(
                "/Analyses/SignalAlign_00{}")
            output_path = npRead._join_path(signal_align_path,
                                            self.output_format)
            npRead.write_data(data, output_path)

            # TODO: add attributes to signalalign output
            if self.output_format == "full":
                print("signalAlign - writing maximum expected alignment ",
                      file=sys.stderr)
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path,
                                             "MEA_alignment_labels")
                events = npRead.get_template_events()
                if events:
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=(strand == "-"),
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                    with open(temp_samfile_, 'r') as sam_handle:
                        sam_string = sam_handle.read()
                    sam_path = npRead._join_path(signal_align_path, "sam")
                    npRead.write_data(data=sam_string,
                                      location=sam_path,
                                      compression=None)

        # the temp folder is intentionally left in place (removal is disabled)
        return True