Python NanoporeRead Examples

Programming Language: Python

Namespace/Package Name: signalalign.nanoporeRead

Class/Type: NanoporeRead

Examples at hotexamples.com: 16

Python NanoporeRead - 16 examples found. These are the top-rated real-world Python examples of signalalign.nanoporeRead.NanoporeRead, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

NanoporeRead(12)

_initialize_metadata(5)

Initialize(4)

Write(4)

bytes_to_string(4)

close(4)

_join_path(2)

get_latest_basecall_edition(2)

get_template_events(2)

is_read_rna(2)

write_data(2)

Example #1

Show file

File: ledger.py Project: wgosal/signalAlign

 def makeNanoporeRead(f5_path):
     """Serialize one fast5 read, gzip it, and deliver it to the readstore.

     Relies on ``job``, ``workdir`` and ``readstore_dir`` being available from
     the enclosing scope (they are not parameters of this function).

     :param f5_path: path to the fast5 file to load
     :return: ``(read_label, ledger_line)`` on success, or ``None`` when the
              read fails to initialize, fails to write, or fails to upload
     """
     # here we load the NanoporeRead and write it to a file
     read = NanoporeRead(fast_five_file=f5_path, twoD=False)  # make this a config arg
     if not read.Initialize(job):
         return None
     _l = read.read_label
     tF = job.fileStore.getLocalTempFile()
     # context manager guarantees the temp file is closed even if Write raises
     with open(tF, "w") as fH:
         ok = read.Write(job, fH, initialize=False)
     if not ok:
         return None
     # then we gzip it and deliver it to the readstore and return the ledger line
     fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l)
     with open(tF, "rb") as fH, gzip.open(fn.fullpathGetter(), "wb") as gz:
         shutil.copyfileobj(fH, gz)
     try:
         deliverOutput(job, fn, readstore_dir)
     except RuntimeError:
         job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % _l)
         return None
     return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter()))

Example #2

Show file

File: variantCaller.py Project: wgosal/signalAlign

    def __init__(self, full_data, variants, read_name, forward_mapped):
        """Set up per-read marginalization over posterior probabilities.

        :param full_data: table of full signalAlign output; it is indexed by
                          column name here ("reference_kmer", "contig"), so it
                          must already be loaded rather than be a path
        :param variants: bases to track probabilities for (stored sorted)
        :param read_name: identifier of the read
        :param forward_mapped: stored as-is on the instance

        Column layout documented by the original author for the full table:
            ['contig', 'reference_index', 'reference_kmer', 'read_file',
             'strand', 'event_index', 'event_mean', 'event_noise',
             'event_duration', 'aligned_kmer', 'scaled_mean_current',
             'scaled_noise', 'posterior_probability', 'descaled_event_mean',
             'ont_model_mean', 'path_kmer']
        """
        self.read_name = read_name
        self.full_data = full_data

        # keep only the rows whose reference kmer contains the "X" marker
        contains_marker = ["X" in kmer for kmer in full_data["reference_kmer"]]
        self.variant_data = full_data[contains_marker]

        ordered_variants = sorted(variants)
        self.variants = ordered_variants
        self.forward_mapped = forward_mapped

        # header for the per-position table: fixed fields, then one column per variant
        position_fields = ['read_name', 'contig', 'position', 'strand', 'forward_mapped']
        self.columns = merge_lists([position_fields, list(ordered_variants)])

        first_contig = full_data["contig"][0]
        self.contig = NanoporeRead.bytes_to_string(first_contig)

        # results are filled in later; start out empty
        self.position_probs = pd.DataFrame()
        self.has_data = False
        self.per_read_calls = pd.DataFrame()

        # header for the per-read table: fixed fields, then one column per variant
        per_read_fields = ['read_name', 'contig', 'strand', "forward_mapped", "n_sites"]
        self.per_read_columns = merge_lists([per_read_fields, list(ordered_variants)])

Example #3

Show file

def organize_fast5s(fast5_locations):
    """Map fast5 file names to read ids and collect reads that failed to initialize.

    :param fast5_locations: iterable of fast5 file paths
    :return: tuple ``(fast5_to_read_id, requires_event_calling)`` where the
             first maps the file's basename minus its last six characters
             (the ".fast5" suffix) to the read label, and the second lists
             ``(fast5_path, read_id)`` for every read whose ``Initialize()``
             returned a falsy value
    """
    # gathered data
    fast5_to_read_id = {}
    requires_event_calling = []

    # examine each fast5
    for fast5 in fast5_locations:
        npr = NanoporeRead(fast5)
        try:
            success = npr.Initialize()
            read_id = npr.read_label
        finally:
            # always release the handle, even if initialization raises
            npr.close()
        fast5_id = os.path.basename(fast5)[:-6]
        fast5_to_read_id[fast5_id] = read_id
        if not success:
            requires_event_calling.append((fast5, read_id))

    return fast5_to_read_id, requires_event_calling

Example #4

Show file

def organize_fast5s(fast5_locations, realign_all=False):
    """Map fast5 file names to read ids and collect reads needing event calling.

    :param fast5_locations: iterable of fast5 file paths
    :param realign_all: when true, every read is queued for event calling
                        regardless of whether its metadata initialized
    :return: tuple ``(fast5_to_read_id, requires_event_calling)`` where the
             first maps the file's basename minus its last six characters
             (the ".fast5" suffix) to the read label, and the second lists
             ``(fast5_path, read_id)`` for every queued read
    """
    # gathered data
    fast5_to_read_id = {}
    requires_event_calling = []

    # examine each fast5
    for fast5 in fast5_locations:
        npr = NanoporeRead(fast5, perform_kmer_event_alignment=False)
        try:
            success = npr._initialize_metadata()
            read_id = npr.read_label
        finally:
            # always release the handle, even if metadata initialization raises
            npr.close()
        fast5_id = os.path.basename(fast5)[:-6]
        fast5_to_read_id[fast5_id] = read_id
        if not success or realign_all:
            requires_event_calling.append((fast5, read_id))

    return fast5_to_read_id, requires_event_calling

Example #5

Show file

File: test_event_detection.py Project: wgosal/signalAlign

    def test_run_kmeralign_exe(self):
        """Run the kmer-align executable on an RNA fast5 and check the events it writes."""
        # inputs for the executable
        bin_dir = os.path.join(self.HOME, "bin")
        fast5_path = os.path.abspath(self.tmp_rna_file2)
        sequence = "CAUCCUGCCCUGUGUUAUCCAGUUAUGAGAUAAAAAAUGAAUAUAAGAGUGCUUGUCAUUAUAAAAGUUUUCCUUUUUAUUACCAUCCAAGCCACCAGCUGCCAGCCACCAGCAGCCAGCUGCCAGCACUAGCUUUUUUUUUUUAGCACUUAGUAUUUAGCAGCAUUUAUUAACAGGUACUUUAAGAAUGAUGAAGCAUUGUUUUAAUCUCACUGACUAUGAAGGUUUUAGUUUCUGCUUUUGCAAUUGUGUUUGUGAAAUUUGAAUACUUGCAGGCUUUGUAUGUGAAUAAUUUUAGCGGCUGGUUGGAGAUAAUCCUACGGGAAUUACUUAAAACUGUGCUUUAACUAAAAUGAAUGAGCUUUAAAAUCCCUCCUCCUACUCCAUCAUCAUCCCACUAUUCAUCUUAUCUCAUUAUCAUCAACCUAUCCCACAUCCCUAUCACCACAGCAAUCCAA"
        model_path = self.rna_model_file

        metadata_read = NanoporeRead(os.path.abspath(self.tmp_rna_file3))
        metadata_read._initialize_metadata()

        events_location = "/Analyses/SignalAlign_Basecall_1D_001/BaseCalled_template"
        # release the fixture's handle before the executable is run on the file
        self.rna_handle2.close()

        succeeded = run_kmeralign_exe(fast5_path, sequence, model_path,
                                      events_location, bin_dir)

        # reopen the file and read back what was written at the destination
        reopened = Fast5(self.tmp_rna_file2, 'r+')
        event_table = np.array(reopened[events_location])

        self.assertEqual(event_table[0]["raw_length"], 7)
        self.assertTrue(succeeded)

Example #6

Show file

File: variantCaller.py Project: wgosal/signalAlign

    def get_data(self):
        """Calculate the normalized probability of variant for each nucleotide and across the read.

        Rows of ``self.variant_data`` are grouped by read strand (``b"t"`` then
        ``b"c"``) and by each distinct ``forward_mapped`` value. For every
        reference position in a group, the posterior probabilities are summed
        per base and divided by the position's total, producing one row per
        position. The per-position values are also averaged over the number
        of positions to produce one per-read row for the group.

        Side effects: sets ``self.position_probs``, ``self.per_read_calls`` and
        ``self.has_data`` (only when at least one read strand has rows).

        :return: ``self.position_probs`` (left as initialized elsewhere when
                 no read strand has any rows)
        """
        # final location of per position data and per read data
        data = []
        per_read_data = []
        for read_strand in (b"t", b"c"):
            read_strand_specifc_data = self.variant_data[self.variant_data["strand"] == read_strand]
            # the bytes form is used for filtering above; the str form goes into output rows
            read_strand = read_strand.decode("utf-8")
            if len(read_strand_specifc_data) == 0:
                continue
            for forward_mapped in set(self.variant_data["forward_mapped"]):
                # anything other than b"forward" is reported as the "-" mapping strand
                mapping_strand = "-"
                if forward_mapped == b"forward":
                    mapping_strand = "+"
                strand_specifc_data = read_strand_specifc_data[read_strand_specifc_data["forward_mapped"] ==
                                                               forward_mapped]
                if len(strand_specifc_data) == 0:
                    continue
                # get positions on strand
                positions = set(strand_specifc_data["reference_position"])
                n_positions = len(positions)
                # running sum of normalized probabilities, one slot per variant
                strand_read_nuc_data = [0] * len(self.variants)

                # marginalize probabilities for each position
                for pos in positions:
                    pos_data = strand_specifc_data[strand_specifc_data["reference_position"] == pos]
                    total_prob = 0
                    position_nuc_dict = {x: 0.0 for x in self.variants}
                    # Get total probability for each nucleotide
                    for nuc in set(pos_data["base"]):
                        nuc_data = pos_data[pos_data["base"] == nuc]
                        nuc_prob = sum(nuc_data["posterior_probability"])
                        total_prob += nuc_prob
                        # NOTE(review): a base that is not in self.variants adds a new key here,
                        # and self.variants.index() below would then raise ValueError - confirm
                        # that the input only ever contains tracked variants
                        position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                    # normalize probabilities over each position
                    nuc_data = [0] * len(self.variants)
                    for nuc in position_nuc_dict.keys():
                        index = self.variants.index(nuc)
                        # NOTE(review): no guard for total_prob == 0 - confirm it cannot occur
                        nuc_data[index] = position_nuc_dict[nuc] / total_prob
                        strand_read_nuc_data[index] += nuc_data[index]

                    data.append(merge_lists([[self.read_name, self.contig, pos, read_strand, mapping_strand],
                                             nuc_data]))
                if n_positions > 0:
                    per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand, mapping_strand,
                                                       n_positions],
                                                      [prob / n_positions for prob in strand_read_nuc_data]]))

            # NOTE(review): these assignments sit inside the read-strand loop, so the frames are
            # rebuilt from the accumulated lists after every strand that has rows, and are skipped
            # by the `continue` above for an empty strand
            self.position_probs = pd.DataFrame(data, columns=self.columns)
            self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
            self.has_data = True

        return self.position_probs

Example #7

Show file

File: variantCaller.py Project: wgosal/signalAlign

 def __init__(self, variant_data, variants, read_name):
     """Set up per-read marginalization over posterior probabilities.

     :param variant_data: variant data; indexed by column name ("contig") here
     :param variants: bases to track probabilities for (stored sorted)
     :param read_name: identifier of the read
     """
     self.read_name = read_name
     self.variant_data = variant_data

     ordered_variants = sorted(variants)
     self.variants = ordered_variants

     # header for the per-position table: fixed fields, then one column per variant
     position_fields = ['read_name', 'contig', 'position', 'strand', 'forward_mapped']
     self.columns = merge_lists([position_fields, list(ordered_variants)])

     first_contig = variant_data["contig"][0]
     self.contig = NanoporeRead.bytes_to_string(first_contig)

     # results are filled in later; start out empty
     self.position_probs = pd.DataFrame()
     self.has_data = False
     self.per_read_calls = pd.DataFrame()

     # header for the per-read table: fixed fields, then one column per variant
     per_read_fields = ['read_name', 'contig', 'strand', "forward_mapped", "n_sites"]
     self.per_read_columns = merge_lists([per_read_fields, list(ordered_variants)])

Example #8

Show file

File: test_event_detection.py Project: wgosal/signalAlign

 def test_load_from_raw(self):
     """Check that load_from_raw writes events and a Fastq into the fast5 file.

     Both NanoporeRead handles are closed in ``finally`` blocks so a failing
     call or assertion does not leave the fast5 file open.
     """
     path_to_bin = os.path.join(self.HOME, "bin")
     fast5_path = os.path.abspath(self.tmp_rna_file3)
     alignment_file = os.path.join(
         self.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.sam")

     np_handle = NanoporeRead(fast5_path)
     try:
         np_handle._initialize_metadata()
         saved_location = load_from_raw(np_handle, alignment_file,
                                        self.rna_model_file, path_to_bin)
     finally:
         np_handle.close()

     # reopen to read back what was written
     np_handle = NanoporeRead(fast5_path)
     try:
         # get events and validate
         events = np.array(
             np_handle.
             fastFive["/Analyses/Basecall_1D_001/BaseCalled_template/Events"])
         self.assertEqual(events[0]["raw_length"], 11)
         self.assertTrue("/Analyses/Basecall_1D_001/BaseCalled_template/Fastq"
                         in np_handle.fastFive)
         self.assertEqual(saved_location, "/Analyses/Basecall_1D_001")
     finally:
         np_handle.close()

Example #9

Show file

    def check_alignments(self,
                         true_alignments,
                         reads,
                         reference,
                         kmer_length,
                         contig_name,
                         extra_args=None,
                         rna=False):
        """Run signalAlign on a directory of fast5 reads and sanity-check its output.

        Checks performed:
          * the runSignalAlign command exits with status 0
          * one .tsv alignment is produced per input fast5
          * every aligned kmer equals the reference kmer at the reported
            position (reversed when ``rna`` is true)
          * the number of aligned rows is between 1x and 3x the read's
            template event count

        Side effect: changes the process working directory to ``BIN_PATH``.

        :param true_alignments: unused; overwritten on the first line of the body
        :param reads: directory containing ``*.fast5`` files
        :param reference: path to the reference fasta file
        :param kmer_length: length of the kmers to compare
        :param contig_name: key of the contig in the reference fasta dictionary
        :param extra_args: optional string appended verbatim to the command line
        :param rna: when true, expected kmers are reversed before comparison
        """
        # TODO remove this from the framework and code
        # Overwrites the argument with a lambda that would raise ZeroDivisionError if called;
        # nothing below calls it (the only remaining reference is commented out).
        true_alignments = lambda x: 1 / 0

        def get_kmer(start):
            # `referece_sequence` is assigned further down in the enclosing method;
            # this helper is only called after that assignment.
            kmer = referece_sequence[start:start + kmer_length]
            if type(kmer) is str:
                return kmer
            else:
                return bytes.decode(kmer)

        input_fast5s = glob.glob(os.path.join(reads, "*.fast5"))
        assert len(input_fast5s) > 0, "Didn't find test MinION reads"
        assert os.path.isfile(reference), "Didn't find reference sequence"

        # it's this or rewrite all the relative locations of the files
        os.chdir(BIN_PATH)

        # prep command
        run_signal_align = os.path.join(BIN_PATH, "runSignalAlign")
        # removed: --debug
        alignment_command = "{runsignalalign} run2 -d={reads} --bwa_reference={ref} -smt=threeState -o={testDir} " \
                            "".format(runsignalalign=run_signal_align, reads=reads, ref=reference,
                                      testDir="./signalAlign_unittest/")
        if extra_args is not None:
            alignment_command += extra_args

        # run signalAlign
        # the command is a single string executed through the shell
        result = call(alignment_command, shell=True, bufsize=-1)
        self.assertTrue(
            result == 0, "Error running signalAlign. Command was {}".format(
                alignment_command))

        # get alignments
        # relative path: resolved against BIN_PATH because of the chdir above
        test_alignments = glob.glob(
            "./signalAlign_unittest/tempFiles_alignment/*.tsv")
        self.assertTrue(
            len(test_alignments) == len(input_fast5s),
            "Didn't make all alignments got {got} should be {should}".format(
                got=len(test_alignments), should=len(input_fast5s)))

        # prep for verification
        referece_sequence = getFastaDictionary(reference)[contig_name]
        # maps alignment file path -> template event count of the read it belongs to
        alignment2events = dict()
        for fast5 in input_fast5s:
            with closing(NanoporeRead(fast5, initialize=True)) as read:
                event_count = len(read.get_template_events())
                read_id = read.read_label
                self.assertTrue(
                    event_count > 0,
                    "Got no events for fast5 {} with read_id {}".format(
                        fast5, read_id))
                # output files are matched to reads by file-name prefix
                for alignment in test_alignments:
                    if os.path.basename(alignment).startswith(read_id):
                        self.assertTrue(
                            alignment not in alignment2events,
                            "Fast5 {} matched read_id {} with multiple output alignments"
                            .format(fast5, read_id))
                        alignment2events[alignment] = event_count

        for alignment in test_alignments:
            # only referenced by the commented-out line below
            alignment_file = alignment.split("/")[-1]
            # expected = parse_alignment_full(os.path.join(true_alignments, alignment_file))
            obs = parse_alignment_full(alignment)
            for row in obs.itertuples():
                # itertuples() yields the index at position 0, so fields start at row[1]
                ref_pos = row[1]
                obs_kmer = row[2]
                strand = row[3]
                exp_kmer = get_kmer(ref_pos)
                if rna:
                    exp_kmer = exp_kmer[::-1]
                self.assertEqual(
                    obs_kmer,
                    exp_kmer,
                    msg="kmer at index {idx} on strand {strand} is {obs} "
                    "should be {exp}, file {f}".format(idx=ref_pos,
                                                       strand=strand,
                                                       obs=obs_kmer,
                                                       exp=exp_kmer,
                                                       f=alignment))
            signal_align_event_count = len(obs)
            # raises KeyError if no fast5 read label matched this alignment file above
            intial_event_count = alignment2events[alignment]
            self.assertTrue(
                signal_align_event_count >= intial_event_count,
                "SignalAlign produced {} events, less than inital count {}".
                format(signal_align_event_count, intial_event_count))
            # this is a magic number
            self.assertTrue(
                signal_align_event_count <= intial_event_count * 3,
                "SignalAlign produced {} events, more than 3x the initial count {}"
                .format(signal_align_event_count, intial_event_count))

Example #10

Show file

File: signalAlignment.py Project: Alexle63/signalAlign

    def run(self):
        """Align one fast5 read with the signalMachine executable.

        Steps, in order:
          1. check preconditions (HMMs when collecting expectations, fast5 exists)
          2. write the serialized nanopore read and the nucleotide read to temp files
          3. obtain a guide alignment (from the alignment file, else from BWA) and
             write its cigar to a temp file, or reuse an existing cigar file
          4. assemble the command-line flags and run the executable
          5. if ``self.embed`` is set, write the output back into the fast5 file

        Side effects: creates temp files, sets ``self.read_label``, may set
        ``self.in_templateHmm`` / ``self.in_complementHmm`` to defaults and
        ``self.max_memory_usage_kb``, and prints progress to stdout.

        :return: True on success; False when the fast5 is missing or when a
                 check fails (``self.failStop`` is called in the latter cases)
        :raises AssertionError: when a required model or reference file is missing
        :raises Exception: any exception raised while running the executable is
                           printed and re-raised
        """
        print("[SignalAlignment.run] INFO: Starting on {read}".format(
            read=self.in_fast5))
        if self.get_expectations:
            assert self.in_templateHmm is not None, "Need template HMM files for model training"
            if self.twoD_chemistry:
                assert self.in_complementHmm is not None, "Need compement HMM files for model training"
        if not os.path.isfile(self.in_fast5):
            print("[SignalAlignment.run] ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        # prep
        self.openTempFolder("tempFiles_%s" % self.read_name)
        if self.twoD_chemistry:
            npRead = NanoporeRead2D(fast_five_file=self.in_fast5,
                                    event_table=self.event_table,
                                    initialize=True)
        else:
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  event_table=self.event_table,
                                  initialize=True)
        #todo need to validate / generate events and nucleotide read

        # read label
        read_label = npRead.read_label  # use this to identify the read throughout
        self.read_label = read_label

        # nanopore read (event table, etc)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        # skip rewriting only when temp-file reuse is enabled AND the file already exists
        if not (self.check_for_temp_file_existance
                and os.path.isfile(npRead_)):
            # TODO: confirm this is correct for RNA given the 3'-5' mapping
            fH = open(npRead_, "w")
            ok = npRead.Write(out_file=fH, initialize=True)
            fH.close()
            if not ok:
                self.failStop(
                    "[SignalAlignment.run] File: %s did not pass initial checks"
                    % self.read_name, npRead)
                return False

        # nucleotide read
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        ok = self.write_nucleotide_read(npRead, read_fasta_)
        # a failure here is reported but does not abort the run
        if not ok:
            print(
                "[SignalAlignment.run] Failed to write nucleotide read.  Continuing execution."
            )

        # alignment info
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        strand = None
        reference_name = None
        if not (self.check_for_temp_file_existance
                and os.path.isfile(cigar_file_)):

            # need guide alignment to generate cigar file
            guide_alignment = None

            # get from alignment file
            if self.alignment_file is not None:
                guide_alignment = getGuideAlignmentFromAlignmentFile(
                    self.alignment_file, read_name=read_label)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} not found in {}".format(
                            read_label, self.alignment_file))

            # get from bwa
            # only tried when the alignment file did not yield an alignment
            if guide_alignment is None and self.bwa_reference is not None:
                guide_alignment = generateGuideAlignment(
                    reference_fasta=self.bwa_reference,
                    query=read_fasta_,
                    temp_sam_path=temp_samfile_,
                    target_regions=self.target_regions)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} could not be aligned with BWA"
                        .format(read_label))

            # could not map
            if guide_alignment is None:
                self.failStop(
                    "[SignalAlignment.run] ERROR getting guide alignment",
                    npRead)
                return False

            # ensure valid
            if not guide_alignment.validate():
                self.failStop(
                    "[SignalAlignment.run] ERROR invalid guide alignment",
                    npRead)
                return False
            strand = guide_alignment.strand
            reference_name = guide_alignment.reference_name

            # write cigar to file
            cig_handle = open(cigar_file_, "w")
            cig_handle.write(guide_alignment.cigar + "\n")
            cig_handle.close()

        # otherwise, get strand from file
        else:
            strand, reference_name = getInfoFromCigarFile(cigar_file_)

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        # any other state machine type silently gets the same settings as "threeState"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
        # forward strand
        if strand == "+":
            if self.output_format == "full":
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".forward.tsv")
            elif self.output_format == "variantCaller":
                posteriors_file_path = os.path.join(
                    self.destination, read_label + model_label + ".tsv")
            else:
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".assignments.tsv")

        # backward strand
        # only the "full" format encodes the orientation in the file name
        elif strand == "-":
            if self.output_format == "full":
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".backward.tsv")
            elif self.output_format == "variantCaller":
                posteriors_file_path = os.path.join(
                    self.destination, read_label + model_label + ".tsv")
            else:
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".assignments.tsv")

        # sanity check
        else:
            self.failStop(
                "[SignalAlignment.run] ERROR Unexpected strand {}".format(
                    strand), npRead)
            return False

        # flags

        # input (match) models
        # fall back to version-specific default models when none were supplied
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(
                strand="template", version=npRead.version)
        if self.twoD_chemistry and self.in_complementHmm is None:
            pop1_complement = npRead.complement_model_id == "complement_median68pA_pop1.model"
            self.in_complementHmm = defaultModelFromVersion(
                strand="complement",
                version=npRead.version,
                pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run] ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalAlignment.run] NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm))

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(
                b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs
        # NOTE(review): when only the complement HDP is set, the -v flag is still
        # emitted with the template HDP value (None) - confirm this is intended
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        # NOTE(review): unlike the other failStop calls in this method, this one
        # does not pass npRead - confirm failStop accepts a single argument
        if self.output_format not in list(self.output_formats.keys()):
            self.failStop(
                "[SignalAlignment.run] ERROR illegal output format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        # twoD flag
        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""

        # commands
        # expectations mode writes per-strand expectation files (-t / -c);
        # otherwise posteriors are written to a single file (-u)
        if self.get_expectations:
            template_expectations_file_path = os.path.join(
                self.destination, read_label + ".template.expectations.tsv")
            complement_expectations_file_path = os.path.join(
                self.destination, read_label + ".complement.expectations.tsv")
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=self.path_to_signalMachine, model=stateMachineType_flag,
                            cigarFile=cigar_file_,
                            npRead=npRead_, readLabel=read_label, td=twoD_flag,
                            templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                            complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                            c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, degen=degenerate_flag, sparse=out_fmt, seq_name=reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
        else:
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=self.path_to_signalMachine, model=stateMachineType_flag, sparse=out_fmt,
                            cigarFile=cigar_file_,
                            readLabel=read_label, npRead=npRead_, td=twoD_flag,
                            t_model=template_model_flag, c_model=complement_model_flag,
                            posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag, seq_name=reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

        # run
        print("[SignalAlignment.run] running command: ", command, end="\n")
        try:
            # whitespace split: any path containing a space would be broken into several arguments
            command = command.split()

            if self.track_memory_usage:
                mem_command = ['/usr/bin/time', '-f', '\\nDEBUG_MAX_MEM:%M\\n']
                print(
                    "[SignalAlignment.run] Prepending command to track mem usage: {}"
                    .format(mem_command))
                mem_command.extend(command)
                command = mem_command

            output = subprocess.check_output(command, stderr=subprocess.STDOUT)
            # check_output returns bytes here (no text mode requested); str() of bytes is
            # its repr, in which newlines appear as a literal backslash followed by "n",
            # which is why the split is on "\\n"
            output = str(output).split("\\n")
            for line in output:
                print("[SignalAlignment.run]    {}: {}".format(
                    read_label, line))
                if line.startswith("DEBUG_MAX_MEM"):
                    self.max_memory_usage_kb = int(line.split(":")[1])

        except Exception as e:
            print(
                "[SignalAlignment.run] exception ({}) running signalAlign: {}".
                format(type(e), e))
            raise e

        # save to fast5 file (if appropriate)
        if self.embed:
            print("[SignalAlignment.run] embedding into Fast5 ")

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            # NOTE(review): a second reader is opened on the same fast5 here; the one
            # created at the top of this method is not closed anywhere in this method
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, "There is no path in Fast5 file: {}".format(
                "/Analyses/SignalAlign_00{}")
            output_path = npRead._join_path(signal_align_path,
                                            self.output_format)
            npRead.write_data(data, output_path)

            # Todo add attributes to signalalign output
            if self.output_format == "full":
                print(
                    "[SignalAlignment.run] writing maximum expected alignment "
                )
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path,
                                             "MEA_alignment_labels")
                events = npRead.get_template_events()
                # NOTE(review): truthiness test - if get_template_events() returns a numpy
                # array with more than one element this would raise ValueError; confirm the
                # return type
                if events:
                    if strand == "-":
                        minus = True
                    else:
                        minus = False
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=minus,
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                    # the SAM file exists only if BWA produced it during this run;
                    # otherwise an empty string is stored
                    sam_string = str()
                    if os.path.isfile(temp_samfile_):
                        with open(temp_samfile_, 'r') as test:
                            for line in test:
                                sam_string += line
                    sam_path = npRead._join_path(signal_align_path, "sam")
                    # print(sam_string)
                    npRead.write_data(data=sam_string,
                                      location=sam_path,
                                      compression=None)

        # self.temp_folder.remove_folder()
        return True

Example #11

Show file

File: organize_fast5_directory.py Project: wgosal/signalAlign

def fast5_file_organization_service(work_queue,
                                    done_queue,
                                    output_dir_count,
                                    output_base,
                                    output_index_base,
                                    copy_files,
                                    service_name="fast5_file_organization"):
    """Worker loop: file fast5 reads into output directories and index them.

    Items are taken from ``work_queue`` until the ``'STOP'`` sentinel is
    received. Each item's fast5 file is moved (or copied, when ``copy_files``
    is truthy) into one of ``output_dir_count`` directories, chosen in
    round-robin order, and a ``destination<TAB>read_id<TAB>run_id`` line is
    appended to that directory's index file for this process.

    :param work_queue: queue of mappings holding the source path under
        ``FAST5_SRC_LOCATION``; terminated by ``'STOP'``
    :param done_queue: queue receiving error strings and the final
        ``TOTAL_KEY`` / ``FAILURE_KEY`` counters
    :param output_dir_count: number of output directories (and index files)
    :param output_base: passed to ``get_output_directory`` to resolve a directory
    :param output_index_base: prefix of the per-process index file names
    :param copy_files: copy instead of move when truthy
    :param service_name: label used in log and error messages
    """
    handled = 0
    failures = 0
    worker_name = current_process().name
    index_files = {}

    def report(template, exc):
        # print the failure and forward the identical text to done_queue
        detail = "{}:{}".format(type(exc), str(exc))
        text = template.format(service_name, current_process().name, detail)
        print("[{}] ".format(service_name) + text)
        done_queue.put(text)

    # anything escaping the per-item handling lands in the outer handler
    try:

        # one append-mode index per (process, output dir); a header is only
        # written when the index file did not exist beforehand
        for dir_no in range(output_dir_count):
            index_path = "{}{}_{}.tsv".format(output_index_base, dir_no,
                                              worker_name)
            is_new_index = not os.path.isfile(index_path)
            handle = open(index_path, 'a')
            index_files[dir_no] = handle
            if is_new_index:
                handle.write("##{}:{}\n".format(
                    FAST5_ROOT, get_output_directory(output_base, dir_no)))
                handle.write("#{}\t{}\t{}\n".format(
                    FAST5_LOCATION, READ_ID, RUN_NAME))
        assert len(index_files) == output_dir_count, \
            "unexpected count of index files {} (expected {})".format(
                len(index_files), output_dir_count)

        slot = -1
        for item in iter(work_queue.get, 'STOP'):
            # round-robin over the output directories
            slot = (slot + 1) % output_dir_count

            try:
                # sanity check
                assert 0 <= slot < output_dir_count

                # work out where the file goes and how it gets there
                target_dir = get_output_directory(output_base, slot)
                source = item[FAST5_SRC_LOCATION]
                destination = os.path.join(target_dir,
                                           os.path.basename(source))
                transfer = shutil.copy if copy_files else shutil.move

                # pull the identifiers out of the fast5
                read = NanoporeRead(source)
                if not read._initialize_metadata():
                    # counted as a failure; nothing is sent to done_queue
                    failures += 1
                    continue
                read_id, run_id = read.read_label, read.run_id
                assert None not in [
                    read_id, run_id
                ], "Missing read or run id for {}".format(source)

                # move or copy the file, then record it in the index
                transfer(source, destination)
                index_files[slot].write("{}\t{}\t{}\n".format(
                    destination, read_id, run_id))

            except Exception as exc:
                report("{} '{}' failed with: {}", exc)
                failures += 1

            finally:
                # every dequeued item counts, including skipped/failed ones
                handled += 1

    except Exception as exc:
        report("{} '{}' critically failed with: {}", exc)

    finally:
        # close all index files
        for handle in index_files.values():
            if handle is not None:
                handle.close()

        # logging and final reporting
        print("[%s] '%s' completed %d calls with %d failures" %
              (service_name, current_process().name, handled, failures))
        done_queue.put("{}:{}".format(TOTAL_KEY, handled))
        done_queue.put("{}:{}".format(FAILURE_KEY, failures))

Example #12

Show file

File: sequencing_summary.py Project: wgosal/signalAlign

def get_alignment_summary_info(fast5s, alignment_file, pass_threshold=7, gap_size=10, verbose=False,
                               max_reads=100, number=0):
    """Summarise quality and alignment status of reads that have a fast5 file.

    Builds a read-label -> fast5 path mapping, then walks every segment of
    ``alignment_file`` and fills the table returned by
    ``get_summary_info_table`` (seen, q-score, start time, mapping flags,
    accuracy, flagged gaps, pass).

    :param fast5s: iterable of fast5 paths; each must exist
    :param alignment_file: SAM/BAM path (opened as BAM when it ends with "bam")
    :param pass_threshold: minimum mean base quality for a read to be analysed
        and marked as passing
    :param gap_size: forwarded to ``flag_large_gaps``
    :param verbose: forwarded to ``flag_large_gaps``
    :param max_reads: iteration stops once more than this many distinct reads
        have been seen
    :param number: forwarded (as int) to the MEA / signalAlign label loaders
    :return: the rows of the summary table whose "seen" column equals 1
    :raises AssertionError: if a fast5 path does not exist
    """
    # map read label -> fast5 path for every input file
    fast5_dict = {}
    for fast5_path in fast5s:
        assert os.path.exists(fast5_path), "fast5 path does not exist: {}".format(fast5_path)
        f5h = NanoporeRead(fast5_path)
        f5h._initialize_metadata()
        read_name = f5h.read_label
        fast5_dict[read_name] = fast5_path
    print("Created read_id to fast5_path mapping")
    # summary data stored here
    mapped_reads = get_summary_info_table(list(fast5_dict.keys()))
    seen_counter = 0
    reads_seen = set()
    # Per-read state. Segments are returned in file order, so segments of
    # different reads may interleave; the q-score and label handle used for a
    # segment must belong to that segment's read, not to the last new read.
    q_scores = {}
    cl_handle = None
    cl_handle_read = None
    print("first_len reads_seen: {}".format(len(reads_seen)), file=sys.stderr)

    with closing(pysam.AlignmentFile(alignment_file, 'rb' if alignment_file.endswith("bam") else 'r')) as aln:
        for aligned_segment in aln.fetch(until_eof=True):
            if seen_counter > max_reads:
                break
            try:
                print("reads_seen: {}".format(len(reads_seen)), file=sys.stderr)
                read_name = aligned_segment.qname.split("_")[0]
                fast5_path = fast5_dict[read_name]
                if read_name not in reads_seen:
                    reads_seen.add(read_name)
                    seen_counter += 1
                    mapped_reads["seen"][read_name] = 1
                    print(fast5_path)
                    cl_handle = CreateLabels(fast5_path, kmer_index=2)
                    cl_handle_read = read_name
                    seq_start_time = cl_handle.raw_attributes['start_time']
                    q_score_average = 0
                    if aligned_segment.query_qualities is None:
                        print("Alignment done with fasta instead of fastq so read qualities will not be reported")
                    else:
                        q_score_average = np.mean(aligned_segment.query_qualities)
                    q_scores[read_name] = q_score_average

                    mapped_reads["q_score_average"][read_name] = q_score_average
                    mapped_reads["seq_start_time"][read_name] = seq_start_time

                # always judge the segment by its own read's score; a read whose
                # first-sight setup failed has no score and is treated as 0
                q_score_average = q_scores.get(read_name, 0)

                if aligned_segment.is_secondary or aligned_segment.is_unmapped \
                        or aligned_segment.is_supplementary or aligned_segment.has_tag("SA") \
                        or q_score_average < pass_threshold:

                    if aligned_segment.is_secondary:
                        mapped_reads["num_secondary_mappings"][read_name] += 1
                    if aligned_segment.is_unmapped:
                        mapped_reads["no_mapping"][read_name] = 1
                    if aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                        mapped_reads["chimera_mapping"][read_name] += 1
                else:
                    mapped_reads["map_q"][read_name] = aligned_segment.mapq

                    soft_clipped_percentage = \
                        1 - float(len(aligned_segment.query_alignment_sequence)) / len(aligned_segment.query_sequence)
                    mapped_reads["soft_clipped_percentage"][read_name] = soft_clipped_percentage

                    handle = AlignmentSegmentWrapper(aligned_segment)
                    handle.initialize()

                    accuracy = handle.alignment_accuracy()
                    mapped_reads["basecalled_accuracy"][read_name] = accuracy
                    try:
                        # rebuild the label handle if the cached one belongs to
                        # another read (only one handle is kept alive at a time)
                        if cl_handle_read != read_name:
                            cl_handle = CreateLabels(fast5_path, kmer_index=2)
                            cl_handle_read = read_name
                        mea = cl_handle.add_mea_labels(number=int(number))
                        sa_full = cl_handle.add_signal_align_predictions(number=int(number), add_basecall=True)
                        all_basecall_data = []
                        for name, basecall_data in cl_handle.aligned_signal.prediction.items():
                            if "guide" in name:
                                all_basecall_data.extend(basecall_data)

                        alignment_summary = analyze_event_skips(mea, sa_full, all_basecall_data, generate_plot=False)
                        flagged_gaps_summary = flag_large_gaps(alignment_summary, gap_size, verbose=verbose)
                        counter = 0
                        total_distance = 0
                        for gap in flagged_gaps_summary:
                            # NOTE(review): hard-coded 10 rather than gap_size - confirm intent
                            if gap["mea_peak_distance"] > 10:
                                counter += 1
                                total_distance += gap["mea_peak_distance"]
                        if counter > 0:
                            mapped_reads["num_flagged_gaps"][read_name] = counter
                            mapped_reads["avg_flagged_gap_size"][read_name] = float(total_distance) / counter

                        if mapped_reads["q_score_average"][read_name] > pass_threshold:
                            mapped_reads["pass"][read_name] = 1

                    except KeyError:
                        mapped_reads["other_errors"][read_name] = 1
            except Exception as e:
                # best effort: report the problem and carry on with the next segment
                print(e, file=sys.stderr)

        return mapped_reads[mapped_reads["seen"] == 1]

Example #13

Show file

def event_detection(work_queue,
                    done_queue,
                    alignment_file,
                    model_file_location,
                    event_detection_strategy=None,
                    event_detection_params=None,
                    tmp_directory=None,
                    write_failed_alignments=True,
                    service_name="event_detection"):
    """Worker loop: run ``load_from_raw`` on every fast5 taken from a queue.

    Items are read from ``work_queue`` until the ``'STOP'`` sentinel arrives.
    A failure on a single item is printed, pushed onto ``done_queue`` and
    counted; the loop then continues. An error outside the per-item handling
    is reported as critical and ends the loop. The totals are always pushed to
    ``done_queue`` under ``TOTAL_KEY`` and ``FAILURE_KEY`` at the end.

    :param work_queue: queue of mappings whose ``'fast5'`` entry unpacks into
        ``(fast5, read_id)``
    :param done_queue: queue receiving error strings and the final counters
    :param alignment_file: forwarded to ``load_from_raw``
    :param model_file_location: forwarded to ``load_from_raw``
    :param event_detection_strategy: accepted but not used in this function
    :param event_detection_params: accepted but not used in this function
    :param tmp_directory: accepted but not used in this function
    :param write_failed_alignments: forwarded to ``load_from_raw``
    :param service_name: label used in log and error messages
    """
    handled = 0
    failures = 0

    def report(template, exc):
        # print the failure and forward the identical text to done_queue
        detail = "{}:{}".format(type(exc), str(exc))
        text = template.format(service_name, current_process().name, detail)
        print("[{}] ".format(service_name) + text)
        done_queue.put(text)

    try:
        for item in iter(work_queue.get, 'STOP'):
            # unpacking happens outside the per-item guard on purpose: a
            # malformed item is reported as a critical failure
            fast5, read_id = item['fast5']
            np_handle = None

            try:
                np_handle = NanoporeRead(fast5, initialize=False)
                loaded = load_from_raw(
                    np_handle,
                    alignment_file,
                    model_file_location,
                    write_failed_alignments=write_failed_alignments)
                if not loaded:
                    raise Exception(
                        "load_from_raw failed on read {} in {}".format(
                            read_id, fast5))

            except Exception as exc:
                report("{} '{}' failed with: {}", exc)
                failures += 1

            finally:
                # release the read whether or not loading worked
                if np_handle is not None:
                    np_handle.close()

            handled += 1

    except Exception as exc:
        report("{} '{}' critically failed with: {}", exc)

    finally:
        # logging and final reporting
        print("[%s] '%s' completed %d calls with %d failures" %
              (service_name, current_process().name, handled, failures))
        done_queue.put("{}:{}".format(TOTAL_KEY, handled))
        done_queue.put("{}:{}".format(FAILURE_KEY, failures))

Example #14

Show file

    def run(self, get_expectations=False):
        """Align this object's fast5 read by running ``./signalMachine``.

        Writes the read to a temporary ``.npRead`` file, builds and validates
        a guide alignment, assembles the signalMachine command line from the
        instance settings and executes it through the shell.

        :param get_expectations: when true, request template/complement
            expectations files (``-t``/``-c``) instead of a posteriors file
            (``-u``); both HMM paths must already be set on the instance
        :return: False if the fast5 file is missing, the read fails its
            initial checks, the guide alignment does not validate, a
            complement HMM is missing for 2D reads, or the output format is
            unknown; True otherwise. The exit status of signalMachine is not
            inspected.
        :raises AssertionError: if required HMM, HDP or reference files are
            unset or missing
        """
        print("[SignalAlignment.run]INFO: Starting on {read}".format(
            read=self.in_fast5),
              file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None,\
                "Need HMM files for model training"
        # file checks
        if os.path.isfile(self.in_fast5) is False:
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry)
        # context manager: the temp file is closed even if Write raises
        with open(npRead_, "w") as fH:
            ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        if not ok:
            self.failStop(
                "[SignalAlignment.run]File: %s did not pass initial checks" %
                self.read_name, npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        # NOTE(review): the `ok` returned here is overwritten by the guide
        # alignment validation below before it is ever checked
        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(
                nanopore_read=npRead, twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                               oned_read_path=read_fasta_)
            pop1_complement = None

        # add an indicator for the model being used
        if self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # "threeState" and any unrecognised value share the default
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(
            bwa_index=self.bwa_index,
            query=read_fasta_,
            temp_sam_path=temp_samfile_,
            target_regions=self.target_regions)
        ok = guide_alignment.validate(self.reference_map.keys())
        if not ok:
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                          npRead)
            return False

        with open(cigar_file_, "w") as cig_handle:
            cig_handle.write(guide_alignment.cigar + "\n")

        # output file name: /directory/for/files/file.model.orientation.tsv
        # (stays empty when the strand is neither "+" nor "-")
        posteriors_file_path = ''
        strand = guide_alignment.strand
        if strand in ("+", "-"):
            if self.output_format == "full":
                suffix = ".forward.tsv" if strand == "+" else ".backward.tsv"
            elif self.output_format == "variantCaller":
                suffix = ".tsv"
            else:
                suffix = ".assignments"
            posteriors_file_path = self.destination + read_label + model_label + suffix

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # input (match) models: fall back to the defaults for this version
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                          version=version)
        if self.twoD_chemistry and self.in_complementHmm is None:
            self.in_complementHmm = defaultModelFromVersion(
                strand="complement",
                version=version,
                pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry and self.in_complementHmm is None:
            self.failStop(
                "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                npRead)
            return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalALignment.run]NOTICE: template model {t} complement model {c}"
            .format(t=self.in_templateHmm, c=self.in_complementHmm),
            file=sys.stderr)

        # reference sequences
        forward_reference = self.reference_map[
            guide_alignment.reference_name]["forward"]
        assert forward_reference is not None
        backward_reference = self.reference_map[
            guide_alignment.reference_name]["backward"]
        assert backward_reference is not None
        assert os.path.isfile(forward_reference)
        assert os.path.isfile(backward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=forward_reference)
        backward_ref_flag = "-b {b_ref} ".format(b_ref=backward_reference)

        # input HDPs
        # NOTE(review): when only the complement HDP is set this still emits
        # "-v None" for the template - confirm that is acceptable
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats:
            # pass the read along like every other failStop call in this method
            self.failStop(
                "[SignalAlignment.run]ERROR illegal output format selected %s"
                % self.output_format, npRead)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        twoD_flag = "--twoD" if self.twoD_chemistry else ""

        # fields shared by both command templates
        shared_fields = dict(vA=path_to_signalAlign,
                             td=twoD_flag,
                             degen=degenerate_flag,
                             sparse=out_fmt,
                             model=stateMachineType_flag,
                             f_ref=forward_ref_flag,
                             b_ref=backward_ref_flag,
                             npRead=npRead_,
                             t_model=template_model_flag,
                             c_model=complement_model_flag,
                             thresh=threshold_flag,
                             expansion=diag_expansion_flag,
                             trim=trim_flag,
                             hdp=hdp_flags,
                             readLabel=read_label,
                             cigarFile=cigar_file_)

        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations}"\
                .format(templateExpectations=template_expectations_file_path,
                        complementExpectations=complement_expectations_file_path,
                        **shared_fields)
        else:
            print("read_label", read_label)
            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel}"\
                .format(posteriors=posteriors_file_path, **shared_fields)

        # run
        # NOTE(review): the command goes through the shell with unquoted
        # paths; paths containing spaces or shell metacharacters will break
        print("signalAlign - running command: ",
              command,
              end="\n",
              file=sys.stderr)
        os.system(command)
        self.temp_folder.remove_folder()
        return True

Example #15

Show file

    def run(self, get_expectations=False):
        print("[SignalAlignment.run]INFO: Starting on {read}".format(
            read=self.in_fast5),
              file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None, \
                "Need HMM files for model training"
        # file checks
        if os.path.isfile(self.in_fast5) is False:
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        # TODO is this totally f****d for RNA because of 3'-5' mapping?
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry,
                              event_table=self.event_table)
        fH = open(npRead_, "w")
        ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        fH.close()
        if not ok:
            self.failStop(
                "[SignalAlignment.run]File: %s did not pass initial checks" %
                self.read_name, npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(
                nanopore_read=npRead, twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                               oned_read_path=read_fasta_)
            pop1_complement = None
        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(
            bwa_index=self.bwa_index,
            query=read_fasta_,
            temp_sam_path=temp_samfile_,
            target_regions=self.target_regions)
        # ok = guide_alignment.validate(list(self.reference_map.keys()))
        ok = guide_alignment.validate()

        if not ok:
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                          npRead)
            return False

        cig_handle = open(cigar_file_, "w")
        cig_handle.write(guide_alignment.cigar + "\n")
        cig_handle.close()

        # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
        posteriors_file_path = ''
        # forward strand
        if guide_alignment.strand == "+":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".forward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # backward strand
        if guide_alignment.strand == "-":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".backward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # flags

        # input (match) models
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                          version=version)
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.in_complementHmm = defaultModelFromVersion(
                    strand="complement",
                    version=version,
                    pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalALignment.run]NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm),
            file=sys.stderr)

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(
                b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in list(self.output_formats.keys()):
            self.failStop(
                "[SignalAlignment.run]ERROR illegal output format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""
        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                            cigarFile=cigar_file_,
                            npRead=npRead_, readLabel=read_label, td=twoD_flag,
                            templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                            complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                            c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, degen=degenerate_flag, sparse=out_fmt, seq_name=guide_alignment.reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
        else:
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=path_to_signalAlign, model=stateMachineType_flag, sparse=out_fmt,
                            cigarFile=cigar_file_,
                            readLabel=read_label, npRead=npRead_, td=twoD_flag,
                            t_model=template_model_flag, c_model=complement_model_flag,
                            posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag, seq_name=guide_alignment.reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

        # run
        print("signalAlign - running command: ",
              command,
              end="\n",
              file=sys.stderr)
        os.system(command)
        if self.embed:
            print("signalAlign - embedding into Fast5 ", file=sys.stderr)

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, "There is no path in Fast5 file: {}".format(
                "/Analyses/SignalAlign_00{}")
            output_path = npRead._join_path(signal_align_path,
                                            self.output_format)
            npRead.write_data(data, output_path)

            # Todo add attributes to signalalign output
            if self.output_format == "full":
                print("signalAlign - writing maximum expected alignment ",
                      file=sys.stderr)
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path,
                                             "MEA_alignment_labels")
                events = npRead.get_template_events()
                if events:
                    if guide_alignment.strand == "-":
                        minus = True
                    else:
                        minus = False
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=minus,
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                    sam_string = str()
                    with open(temp_samfile_, 'r') as test:
                        for line in test:
                            sam_string += line
                    sam_path = npRead._join_path(signal_align_path, "sam")
                    # print(sam_string)
                    npRead.write_data(data=sam_string,
                                      location=sam_path,
                                      compression=None)

        # self.temp_folder.remove_folder()
        return True

Example #16

Show file

File: variantCaller.py Project: wgosal/signalAlign

    def get_data(self):
        """Calculate the normalized probability of variant for each nucleotide and across the read.

        For each read strand ("t" then "c") that has rows in ``self.variant_data``, the
        posterior probabilities are summed per variant character at every counted reference
        position and divided by the total over all variants at that position. Per-position
        rows are stored in ``self.position_probs`` and the per-strand averages over the
        counted positions in ``self.per_read_calls``. ``self.has_data`` records whether
        ``self.variant_data`` held any rows.

        The first mapping strand always belongs to the "t" read strand and the second to
        the "c" read strand, even when the "t" strand has no rows.

        :return: ``self.position_probs``; when ``self.variant_data`` is empty the attribute
                 is returned as it was before the call
        :raises AssertionError: if no probability was found for any variant at a position
        """
        # nothing to marginalize: flag it and hand back whatever is already stored
        if len(self.variant_data) == 0:
            self.has_data = False
            return self.position_probs

        # final location of per position data and per read data
        data = []
        per_read_data = []
        # (mapping strand of the "t" read strand, mapping strand of the "c" read strand)
        if self.forward_mapped:
            mapping_strands = ("+", "-")
        else:
            mapping_strands = ("-", "+")

        # index of the last character of a kmer; the variant character is read from there
        kmer_len_1 = len(self.variant_data["reference_kmer"].iloc[0]) - 1

        # Pair each read strand with its mapping strand up front so that skipping an empty
        # strand cannot shift the pairing of the following strand.
        for read_strand, mapping_strand in zip(("t", "c"), mapping_strands):
            read_strand_specific_data = self.variant_data[self.variant_data["strand"] == read_strand]
            if len(read_strand_specific_data) == 0:
                continue
            # get positions on strand
            positions = sorted(set(read_strand_specific_data["reference_index"]))

            # walk the reference in descending order for the minus mapping strand
            if mapping_strand == "-":
                positions = positions[::-1]

            # running sum of normalized probabilities per variant, averaged at the end
            strand_read_nuc_data = [0] * len(self.variants)

            # marginalize probabilities for each position
            n_positions = 0
            for pos in positions:
                pos_data = read_strand_specific_data[read_strand_specific_data["reference_index"] == pos]
                # only positions whose first aligned kmer ends in "X" are counted
                if pos_data["aligned_kmer"].iloc[0][kmer_len_1] != "X":
                    continue
                n_positions += 1
                total_prob = 0
                position_nuc_dict = {x: 0.0 for x in self.variants}
                # Get total probability for each nucleotide
                for nuc in self.variants:
                    # rows whose path kmer ends with this variant character
                    nuc_data = pos_data[[nuc == kmer[kmer_len_1] for kmer in pos_data["path_kmer"]]]
                    nuc_prob = sum(nuc_data["posterior_probability"])
                    total_prob += nuc_prob
                    position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                # normalize probabilities over each position
                nuc_data = [0] * len(self.variants)
                for index, nuc in enumerate(self.variants):
                    assert total_prob > 0, "Check 'variants' parameter. There seems to be no kmers with those " \
                                           "variant characters"
                    nuc_data[index] = position_nuc_dict[nuc] / total_prob
                    strand_read_nuc_data[index] += nuc_data[index]
                data.append(merge_lists([[self.read_name, self.contig, pos, read_strand,
                                          mapping_strand], nuc_data]))
            # a strand only contributes a per-read row if at least one position was counted
            if n_positions > 0:
                per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand,
                                                   mapping_strand, n_positions],
                                                  [prob / n_positions for prob in strand_read_nuc_data]]))

        self.position_probs = pd.DataFrame(data, columns=self.columns)
        self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
        self.has_data = True

        return self.position_probs