Example #1
0
 def makeNanoporeRead(f5_path):
     # here we load the NanoporeRead and write it to a file
     np = NanoporeRead(fast_five_file=f5_path, twoD=False)  # make this a config arg
     ok = np.Initialize(job)
     if not ok:
         return None
     _l = np.read_label
     tF = job.fileStore.getLocalTempFile()
     fH = open(tF, "w")
     ok = np.Write(job, fH, initialize=False)
     if not ok:
         fH.close()
         return None
     fH.close()
     # then we gzip it and deliver it to the readstore and return the ledger line
     fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l)
     fH = open(tF, "rb")
     gz = gzip.open(fn.fullpathGetter(), "wb")
     shutil.copyfileobj(fH, gz)
     fH.close()
     gz.close()
     try:
         deliverOutput(job, fn, readstore_dir)
     except RuntimeError:
         job.fileStore.logToMaster("[makeNanoporeReadsJobFunction] Read %s failed to upload" % _l)
         return None
     return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter()))
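For reference, the gzip-then-deliver step above can be isolated into a small standard-library helper. This is a sketch rather than part of the module; the helper name gzip_file is an assumption, and it uses the same gzip.open/shutil.copyfileobj streaming copy as the function above.

import gzip
import shutil

def gzip_file(src_path, dest_path):
    # stream-copy src_path into a gzip file at dest_path without reading it all into memory
    with open(src_path, "rb") as src, gzip.open(dest_path, "wb") as dest:
        shutil.copyfileobj(src, dest)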
Example #2
0
    def __init__(self, full_data, variants, read_name, forward_mapped):
        """Marginalize over all posterior probabilities to give a per position read probability
        :param variants: bases to track probabilities
        :param full_data: path to full tsv file

                             ['contig', 'reference_index',
                              'reference_kmer', 'read_file',
                              'strand', 'event_index',
                              'event_mean', 'event_noise',
                              'event_duration', 'aligned_kmer',
                              'scaled_mean_current', 'scaled_noise',
                              'posterior_probability', 'descaled_event_mean',
                              'ont_model_mean', 'path_kmer']
        """
        self.read_name = read_name
        self.full_data = full_data
        self.variant_data = self.full_data[["X" in kmer for kmer in self.full_data["reference_kmer"]]]
        self.variants = sorted(variants)
        self.forward_mapped = forward_mapped
        self.columns = merge_lists([['read_name', 'contig', 'position', 'strand', 'forward_mapped'],
                                    list(self.variants)])
        self.contig = NanoporeRead.bytes_to_string(self.full_data["contig"][0])
        self.position_probs = pd.DataFrame()
        self.has_data = False
        self.per_read_calls = pd.DataFrame()
        self.per_read_columns = merge_lists([['read_name', 'contig', 'strand', "forward_mapped", "n_sites"],
                                             list(self.variants)])
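The constructor above treats full_data as a table (it builds a boolean mask on the "reference_kmer" column). A minimal sketch of loading such a table with pandas and reproducing that variant-kmer mask; the column names come from the docstring above, and the file path is a placeholder.

import pandas as pd

FULL_TSV_COLUMNS = ['contig', 'reference_index', 'reference_kmer', 'read_file',
                    'strand', 'event_index', 'event_mean', 'event_noise',
                    'event_duration', 'aligned_kmer', 'scaled_mean_current',
                    'scaled_noise', 'posterior_probability', 'descaled_event_mean',
                    'ont_model_mean', 'path_kmer']

# assumes the full tsv has no header row
full_data = pd.read_csv("read.full.tsv", sep="\t", names=FULL_TSV_COLUMNS)
# keep only rows whose reference kmer contains the ambiguous/variant base "X",
# mirroring the mask built in __init__ above
variant_data = full_data[["X" in kmer for kmer in full_data["reference_kmer"]]]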
Example #3
0
def organize_fast5s(fast5_locations):
    # gathered data
    fast5_to_read_id = dict()
    requires_event_calling = list()

    # examine each fast5
    for fast5 in fast5_locations:
        npr = NanoporeRead(fast5)
        success = npr.Initialize()
        read_id = npr.read_label
        fast5_id = os.path.basename(fast5)[:-6]  # strip the ".fast5" extension
        fast5_to_read_id[fast5_id] = read_id
        if not success:
            requires_event_calling.append((fast5, read_id))
        npr.close()

    return fast5_to_read_id, requires_event_calling
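A short usage sketch for organize_fast5s; the directory path is a placeholder. Reads whose Initialize() call fails end up in requires_event_calling and would need event detection before further processing.

import glob
import os

fast5_locations = glob.glob(os.path.join("/path/to/fast5_dir", "*.fast5"))
fast5_to_read_id, requires_event_calling = organize_fast5s(fast5_locations)
print("{} reads need event calling".format(len(requires_event_calling)))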
Example #4
0
def organize_fast5s(fast5_locations, realign_all=False):
    # gathered data
    fast5_to_read_id = dict()
    requires_event_calling = list()

    # examine each fast5
    for fast5 in fast5_locations:
        npr = NanoporeRead(fast5, perform_kmer_event_alignment=False)
        success = npr._initialize_metadata()
        read_id = npr.read_label
        fast5_id = os.path.basename(fast5)[:-6]  # strip the ".fast5" extension
        fast5_to_read_id[fast5_id] = read_id
        if not success or realign_all:
            requires_event_calling.append((fast5, read_id))
        npr.close()

    return fast5_to_read_id, requires_event_calling
Example #5
    def test_run_kmeralign_exe(self):
        path_to_bin = os.path.join(self.HOME, "bin")
        rna_fast5_path = os.path.abspath(self.tmp_rna_file2)
        nuc_sequence = "CAUCCUGCCCUGUGUUAUCCAGUUAUGAGAUAAAAAAUGAAUAUAAGAGUGCUUGUCAUUAUAAAAGUUUUCCUUUUUAUUACCAUCCAAGCCACCAGCUGCCAGCCACCAGCAGCCAGCUGCCAGCACUAGCUUUUUUUUUUUAGCACUUAGUAUUUAGCAGCAUUUAUUAACAGGUACUUUAAGAAUGAUGAAGCAUUGUUUUAAUCUCACUGACUAUGAAGGUUUUAGUUUCUGCUUUUGCAAUUGUGUUUGUGAAAUUUGAAUACUUGCAGGCUUUGUAUGUGAAUAAUUUUAGCGGCUGGUUGGAGAUAAUCCUACGGGAAUUACUUAAAACUGUGCUUUAACUAAAAUGAAUGAGCUUUAAAAUCCCUCCUCCUACUCCAUCAUCAUCCCACUAUUCAUCUUAUCUCAUUAUCAUCAACCUAUCCCACAUCCCUAUCACCACAGCAAUCCAA"
        rna_model_file = self.rna_model_file
        np_handle = NanoporeRead(os.path.abspath(self.tmp_rna_file3))
        np_handle._initialize_metadata()

        dest = "/Analyses/SignalAlign_Basecall_1D_001/BaseCalled_template"
        self.rna_handle2.close()

        status = run_kmeralign_exe(rna_fast5_path, nuc_sequence,
                                   rna_model_file, dest, path_to_bin)
        rna_handle = Fast5(self.tmp_rna_file2, 'r+')

        events = np.array(rna_handle[dest])

        self.assertEqual(events[0]["raw_length"], 7)
        self.assertTrue(status)
Example #6
0
    def get_data(self):
        """Calculate the normalized probability of variant for each nucleotide and across the read"""
        # final location of per position data and per read data
        data = []
        per_read_data = []
        for read_strand in (b"t", b"c"):
            read_strand_specific_data = self.variant_data[self.variant_data["strand"] == read_strand]
            read_strand = read_strand.decode("utf-8")
            if len(read_strand_specific_data) == 0:
                continue
            for forward_mapped in set(self.variant_data["forward_mapped"]):
                mapping_strand = "-"
                if forward_mapped == b"forward":
                    mapping_strand = "+"
                strand_specific_data = read_strand_specific_data[
                    read_strand_specific_data["forward_mapped"] == forward_mapped]
                if len(strand_specific_data) == 0:
                    continue
                # get positions on strand
                positions = set(strand_specific_data["reference_position"])
                n_positions = len(positions)
                strand_read_nuc_data = [0] * len(self.variants)

                # marginalize probabilities for each position
                for pos in positions:
                    pos_data = strand_specific_data[strand_specific_data["reference_position"] == pos]
                    total_prob = 0
                    position_nuc_dict = {x: 0.0 for x in self.variants}
                    # Get total probability for each nucleotide
                    for nuc in set(pos_data["base"]):
                        nuc_data = pos_data[pos_data["base"] == nuc]
                        nuc_prob = sum(nuc_data["posterior_probability"])
                        total_prob += nuc_prob
                        position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                    # normalize probabilities over each position
                    nuc_data = [0] * len(self.variants)
                    for nuc in position_nuc_dict.keys():
                        index = self.variants.index(nuc)
                        nuc_data[index] = position_nuc_dict[nuc] / total_prob
                        strand_read_nuc_data[index] += nuc_data[index]

                    data.append(merge_lists([[self.read_name, self.contig, pos, read_strand, mapping_strand],
                                             nuc_data]))
                if n_positions > 0:
                    per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand, mapping_strand,
                                                       n_positions],
                                                      [prob / n_positions for prob in strand_read_nuc_data]]))

            self.position_probs = pd.DataFrame(data, columns=self.columns)
            self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
            self.has_data = True

        return self.position_probs
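A hedged sketch of consuming get_data(): assuming read is an instance of the class above, the method returns the per-position table and also populates per_read_calls; writing both out with to_csv is one plausible follow-up (output paths are placeholders).

position_probs = read.get_data()
if read.has_data:
    position_probs.to_csv("positions.tsv", sep="\t", index=False)
    read.per_read_calls.to_csv("per_read_calls.tsv", sep="\t", index=False)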
Example #7
0
 def __init__(self, variant_data, variants, read_name):
     """Marginalize over all posterior probabilities to give a per position read probability
     :param variants: bases to track probabilities
     :param variant_data: variant data
     """
     self.read_name = read_name
     self.variant_data = variant_data
     self.variants = sorted(variants)
     self.columns = merge_lists([['read_name', 'contig', 'position', 'strand', 'forward_mapped'],
                                 list(self.variants)])
     self.contig = NanoporeRead.bytes_to_string(self.variant_data["contig"][0])
     self.position_probs = pd.DataFrame()
     self.has_data = False
     self.per_read_calls = pd.DataFrame()
     self.per_read_columns = merge_lists([['read_name', 'contig', 'strand', "forward_mapped",
                                           "n_sites"], list(self.variants)])
 def test_load_from_raw(self):
     path_to_bin = os.path.join(self.HOME, "bin")
     np_handle = NanoporeRead(os.path.abspath(self.tmp_rna_file3))
     np_handle._initialize_metadata()
     alignment_file = os.path.join(
         self.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.sam")
     saved_location = load_from_raw(np_handle, alignment_file,
                                    self.rna_model_file, path_to_bin)
     # close and reopen
     np_handle.close()
     np_handle = NanoporeRead(os.path.abspath(self.tmp_rna_file3))
     # get events and validate
     events = np.array(
         np_handle.fastFive["/Analyses/Basecall_1D_001/BaseCalled_template/Events"])
     self.assertEqual(events[0]["raw_length"], 11)
     self.assertTrue("/Analyses/Basecall_1D_001/BaseCalled_template/Fastq"
                     in np_handle.fastFive)
     self.assertEqual(saved_location, "/Analyses/Basecall_1D_001")
Example #9
0
    def check_alignments(self,
                         true_alignments,
                         reads,
                         reference,
                         kmer_length,
                         contig_name,
                         extra_args=None,
                         rna=False):
        # TODO remove this from the framework and code
        # true_alignments is intentionally clobbered so any accidental use raises ZeroDivisionError
        true_alignments = lambda x: 1 / 0

        def get_kmer(start):
            kmer = reference_sequence[start:start + kmer_length]
            if type(kmer) is str:
                return kmer
            else:
                return bytes.decode(kmer)

        input_fast5s = glob.glob(os.path.join(reads, "*.fast5"))
        assert len(input_fast5s) > 0, "Didn't find test MinION reads"
        assert os.path.isfile(reference), "Didn't find reference sequence"

        # it's this or rewrite all the relative locations of the files
        os.chdir(BIN_PATH)

        # prep command
        run_signal_align = os.path.join(BIN_PATH, "runSignalAlign")
        # removed: --debug
        alignment_command = "{runsignalalign} run2 -d={reads} --bwa_reference={ref} -smt=threeState -o={testDir} " \
                            "".format(runsignalalign=run_signal_align, reads=reads, ref=reference,
                                      testDir="./signalAlign_unittest/")
        if extra_args is not None:
            alignment_command += extra_args

        # run signalAlign
        result = call(alignment_command, shell=True, bufsize=-1)
        self.assertTrue(
            result == 0, "Error running signalAlign. Command was {}".format(
                alignment_command))

        # get alignments
        test_alignments = glob.glob(
            "./signalAlign_unittest/tempFiles_alignment/*.tsv")
        self.assertTrue(
            len(test_alignments) == len(input_fast5s),
            "Didn't make all alignments: got {got}, should be {should}".format(
                got=len(test_alignments), should=len(input_fast5s)))

        # prep for verification
        reference_sequence = getFastaDictionary(reference)[contig_name]
        alignment2events = dict()
        for fast5 in input_fast5s:
            with closing(NanoporeRead(fast5, initialize=True)) as read:
                event_count = len(read.get_template_events())
                read_id = read.read_label
                self.assertTrue(
                    event_count > 0,
                    "Got no events for fast5 {} with read_id {}".format(
                        fast5, read_id))
                for alignment in test_alignments:
                    if os.path.basename(alignment).startswith(read_id):
                        self.assertTrue(
                            alignment not in alignment2events,
                            "Fast5 {} matched read_id {} with multiple output alignments"
                            .format(fast5, read_id))
                        alignment2events[alignment] = event_count

        for alignment in test_alignments:
            alignment_file = alignment.split("/")[-1]
            # expected = parse_alignment_full(os.path.join(true_alignments, alignment_file))
            obs = parse_alignment_full(alignment)
            for row in obs.itertuples():
                ref_pos = row[1]
                obs_kmer = row[2]
                strand = row[3]
                exp_kmer = get_kmer(ref_pos)
                if rna:
                    exp_kmer = exp_kmer[::-1]
                self.assertEqual(
                    obs_kmer,
                    exp_kmer,
                    msg="kmer at index {idx} on strand {strand} is {obs} "
                    "should be {exp}, file {f}".format(idx=ref_pos,
                                                       strand=strand,
                                                       obs=obs_kmer,
                                                       exp=exp_kmer,
                                                       f=alignment))
            signal_align_event_count = len(obs)
            initial_event_count = alignment2events[alignment]
            self.assertTrue(
                signal_align_event_count >= initial_event_count,
                "SignalAlign produced {} events, less than initial count {}".
                format(signal_align_event_count, initial_event_count))
            # the 3x upper bound is a heuristic (magic number)
            self.assertTrue(
                signal_align_event_count <= initial_event_count * 3,
                "SignalAlign produced {} events, more than 3x the initial count {}"
                .format(signal_align_event_count, initial_event_count))
Example #10
0
    def run(self):
        print("[SignalAlignment.run] INFO: Starting on {read}".format(
            read=self.in_fast5))
        if self.get_expectations:
            assert self.in_templateHmm is not None, "Need template HMM files for model training"
            if self.twoD_chemistry:
                assert self.in_complementHmm is not None, "Need complement HMM files for model training"
        if not os.path.isfile(self.in_fast5):
            print("[SignalAlignment.run] ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        # prep
        self.openTempFolder("tempFiles_%s" % self.read_name)
        if self.twoD_chemistry:
            npRead = NanoporeRead2D(fast_five_file=self.in_fast5,
                                    event_table=self.event_table,
                                    initialize=True)
        else:
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  event_table=self.event_table,
                                  initialize=True)
        # TODO: need to validate / generate events and the nucleotide read

        # read label
        read_label = npRead.read_label  # use this to identify the read throughout
        self.read_label = read_label

        # nanopore read (event table, etc)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        if not (self.check_for_temp_file_existance
                and os.path.isfile(npRead_)):
            # TODO is this totally f****d for RNA because of 3'-5' mapping?
            fH = open(npRead_, "w")
            ok = npRead.Write(out_file=fH, initialize=True)
            fH.close()
            if not ok:
                self.failStop(
                    "[SignalAlignment.run] File: %s did not pass initial checks"
                    % self.read_name, npRead)
                return False

        # nucleotide read
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        ok = self.write_nucleotide_read(npRead, read_fasta_)
        if not ok:
            print(
                "[SignalAlignment.run] Failed to write nucleotide read.  Continuing execution."
            )

        # alignment info
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        strand = None
        reference_name = None
        if not (self.check_for_temp_file_existance
                and os.path.isfile(cigar_file_)):

            # need guide alignment to generate cigar file
            guide_alignment = None

            # get from alignment file
            if self.alignment_file is not None:
                guide_alignment = getGuideAlignmentFromAlignmentFile(
                    self.alignment_file, read_name=read_label)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} not found in {}".format(
                            read_label, self.alignment_file))

            # get from bwa
            if guide_alignment is None and self.bwa_reference is not None:
                guide_alignment = generateGuideAlignment(
                    reference_fasta=self.bwa_reference,
                    query=read_fasta_,
                    temp_sam_path=temp_samfile_,
                    target_regions=self.target_regions)
                if guide_alignment is None:
                    print(
                        "[SignalAlignment.run] read {} could not be aligned with BWA"
                        .format(read_label))

            # could not map
            if guide_alignment is None:
                self.failStop(
                    "[SignalAlignment.run] ERROR getting guide alignment",
                    npRead)
                return False

            # ensure valid
            if not guide_alignment.validate():
                self.failStop(
                    "[SignalAlignment.run] ERROR invalid guide alignment",
                    npRead)
                return False
            strand = guide_alignment.strand
            reference_name = guide_alignment.reference_name

            # write cigar to file
            cig_handle = open(cigar_file_, "w")
            cig_handle.write(guide_alignment.cigar + "\n")
            cig_handle.close()

        # otherwise, get strand from file
        else:
            strand, reference_name = getInfoFromCigarFile(cigar_file_)

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
        # forward strand
        if strand == "+":
            if self.output_format == "full":
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".forward.tsv")
            elif self.output_format == "variantCaller":
                posteriors_file_path = os.path.join(
                    self.destination, read_label + model_label + ".tsv")
            else:
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".assignments.tsv")

        # backward strand
        elif strand == "-":
            if self.output_format == "full":
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".backward.tsv")
            elif self.output_format == "variantCaller":
                posteriors_file_path = os.path.join(
                    self.destination, read_label + model_label + ".tsv")
            else:
                posteriors_file_path = os.path.join(
                    self.destination,
                    read_label + model_label + ".assignments.tsv")

        # sanity check
        else:
            self.failStop(
                "[SignalAlignment.run] ERROR Unexpected strand {}".format(
                    strand), npRead)
            return False

        # flags

        # input (match) models
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(
                strand="template", version=npRead.version)
        if self.twoD_chemistry and self.in_complementHmm is None:
            pop1_complement = npRead.complement_model_id == "complement_median68pA_pop1.model"
            self.in_complementHmm = defaultModelFromVersion(
                strand="complement",
                version=npRead.version,
                pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run] ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalAlignment.run] NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm))

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(
                b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in list(self.output_formats.keys()):
            self.failStop(
                "[SignalAlignment.run] ERROR illegal output format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        # twoD flag
        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""

        # commands
        if self.get_expectations:
            template_expectations_file_path = os.path.join(
                self.destination, read_label + ".template.expectations.tsv")
            complement_expectations_file_path = os.path.join(
                self.destination, read_label + ".complement.expectations.tsv")
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=self.path_to_signalMachine, model=stateMachineType_flag,
                            cigarFile=cigar_file_,
                            npRead=npRead_, readLabel=read_label, td=twoD_flag,
                            templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                            complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                            c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, degen=degenerate_flag, sparse=out_fmt, seq_name=reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
        else:
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=self.path_to_signalMachine, model=stateMachineType_flag, sparse=out_fmt,
                            cigarFile=cigar_file_,
                            readLabel=read_label, npRead=npRead_, td=twoD_flag,
                            t_model=template_model_flag, c_model=complement_model_flag,
                            posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag, seq_name=reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

        # run
        print("[SignalAlignment.run] running command: ", command, end="\n")
        try:
            command = command.split()

            if self.track_memory_usage:
                mem_command = ['/usr/bin/time', '-f', '\\nDEBUG_MAX_MEM:%M\\n']
                print(
                    "[SignalAlignment.run] Prepending command to track mem usage: {}"
                    .format(mem_command))
                mem_command.extend(command)
                command = mem_command

            output = subprocess.check_output(command, stderr=subprocess.STDOUT)
            output = str(output).split("\\n")
            for line in output:
                print("[SignalAlignment.run]    {}: {}".format(
                    read_label, line))
                if line.startswith("DEBUG_MAX_MEM"):
                    self.max_memory_usage_kb = int(line.split(":")[1])

        except Exception as e:
            print(
                "[SignalAlignment.run] exception ({}) running signalAlign: {}".
                format(type(e), e))
            raise e

        # save to fast5 file (if appropriate)
        if self.embed:
            print("[SignalAlignment.run] embedding into Fast5 ")

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, "There is no path in Fast5 file: {}".format(
                "/Analyses/SignalAlign_00{}")
            output_path = npRead._join_path(signal_align_path,
                                            self.output_format)
            npRead.write_data(data, output_path)

            # Todo add attributes to signalalign output
            if self.output_format == "full":
                print(
                    "[SignalAlignment.run] writing maximum expected alignment "
                )
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path,
                                             "MEA_alignment_labels")
                events = npRead.get_template_events()
                if events:
                    if strand == "-":
                        minus = True
                    else:
                        minus = False
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=minus,
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                    sam_string = str()
                    if os.path.isfile(temp_samfile_):
                        with open(temp_samfile_, 'r') as test:
                            for line in test:
                                sam_string += line
                    sam_path = npRead._join_path(signal_align_path, "sam")
                    # print(sam_string)
                    npRead.write_data(data=sam_string,
                                      location=sam_path,
                                      compression=None)

        # self.temp_folder.remove_folder()
        return True
Example #11
def fast5_file_organization_service(work_queue,
                                    done_queue,
                                    output_dir_count,
                                    output_base,
                                    output_index_base,
                                    copy_files,
                                    service_name="fast5_file_organization"):
    # prep
    total_handled = 0
    failure_count = 0
    total_reads = 0
    name = current_process().name
    index_files = {}

    # catch overall exceptions
    try:

        # each thread and outputdir gets its own index file (start header for all of them)
        for i in range(output_dir_count):
            index_file = "{}{}_{}.tsv".format(output_index_base, i, name)
            write_header = not os.path.isfile(index_file)
            index_files[i] = open(index_file, 'a')
            if write_header:
                index_files[i].write("##{}:{}\n".format(
                    FAST5_ROOT, get_output_directory(output_base, i)))
                index_files[i].write("#{}\t{}\t{}\n".format(
                    FAST5_LOCATION, READ_ID, RUN_NAME))
        assert len(
            index_files
        ) == output_dir_count, "unexpected count of index files {} (expected {})".format(
            len(index_files), output_dir_count)

        idx = -1
        for f in iter(work_queue.get, 'STOP'):
            # place files round-robin across the output directories
            idx = (idx + 1) % output_dir_count

            try:
                # sanity check
                assert 0 <= idx < output_dir_count

                # file organization
                destination_dir = get_output_directory(output_base, idx)
                source = f[FAST5_SRC_LOCATION]
                filename = os.path.basename(source)
                destination = os.path.join(destination_dir, filename)
                action = shutil.move
                if copy_files: action = shutil.copy

                # fast5 organization
                read = NanoporeRead(source)
                if not read._initialize_metadata():
                    failure_count += 1
                    continue
                read_id = read.read_label
                run_id = read.run_id
                assert None not in [
                    read_id, run_id
                ], "Missing read or run id for {}".format(source)

                # move or copy the file
                action(source, destination)

                # write the contents to the index
                index_files[idx].write("{}\t{}\t{}\n".format(
                    destination, read_id, run_id))

            except Exception as e:
                # get error and log it
                message = "{}:{}".format(type(e), str(e))
                error = "{} '{}' failed with: {}".format(
                    service_name,
                    current_process().name, message)
                print("[{}] ".format(service_name) + error)
                done_queue.put(error)
                failure_count += 1

            finally:
                # increment total handling
                total_handled += 1

    except Exception as e:
        # get error and log it
        message = "{}:{}".format(type(e), str(e))
        error = "{} '{}' critically failed with: {}".format(
            service_name,
            current_process().name, message)
        print("[{}] ".format(service_name) + error)
        done_queue.put(error)

    finally:
        # close all index files
        for index_file in index_files.values():
            if index_file is not None: index_file.close()

        # logging and final reporting
        print("[%s] '%s' completed %d calls with %d failures" %
              (service_name, current_process().name, total_handled,
               failure_count))
        done_queue.put("{}:{}".format(TOTAL_KEY, total_handled))
        done_queue.put("{}:{}".format(FAILURE_KEY, failure_count))
Example #12
0
def get_alignment_summary_info(fast5s, alignment_file, pass_threshold=7, gap_size=10, verbose=False,
                               max_reads=100, number=0):
    """Filter fast5 files based on a quality threhsold and if there is an alignment"""
    # collect for every read
    fast5_dict = defaultdict()
    # loop through fast5s
    for fast5_path in fast5s:
        assert os.path.exists(fast5_path), "fast5 path does not exist: {}".format(fast5_path)
        f5h = NanoporeRead(fast5_path)
        f5h._initialize_metadata()
        read_name = f5h.read_label
        fast5_dict[read_name] = fast5_path
    print("Created read_id to fast5_path mapping")
    # summary data stored here
    mapped_reads = get_summary_info_table(list(fast5_dict.keys()))
    # grab aligned segment
    seen_counter = 0
    reads_seen = set()
    print("first_len reads_seen: {}".format(len(reads_seen)), file=sys.stderr)

    with closing(pysam.AlignmentFile(alignment_file, 'rb' if alignment_file.endswith("bam") else 'r')) as aln:
        for aligned_segment in aln.fetch(until_eof=True):
            if seen_counter > max_reads:
                break
            try:
                print("reads_seen: {}".format(len(reads_seen)), file=sys.stderr)
                read_name = aligned_segment.qname.split("_")[0]
                fast5_path = fast5_dict[read_name]
                if read_name not in reads_seen:
                    reads_seen |= {read_name}
                    seen_counter += 1
                    mapped_reads["seen"][read_name] = 1
                    print(fast5_path)
                    cl_handle = CreateLabels(fast5_path, kmer_index=2)
                    seq_start_time = cl_handle.raw_attributes['start_time']
                    q_score_average = 0
                    if aligned_segment.query_qualities is None:
                        print("Alignment done with fasta instead of fastq so read qualities will not be reported")
                    else:
                        q_score_average = np.mean(aligned_segment.query_qualities)

                    mapped_reads["q_score_average"][read_name] = q_score_average
                    mapped_reads["seq_start_time"][read_name] = seq_start_time

                if aligned_segment.is_secondary or aligned_segment.is_unmapped \
                        or aligned_segment.is_supplementary or aligned_segment.has_tag("SA") \
                        or q_score_average < pass_threshold:

                    if aligned_segment.is_secondary:
                        mapped_reads["num_secondary_mappings"][read_name] += 1
                    if aligned_segment.is_unmapped:
                        mapped_reads["no_mapping"][read_name] = 1
                    if aligned_segment.is_supplementary or aligned_segment.has_tag("SA"):
                        mapped_reads["chimera_mapping"][read_name] += 1
                else:
                    mapped_reads["map_q"][read_name] = aligned_segment.mapq

                    soft_clipped_percentage = \
                        1 - float(len(aligned_segment.query_alignment_sequence)) / len(aligned_segment.query_sequence)
                    mapped_reads["soft_clipped_percentage"][read_name] = soft_clipped_percentage

                    handle = AlignmentSegmentWrapper(aligned_segment)
                    handle.initialize()

                    accuracy = handle.alignment_accuracy()
                    mapped_reads["basecalled_accuracy"][read_name] = accuracy
                    try:
                        mea = cl_handle.add_mea_labels(number=int(number))
                        sa_full = cl_handle.add_signal_align_predictions(number=int(number), add_basecall=True)
                        all_basecall_data = []
                        for name, basecall_data in cl_handle.aligned_signal.prediction.items():
                            if "guide" in name:
                                all_basecall_data.extend(basecall_data)

                        alignment_summary = analyze_event_skips(mea, sa_full, all_basecall_data, generate_plot=False)
                        flagged_gaps_summary = flag_large_gaps(alignment_summary, gap_size, verbose=verbose)
                        counter = 0
                        total_distance = 0
                        for gap in flagged_gaps_summary:
                            if gap["mea_peak_distance"] > 10:
                                counter += 1
                                total_distance += gap["mea_peak_distance"]
                        if counter > 0:
                            mapped_reads["num_flagged_gaps"][read_name] = counter
                            mapped_reads["avg_flagged_gap_size"][read_name] = float(total_distance) / counter

                        if mapped_reads["q_score_average"][read_name] > pass_threshold:
                            mapped_reads["pass"][read_name] = 1

                    except KeyError:
                        mapped_reads["other_errors"][read_name] = 1
            except Exception as e:
                print(e, file=sys.stderr)

        return mapped_reads[mapped_reads["seen"] == 1]
Example #13
0
def event_detection(work_queue,
                    done_queue,
                    alignment_file,
                    model_file_location,
                    event_detection_strategy=None,
                    event_detection_params=None,
                    tmp_directory=None,
                    write_failed_alignments=True,
                    service_name="event_detection"):
    # prep
    total_handled = 0
    failure_count = 0

    # catch overall exceptions
    try:
        for tmp in iter(work_queue.get, 'STOP'):
            # get data from iterator
            fast5, read_id = tmp['fast5']
            np_handle = None

            # catch exceptions on each element
            try:
                np_handle = NanoporeRead(fast5, initialize=False)
                success = load_from_raw(
                    np_handle,
                    alignment_file,
                    model_file_location,
                    write_failed_alignments=write_failed_alignments)
                if not success:
                    raise Exception(
                        "load_from_raw failed on read {} in {}".format(
                            read_id, fast5))

            except Exception as e:
                # get error and log it
                message = "{}:{}".format(type(e), str(e))
                error = "{} '{}' failed with: {}".format(
                    service_name,
                    current_process().name, message)
                print("[{}] ".format(service_name) + error)
                done_queue.put(error)
                failure_count += 1

            finally:
                if np_handle is not None: np_handle.close()

            # increment total handling
            total_handled += 1

    except Exception as e:
        # get error and log it
        message = "{}:{}".format(type(e), str(e))
        error = "{} '{}' critically failed with: {}".format(
            service_name,
            current_process().name, message)
        print("[{}] ".format(service_name) + error)
        done_queue.put(error)

    finally:
        # logging and final reporting
        print("[%s] '%s' completed %d calls with %d failures" %
              (service_name, current_process().name, total_handled,
               failure_count))
        done_queue.put("{}:{}".format(TOTAL_KEY, total_handled))
        done_queue.put("{}:{}".format(FAILURE_KEY, failure_count))
Example #14
0
    def run(self, get_expectations=False):
        print("[SignalAlignment.run]INFO: Starting on {read}".format(
            read=self.in_fast5),
              file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None,\
                "Need HMM files for model training"
        # file checks
        if os.path.isfile(self.in_fast5) is False:
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry)
        fH = open(npRead_, "w")
        ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        fH.close()
        if not ok:
            self.failStop(
                "[SignalAlignment.run]File: %s did not pass initial checks" %
                self.read_name, npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(
                nanopore_read=npRead, twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                               oned_read_path=read_fasta_)
            pop1_complement = None

        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(
            bwa_index=self.bwa_index,
            query=read_fasta_,
            temp_sam_path=temp_samfile_,
            target_regions=self.target_regions)
        ok = guide_alignment.validate(self.reference_map.keys())
        if not ok:
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                          npRead)
            return False

        cig_handle = open(cigar_file_, "w")
        cig_handle.write(guide_alignment.cigar + "\n")
        cig_handle.close()

        # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
        posteriors_file_path = ''
        # forward strand
        if guide_alignment.strand == "+":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".forward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # backward strand
        if guide_alignment.strand == "-":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".backward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # flags

        # input (match) models
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                          version=version)
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.in_complementHmm = defaultModelFromVersion(
                    strand="complement",
                    version=version,
                    pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalALignment.run]NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm),
            file=sys.stderr)

        # reference sequences
        assert self.reference_map[
            guide_alignment.reference_name]["forward"] is not None
        assert self.reference_map[
            guide_alignment.reference_name]["backward"] is not None
        forward_reference = self.reference_map[
            guide_alignment.reference_name]["forward"]
        backward_reference = self.reference_map[
            guide_alignment.reference_name]["backward"]
        assert os.path.isfile(forward_reference)
        assert os.path.isfile(backward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=forward_reference)
        backward_ref_flag = "-b {b_ref} ".format(b_ref=backward_reference)

        # input HDPs
        if (self.in_templateHdp is not None) or (self.in_complementHdp
                                                 is not None):
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats.keys():
            self.failStop(
                "[SignalAlignment.run]ERROR illegal outpur format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""
        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations}"\
                .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                        f_ref=forward_ref_flag, b_ref=backward_ref_flag, cigarFile=cigar_file_,
                        npRead=npRead_, readLabel=read_label, td=twoD_flag,
                        templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                        complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                        c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                        trim=trim_flag, degen=degenerate_flag, sparse=out_fmt)
        else:
            print("read_label", read_label)
            command = \
                "{vA} {td} {degen}{sparse}{model}{f_ref}{b_ref} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel}"\
                .format(vA=path_to_signalAlign, model=stateMachineType_flag, sparse=out_fmt,
                        f_ref=forward_ref_flag, b_ref=backward_ref_flag, cigarFile=cigar_file_,
                        readLabel=read_label, npRead=npRead_, td=twoD_flag,
                        t_model=template_model_flag, c_model=complement_model_flag,
                        posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                        trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag)

        # run
        print("signalAlign - running command: ",
              command,
              end="\n",
              file=sys.stderr)
        os.system(command)
        self.temp_folder.remove_folder()
        return True
Example #15
0
    def run(self, get_expectations=False):
        print("[SignalAlignment.run]INFO: Starting on {read}".format(
            read=self.in_fast5),
              file=sys.stderr)
        if get_expectations:
            assert self.in_templateHmm is not None and self.in_complementHmm is not None, \
                "Need HMM files for model training"
        # file checks
        if os.path.isfile(self.in_fast5) is False:
            print("[SignalAlignment.run]ERROR: Did not find .fast5 at{file}".
                  format(file=self.in_fast5))
            return False

        self.openTempFolder("tempFiles_%s" % self.read_name)
        npRead_ = self.addTempFilePath("temp_%s.npRead" % self.read_name)
        # TODO is this totally f****d for RNA because of 3'-5' mapping?
        npRead = NanoporeRead(fast_five_file=self.in_fast5,
                              twoD=self.twoD_chemistry,
                              event_table=self.event_table)
        fH = open(npRead_, "w")
        ok = npRead.Write(parent_job=None, out_file=fH, initialize=True)
        fH.close()
        if not ok:
            self.failStop(
                "[SignalAlignment.run]File: %s did not pass initial checks" %
                self.read_name, npRead)
            return False

        read_label = npRead.read_label  # use this to identify the read throughout
        read_fasta_ = self.addTempFilePath("temp_seq_%s.fa" % read_label)
        temp_samfile_ = self.addTempFilePath("temp_sam_file_%s.sam" %
                                             read_label)
        cigar_file_ = self.addTempFilePath("temp_cigar_%s.txt" % read_label)
        if self.twoD_chemistry:
            ok, version, pop1_complement = self.prepare_twod(
                nanopore_read=npRead, twod_read_path=read_fasta_)
        else:
            ok, version, _ = self.prepare_oned(nanopore_read=npRead,
                                               oned_read_path=read_fasta_)
            pop1_complement = None
        # add an indicator for the model being used
        if self.stateMachineType == "threeState":
            model_label = ".sm"
            stateMachineType_flag = ""
        elif self.stateMachineType == "threeStateHdp":
            model_label = ".sm3Hdp"
            stateMachineType_flag = "--sm3Hdp "
            if self.twoD_chemistry:
                assert (self.in_templateHdp
                        is not None) and (self.in_complementHdp
                                          is not None), "Need to provide HDPs"
            else:
                assert self.in_templateHdp is not None, "Need to provide Template HDP"
        else:  # make invalid stateMachine control?
            model_label = ".sm"
            stateMachineType_flag = ""

        guide_alignment = generateGuideAlignment(
            bwa_index=self.bwa_index,
            query=read_fasta_,
            temp_sam_path=temp_samfile_,
            target_regions=self.target_regions)
        # ok = guide_alignment.validate(list(self.reference_map.keys()))
        ok = guide_alignment.validate()

        if not ok:
            self.failStop("[SignalAlignment.run]ERROR getting guide alignment",
                          npRead)
            return False

        cig_handle = open(cigar_file_, "w")
        cig_handle.write(guide_alignment.cigar + "\n")
        cig_handle.close()

        # next section makes the output file name with the format: /directory/for/files/file.model.orientation.tsv
        posteriors_file_path = ''
        # forward strand
        if guide_alignment.strand == "+":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".forward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # backward strand
        if guide_alignment.strand == "-":
            if self.output_format == "full":
                posteriors_file_path = self.destination + read_label + model_label + ".backward.tsv"
            elif self.output_format == "variantCaller":
                posteriors_file_path = self.destination + read_label + model_label + ".tsv"
            else:
                posteriors_file_path = self.destination + read_label + model_label + ".assignments"

        # Alignment/Expectations routine
        path_to_signalAlign = "./signalMachine"

        # flags

        # input (match) models
        if self.in_templateHmm is None:
            self.in_templateHmm = defaultModelFromVersion(strand="template",
                                                          version=version)
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.in_complementHmm = defaultModelFromVersion(
                    strand="complement",
                    version=version,
                    pop1_complement=pop1_complement)

        assert self.in_templateHmm is not None
        if self.twoD_chemistry:
            if self.in_complementHmm is None:
                self.failStop(
                    "[SignalAlignment.run]ERROR Need to have complement HMM for 2D analysis",
                    npRead)
                return False

        template_model_flag = "-T {} ".format(self.in_templateHmm)
        if self.twoD_chemistry:
            complement_model_flag = "-C {} ".format(self.in_complementHmm)
        else:
            complement_model_flag = ""

        print(
            "[SignalAlignment.run]NOTICE: template model {t} complement model {c}"
            "".format(t=self.in_templateHmm, c=self.in_complementHmm),
            file=sys.stderr)

        # reference sequences
        assert os.path.isfile(self.forward_reference)
        forward_ref_flag = "-f {f_ref} ".format(f_ref=self.forward_reference)
        if self.backward_reference:
            assert os.path.isfile(self.backward_reference)
            backward_ref_flag = "-b {b_ref} ".format(
                b_ref=self.backward_reference)
        else:
            backward_ref_flag = ""

        # input HDPs (only pass the -v/-w flags for HDPs that were actually provided)
        if self.in_templateHdp is not None:
            hdp_flags = "-v {tHdp_loc} ".format(tHdp_loc=self.in_templateHdp)
            if self.twoD_chemistry and self.in_complementHdp is not None:
                hdp_flags += "-w {cHdp_loc} ".format(
                    cHdp_loc=self.in_complementHdp)
        else:
            hdp_flags = ""

        # threshold
        if self.threshold is not None:
            threshold_flag = "-D {threshold} ".format(threshold=self.threshold)
        else:
            threshold_flag = ""

        # diagonal expansion
        if self.diagonal_expansion is not None:
            diag_expansion_flag = "-x {expansion} ".format(
                expansion=self.diagonal_expansion)
        else:
            diag_expansion_flag = ""

        # constraint trim
        if self.constraint_trim is not None:
            trim_flag = "-m {trim} ".format(trim=self.constraint_trim)
        else:
            trim_flag = ""

        # output format
        if self.output_format not in self.output_formats:
            self.failStop(
                "[SignalAlignment.run]ERROR illegal output format selected %s"
                % self.output_format)
            return False
        out_fmt = "-s {fmt} ".format(
            fmt=self.output_formats[self.output_format])

        # degenerate nucleotide information
        if self.degenerate is not None:
            degenerate_flag = "-o {} ".format(self.degenerate)
        else:
            degenerate_flag = ""

        if self.twoD_chemistry:
            twoD_flag = "--twoD"
        else:
            twoD_flag = ""
        # commands
        if get_expectations:
            template_expectations_file_path = self.destination + read_label + ".template.expectations"
            complement_expectations_file_path = self.destination + read_label + ".complement.expectations"

            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} {hdp}-L {readLabel} -p {cigarFile} " \
                "-t {templateExpectations} -c {complementExpectations} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=path_to_signalAlign, model=stateMachineType_flag,
                            cigarFile=cigar_file_,
                            npRead=npRead_, readLabel=read_label, td=twoD_flag,
                            templateExpectations=template_expectations_file_path, hdp=hdp_flags,
                            complementExpectations=complement_expectations_file_path, t_model=template_model_flag,
                            c_model=complement_model_flag, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, degen=degenerate_flag, sparse=out_fmt, seq_name=guide_alignment.reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)
        else:
            command = \
                "{vA} {td} {degen}{sparse}{model} -q {npRead} " \
                "{t_model}{c_model}{thresh}{expansion}{trim} -p {cigarFile} " \
                "-u {posteriors} {hdp}-L {readLabel} -n {seq_name} {f_ref_fa} {b_ref_fa}" \
                    .format(vA=path_to_signalAlign, model=stateMachineType_flag, sparse=out_fmt,
                            cigarFile=cigar_file_,
                            readLabel=read_label, npRead=npRead_, td=twoD_flag,
                            t_model=template_model_flag, c_model=complement_model_flag,
                            posteriors=posteriors_file_path, thresh=threshold_flag, expansion=diag_expansion_flag,
                            trim=trim_flag, hdp=hdp_flags, degen=degenerate_flag, seq_name=guide_alignment.reference_name,
                            f_ref_fa=forward_ref_flag, b_ref_fa=backward_ref_flag)

        # run
        print("signalAlign - running command: ",
              command,
              end="\n",
              file=sys.stderr)
        os.system(command)
        if self.embed:
            print("signalAlign - embedding into Fast5 ", file=sys.stderr)

            data = self.read_in_signal_align_tsv(posteriors_file_path,
                                                 file_type=self.output_format)
            npRead = NanoporeRead(fast_five_file=self.in_fast5,
                                  twoD=self.twoD_chemistry,
                                  event_table=self.event_table)
            npRead.Initialize(None)
            signal_align_path = npRead.get_latest_basecall_edition(
                "/Analyses/SignalAlign_00{}", new=False)
            assert signal_align_path, \
                "No SignalAlign analysis path found in Fast5 file: {}".format(self.in_fast5)
            output_path = npRead._join_path(signal_align_path,
                                            self.output_format)
            npRead.write_data(data, output_path)

            # Todo add attributes to signalalign output
            if self.output_format == "full":
                print("signalAlign - writing maximum expected alignment ",
                      file=sys.stderr)
                alignment = mea_alignment_from_signal_align(None, events=data)
                mae_path = npRead._join_path(signal_align_path,
                                             "MEA_alignment_labels")
                events = npRead.get_template_events()
                if events:
                    minus = guide_alignment.strand == "-"
                    labels = match_events_with_signalalign(
                        sa_events=alignment,
                        event_detections=np.asanyarray(npRead.template_events),
                        minus=minus,
                        rna=npRead.is_read_rna())
                    npRead.write_data(labels, mae_path)
                    # copy the guide alignment SAM into the Fast5 alongside the MEA labels
                    with open(temp_samfile_, 'r') as sam_handle:
                        sam_string = sam_handle.read()
                    sam_path = npRead._join_path(signal_align_path, "sam")
                    npRead.write_data(data=sam_string,
                                      location=sam_path,
                                      compression=None)

        # self.temp_folder.remove_folder()
        return True
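
The run() method above builds its signalMachine call by concatenating optional flag fragments (each rendered as "-X value " when a setting is present, or an empty string when it is not) and then handing the finished string to os.system. The short sketch below reproduces that flag-assembly pattern on its own; the helper name build_flag, the example model path, and the example values are hypothetical and not part of signalAlign.

def build_flag(flag, value):
    # render an optional "-x value " fragment, or "" when the setting is unset (hypothetical helper)
    return "{} {} ".format(flag, value) if value is not None else ""

# hypothetical stand-ins for SignalAlignment attributes
template_hmm = "models/template.model"
complement_hmm = None          # 1D chemistry, so no complement model flag
threshold = 0.01

command = "./signalMachine -q read.npRead {t}{c}{d}-p cigar.txt".format(
    t=build_flag("-T", template_hmm),
    c=build_flag("-C", complement_hmm),
    d=build_flag("-D", threshold))
print(command)

Splitting the finished command into an argument list and running it with subprocess.run(command.split(), check=True) would raise on a non-zero exit status, which os.system only reports through its return value.
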
Example #16
0
    def get_data(self):
        """Calculate the normalized probability of variant for each nucleotide and across the read"""
        # final location of per position data and per read data
        data = []
        per_read_data = []
        if self.forward_mapped:
            mapping_strands = ["+", "-"]
        else:
            mapping_strands = ["-", "+"]

        if len(self.variant_data) > 0:
            kmer_len_1 = len(self.variant_data["reference_kmer"].iloc[0]) - 1
            mapping_index = 0
            for read_strand in ("t", "c"):
                read_strand_specific_data = self.variant_data[self.variant_data["strand"] == read_strand]
                # read_strand = read_strand.decode("utf-8")
                if len(read_strand_specific_data) == 0:
                    continue
                # get positions on strand
                positions = sorted(set(read_strand_specific_data["reference_index"]))

                if mapping_strands[mapping_index] == "-":
                    positions = positions[::-1]

                strand_read_nuc_data = [0] * len(self.variants)

                # marginalize probabilities for each position
                n_positions = 0
                for pos in positions:
                    pos_data = read_strand_specific_data[read_strand_specific_data["reference_index"] == pos]
                    if pos_data["aligned_kmer"].iloc[0][kmer_len_1] != "X":
                        continue
                    n_positions += 1
                    total_prob = 0
                    position_nuc_dict = {x: 0.0 for x in self.variants}
                    # Get total probability for each nucleotide
                    for nuc in self.variants:
                        # kmer_len_1 = pos_data["reference_kmer"].iloc[0].find("X")
                        # print(pos_data["reference_kmer"].iloc[0])
                        nuc_data = pos_data[[nuc == kmer[kmer_len_1] for kmer in pos_data["path_kmer"]]]
                        nuc_prob = sum(nuc_data["posterior_probability"])
                        total_prob += nuc_prob
                        position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                    # normalize probabilities over each position
                    nuc_data = [0] * len(self.variants)
                    for index, nuc in enumerate(self.variants):
                        assert total_prob > 0, "Check 'variants' parameter. There seem to be no kmers with those " \
                                               "variant characters"
                        nuc_data[index] = position_nuc_dict[nuc] / total_prob
                        strand_read_nuc_data[index] += nuc_data[index]
                    data.append(merge_lists([[self.read_name, self.contig, pos, read_strand,
                                              mapping_strands[mapping_index]], nuc_data]))
                if n_positions > 0:
                    per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand,
                                                       mapping_strands[mapping_index], n_positions],
                                                      [prob / n_positions for prob in strand_read_nuc_data]]))
                mapping_index += 1
            self.position_probs = pd.DataFrame(data, columns=self.columns)
            self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
            self.has_data = True

        else:
            self.has_data = False

        return self.position_probs
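
get_data() marginalizes the event-level posterior probabilities at each tracked position, normalizes them across the candidate variants so the per-position values sum to one, and averages those vectors to produce the per-read call. The toy sketch below reproduces just the per-position normalization; the column names mirror the TSV fields used above, but the posterior values are invented for illustration.

import pandas as pd

# invented event-level posteriors at one reference position, variants A and C
pos_data = pd.DataFrame({
    "path_kmer": ["ACGTA", "ACGTC", "ACGTA", "ACGTC"],
    "posterior_probability": [0.60, 0.10, 0.25, 0.05],
})
variants = ["A", "C"]
kmer_len_1 = len(pos_data["path_kmer"].iloc[0]) - 1   # index of the variant base within the kmer

totals = {}
for nuc in variants:
    nuc_rows = pos_data[[kmer[kmer_len_1] == nuc for kmer in pos_data["path_kmer"]]]
    totals[nuc] = nuc_rows["posterior_probability"].sum()

total_prob = sum(totals.values())
normalized = {nuc: totals[nuc] / total_prob for nuc in variants}
print(normalized)   # roughly {'A': 0.85, 'C': 0.15}

Averaging these normalized vectors over every qualifying position, as the method does with strand_read_nuc_data, yields the per-read probabilities reported in per_read_calls.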