Example #1
    def __init__(self, full_data, variants, read_name, forward_mapped):
        """Marginalize over all posterior probabilities to give a per position read probability
        :param variants: bases to track probabilities
        :param full_data: path to full tsv file

                             ['contig', 'reference_index',
                              'reference_kmer', 'read_file',
                              'strand', 'event_index',
                              'event_mean', 'event_noise',
                              'event_duration', 'aligned_kmer',
                              'scaled_mean_current', 'scaled_noise',
                              'posterior_probability', 'descaled_event_mean',
                              'ont_model_mean', 'path_kmer']
        """
        self.read_name = read_name
        self.full_data = full_data
        self.variant_data = self.full_data[["X" in kmer for kmer in self.full_data["reference_kmer"]]]
        self.variants = sorted(variants)
        self.forward_mapped = forward_mapped
        self.columns = merge_lists([['read_name', 'contig', 'position', 'strand', 'forward_mapped'],
                                    list(self.variants)])
        self.contig = NanoporeRead.bytes_to_string(self.full_data["contig"][0])
        self.position_probs = pd.DataFrame()
        self.has_data = False
        self.per_read_calls = pd.DataFrame()
        self.per_read_columns = merge_lists([['read_name', 'contig', 'strand', "forward_mapped", "n_sites"],
                                             list(self.variants)])
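
The column list in the docstring above also shows how the full signalAlign output could be loaded before being passed to this constructor. A minimal loading sketch (not from the source), assuming a headerless tab-separated file; note that the calling code elsewhere compares byte strings such as b"t", so the project's own loader may differ:

import pandas as pd

# Hypothetical loader for the full signalAlign output described in the docstring above.
FULL_TSV_COLUMNS = ['contig', 'reference_index', 'reference_kmer', 'read_file',
                    'strand', 'event_index', 'event_mean', 'event_noise',
                    'event_duration', 'aligned_kmer', 'scaled_mean_current', 'scaled_noise',
                    'posterior_probability', 'descaled_event_mean', 'ont_model_mean', 'path_kmer']

full_data = pd.read_csv("some_read.full.tsv", sep="\t", names=FULL_TSV_COLUMNS)  # path is a placeholder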
Example #2
    def get_data(self):
        """Calculate the normalized probability of variant for each nucleotide and across the read"""
        # final location of per position data and per read data
        data = []
        per_read_data = []
        for read_strand in (b"t", b"c"):
            read_strand_specifc_data = self.variant_data[self.variant_data["strand"] == read_strand]
            read_strand = read_strand.decode("utf-8")
            if len(read_strand_specifc_data) == 0:
                continue
            for forward_mapped in set(self.variant_data["forward_mapped"]):
                mapping_strand = "-"
                if forward_mapped == b"forward":
                    mapping_strand = "+"
                strand_specifc_data = read_strand_specifc_data[read_strand_specifc_data["forward_mapped"] ==
                                                               forward_mapped]
                if len(strand_specifc_data) == 0:
                    continue
                # get positions on strand
                positions = set(strand_specifc_data["reference_position"])
                n_positions = len(positions)
                strand_read_nuc_data = [0] * len(self.variants)

                # marginalize probabilities for each position
                for pos in positions:
                    pos_data = strand_specifc_data[strand_specifc_data["reference_position"] == pos]
                    total_prob = 0
                    position_nuc_dict = {x: 0.0 for x in self.variants}
                    # Get total probability for each nucleotide
                    for nuc in set(pos_data["base"]):
                        nuc_data = pos_data[pos_data["base"] == nuc]
                        nuc_prob = sum(nuc_data["posterior_probability"])
                        total_prob += nuc_prob
                        position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                    # normalize probabilities over each position
                    nuc_data = [0] * len(self.variants)
                    for nuc in position_nuc_dict.keys():
                        index = self.variants.index(nuc)
                        nuc_data[index] = position_nuc_dict[nuc] / total_prob
                        strand_read_nuc_data[index] += nuc_data[index]

                    data.append(merge_lists([[self.read_name, self.contig, pos, read_strand, mapping_strand],
                                             nuc_data]))
                if n_positions > 0:
                    per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand, mapping_strand,
                                                       n_positions],
                                                      [prob / n_positions for prob in strand_read_nuc_data]]))

            self.position_probs = pd.DataFrame(data, columns=self.columns)
            self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
            self.has_data = True

        return self.position_probs
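
The per-position step above reduces to dividing each base's summed posterior probability by the position total, so the normalized values at a position always sum to one. A toy illustration with invented numbers:

# Toy illustration of the per-position normalization above (invented numbers, variants = "AC").
position_nuc_dict = {"A": 0.9, "C": 0.3}                  # summed posterior probability per base
total_prob = sum(position_nuc_dict.values())              # 1.2
normalized = {nuc: p / total_prob for nuc, p in position_nuc_dict.items()}
# normalized == {"A": 0.75, "C": 0.25}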
Example #3
def multiprocess_filter_reads(in_dir, alignment_file, readdb, trim=False,
                              quality_threshold=7, worker_count=1, debug=False):
    """Multiprocess for filtering reads but dont move the files
    :param in_dir: input directory with subdirectories assumed to have fast5s in them
    :param alignment_file: bam file
    :param readdb: readdb or sequence summary file
    :param trim: option to trim for x number of bases
    :param quality_threshold: quality threshold
    :param worker_count: number of workers to use
    :param debug: boolean option which will only use one process in order to fail if an error arises
    :return: True
    """
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segment
    if debug:
        best_files = []
        for sub_in_dir in get_all_sub_directories(in_dir):
            best_files.extend(filter_reads(alignment_file, readdb, [sub_in_dir],
                                           quality_threshold=quality_threshold, trim=trim))
    else:
        filter_reads_args = {"readdb": readdb, "alignment_file": alignment_file,
                             "quality_threshold": quality_threshold, "trim": trim}
        total, failure, messages, output = multithread.run_service2(
            filter_read_service2, get_all_sub_directories(in_dir),
            filter_reads_args, ["in_dir"], worker_count)
        best_files = merge_lists(output)
    return best_files
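
A usage sketch (not from the source), assuming a directory of fast5 subdirectories plus a matching BAM and readdb; all paths and the worker count below are placeholders:

# Hypothetical call; every argument value here is a placeholder.
best_files = multiprocess_filter_reads("/data/fast5_runs", "alignments.sorted.bam", "run.readdb",
                                       trim=False, quality_threshold=7, worker_count=4)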
Example #4
def match_ref_position_with_raw_start_band(aggregate_reference_position, per_event_data):
    """Match up the reference position from aggregated probability table and the per event data"""
    final_data = []
    for position in aggregate_reference_position["position"]:

        # get the raw start and the total raw length spanned by all events aligned to this position
        pos_data = per_event_data[per_event_data["reference_position"] == position]
        min_raw_start = min(pos_data["raw_start"])
        max_raw_start = max(pos_data["raw_start"])
        last_event_length = pos_data[pos_data["raw_start"] == max_raw_start]["raw_length"].iloc[0]
        total_length = max_raw_start - min_raw_start + last_event_length
        final_data.append(
            merge_lists([aggregate_reference_position[aggregate_reference_position["position"] == position].values.tolist()[0],
                         [min_raw_start, total_length]]))
    final_data = pd.DataFrame(final_data, columns=merge_lists([aggregate_reference_position.columns,
                                                               ["raw_start", "raw_length"]]))
    return final_data
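
The raw-signal span per position is simply the distance from the earliest event start to the end of the latest event. A toy check of that arithmetic with invented values:

# Toy check of the span arithmetic above (invented values).
raw_starts = [100, 140, 180]            # raw_start of each event aligned to one position
last_event_length = 25                  # raw_length of the event with the largest raw_start
min_raw_start = min(raw_starts)         # 100
total_length = max(raw_starts) - min_raw_start + last_event_length  # 180 - 100 + 25 = 105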
Example #5
 def __init__(self, variant_data, variants, read_name):
     """Marginalize over all posterior probabilities to give a per position read probability
     :param variants: bases to track probabilities
     :param variant_data: variant data
     """
     self.read_name = read_name
     self.variant_data = variant_data
     self.variants = sorted(variants)
     self.columns = merge_lists([['read_name', 'contig', 'position', 'strand', 'forward_mapped'],
                                 list(self.variants)])
     self.contig = NanoporeRead.bytes_to_string(self.variant_data["contig"][0])
     self.position_probs = pd.DataFrame()
     self.has_data = False
     self.per_read_calls = pd.DataFrame()
     self.per_read_columns = merge_lists([['read_name', 'contig', 'strand', "forward_mapped",
                                           "n_sites"], list(self.variants)])
Example #6
 def _normalize_all_data(self, all_data):
     """Helper function to normalize all probability data"""
     for strand in set(all_data["strand"]):
         strand_data = all_data[all_data["strand"] == strand]
         for contig in set(strand_data["contig"]):
             contig_data = strand_data[strand_data["contig"] == contig]
             for mapped_strand in set(contig_data["forward_mapped"]):
                 strand_mapped_data = contig_data[contig_data["forward_mapped"] == mapped_strand]
                 for position in set(strand_mapped_data["position"]):
                     position_data = strand_mapped_data[strand_mapped_data["position"] == position]
                     sum_total = sum(sum(position_data.loc[:, base]) for base in self.variants)
                     normalized_probs = [np.round(sum(position_data.loc[:, base]) / sum_total, 6) for base in
                                         self.variants]
                     yield merge_lists([[contig, position, strand, mapped_strand], normalized_probs])
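
Each yielded row is [contig, position, strand, forward_mapped] followed by one normalized probability per tracked base, so inside the same class the generator can be collected straight into a table. A sketch (not from the source), assuming self.columns is built as in the surrounding examples:

# Sketch: collect the normalized rows into a DataFrame inside the same class.
# Assumes self.columns == ['contig', 'position', 'strand', 'forward_mapped'] + sorted(variants).
aggregate_position_probs = pd.DataFrame(list(self._normalize_all_data(all_data)), columns=self.columns)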
Example #7
 def __init__(self, variant_tsv_dir, variants="ATGC", verbose=False):
     """Marginalize over all posterior probabilities to give a per position read probability
     :param variant_tsv_dir: directory of variantCaller output from signalAlign
     :param variants: bases to track probabilities
     """
     self.variant_tsv_dir = variant_tsv_dir
     self.variants = sorted(variants)
     self.columns = merge_lists([['contig', 'position', 'strand', 'forward_mapped'], list(self.variants)])
     self.variant_tsvs = list_dir(self.variant_tsv_dir, ext=".vc.tsv")
     self.aggregate_position_probs = pd.DataFrame()
     self.per_position_data = pd.DataFrame()
     self.verbose = verbose
     self.per_read_data = pd.DataFrame()
     self.has_data = self._aggregate_all_variantcalls()
Example #8
    def __init__(self, sa_full_tsv_dir, variants="ATGC", verbose=False, processes=2):
        """Marginalize over all posterior probabilities to give a per position read probability
        :param sa_full_tsv_dir: directory of full output from signalAlign
        :param variants: bases to track probabilities
        """
        self.sa_full_tsv_dir = sa_full_tsv_dir
        self.variants = sorted(variants)
        self.columns = merge_lists([['contig', 'position', 'strand', 'forward_mapped'], list(self.variants)])
        self.forward_tsvs = list_dir(self.sa_full_tsv_dir, ext=".forward.tsv")
        self.backward_tsvs = list_dir(self.sa_full_tsv_dir, ext=".backward.tsv")
        self.verbose = verbose
        self.worker_count = processes

        self.aggregate_position_probs = pd.DataFrame()
        self.per_position_data = pd.DataFrame()
        self.per_read_data = pd.DataFrame()
        self.has_data = self._multiprocess_aggregate_all_variantcalls()
Example #9
 def get_covered_kmers(positions_data1, read_name1, ref_sequence1,
                       ref_name1, strand1, ref_start1, ref_end1):
     this_positions_data = positions_data1.loc[
         (positions_data1["chr"] == ref_name1)
         & (positions_data1["strand"] == strand1) &
         (positions_data1["start"] >= ref_start1) &
         (positions_data1["start"] <= ref_end1)]
     if this_positions_data.empty:
         return None
     kmer_lists = np.vectorize(get_kmer)(ref_sequence1,
                                         this_positions_data['start'],
                                         ref_start1, strand1,
                                         this_positions_data["replace"])
     kmer_subset_lists1 = merge_lists([[
         kmer[i:i + kmer_length] for i in range(kmer_length)
         if len(kmer[i:i + kmer_length]) == kmer_length
         and set(kmer[i:i + kmer_length]) <= set(alphabet)
     ] for kmer in kmer_lists])
     return read_name1, kmer_subset_lists1
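
The get_kmer helper (defined in Example #14) returns a window of length 2*kmer_length - 1 with the substituted base at its centre, so the slicing above extracts every kmer_length-mer that overlaps the modified base and keeps only those whose characters are all in the alphabet. A toy illustration with invented values:

# Toy illustration of the overlapping-kmer slicing (invented values).
kmer_length = 3
alphabet = "ACGTM"
window = "ACMGT"                        # 2*kmer_length - 1 bases, substituted base 'M' in the centre
kmers = [window[i:i + kmer_length]
         for i in range(kmer_length)
         if len(window[i:i + kmer_length]) == kmer_length
         and set(window[i:i + kmer_length]) <= set(alphabet)]
# kmers == ['ACM', 'CMG', 'MGT'], i.e. every 3-mer covering the modified base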
Example #10
    def __init__(self,
                 samples,
                 out_file_path,
                 template=True,
                 complement=False,
                 verbose=True):
        """
        Control how each kmer/event assignment is processed given a set of samples and the parameters associated with
        each sample

        :param samples:
        :param out_file:
        :param template: generate kmers for template read strand: default: True
        :param complement: generate kmers for complement read strand: default: True
        :param min_probability: the minimum probability to use for assigning kmers
        :param verbose: option to print update statements
        """
        self.strands = []
        if template:
            self.strands.append('t')
        if complement:
            self.strands.append('c')
        assert self.strands != [], 'template or complement need to be set to True. ' \
                                   'complement: {}, template: {}'.format(complement, template)

        for sample in samples:
            assert isinstance(sample, SignalAlignSample)

        self.canonical = "ATGC"
        self.samples = samples
        self.out_file_path = out_file_path
        self.template = template
        self.complement = complement
        self.verbose = verbose
        self.master_assignment_table = \
            make_master_assignment_table(sorted(merge_lists([sample.analysis_files for sample in self.samples])))
        self.k = len(self.master_assignment_table.iloc[0]['kmer'])
        self.n_assignments = len(self.master_assignment_table)
Example #11
    def train_normal_hmm(self, transitions=True, emissions=False):
        """Train model transitions"""
        i = 0
        # start iterating
        while i < self.args.transitions_args.iterations:
            # align all the samples
            self.run_signal_align(
                get_expectations=True,
                trim=self.args.transitions_args.training_bases)
            all_sample_files = merge_lists(
                [sample.analysis_files for sample in self.samples])
            assert len(
                all_sample_files
            ) > 0, "Something failed in multithread signal alignment. We got no sample files"
            # load then normalize the expectations
            template_expectations_files = [
                x for x in all_sample_files
                if x.endswith(".template.expectations.tsv")
            ]

            if len(template_expectations_files) > 0:
                self.template_model.add_and_normalize_expectations(
                    files=template_expectations_files,
                    hmm_file=self.template_hmm_model_path,
                    update_transitions=transitions,
                    update_emissions=emissions)
            if self.two_d:
                complement_expectations_files = [
                    x for x in all_sample_files
                    if x.endswith(".complement.expectations.tsv")
                ]
                if len(complement_expectations_files) > 0:
                    self.complement_model.add_and_normalize_expectations(
                        files=complement_expectations_files,
                        hmm_file=self.complement_model_path,
                        update_transitions=transitions,
                        update_emissions=emissions)

            # log the running likelihood
            if len(self.template_model.running_likelihoods) > 0 and \
                    (self.two_d and len(self.complement_model.running_likelihoods) > 0):
                print(
                    "[trainModels_transitions] {i}| {t_likelihood}\t{c_likelihood}"
                    .format(t_likelihood=self.template_model.
                            running_likelihoods[-1],
                            c_likelihood=self.complement_model.
                            running_likelihoods[-1],
                            i=i))
                if self.args.transitions_args.test and (len(self.template_model.running_likelihoods) >= 2) and \
                        (self.two_d and len(self.complement_model.running_likelihoods) >= 2):
                    assert (self.template_model.running_likelihoods[-2] < self.template_model.running_likelihoods[
                        -1]) and \
                           (self.complement_model.running_likelihoods[-2] < self.complement_model.running_likelihoods[
                               -1]), "Testing: Likelihood error, went up"
            elif len(self.template_model.running_likelihoods) > 0:
                print("[trainModels_transitions] {i}| {t_likelihood}".format(
                    t_likelihood=self.template_model.running_likelihoods[-1],
                    i=i))
                if self.args.transitions_args.test and (len(
                        self.template_model.running_likelihoods) >= 2):
                    assert (self.template_model.running_likelihoods[-2] <
                            self.template_model.running_likelihoods[-1]
                            ), "Testing: Likelihood error, went up"

            i += 1

        print(
            "[trainModels_transitions] - finished training transitions routine"
        )
        return self.template_hmm_model_path, self.complement_hmm_model_path
Example #12
BED_PATHS = "/home/ubuntu/bisulfite_methylation_analysis/bisulfite_data"

REP1_CPG = os.path.join(BED_PATHS, "chr1_ENCFF279HCL.bed")
REP1_CHG = os.path.join(BED_PATHS, "chr1_ENCFF721BJM.bed")
REP1_CHH = os.path.join(BED_PATHS, "chr1_ENCFF448RTC.bed")
REP2_CPG = os.path.join(BED_PATHS, "chr1_ENCFF835NTC.bed")
REP2_CHG = os.path.join(BED_PATHS, "chr1_ENCFF349NNL.bed")
REP2_CHH = os.path.join(BED_PATHS, "chr1_ENCFF038HXQ.bed")

min_coverage = 10
delta = 6
nb_cpu = 10
percents = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
percents = [10, 20, 30, 40, 50, 60, 70, 80, 90]

all_percents = merge_lists([[x - 1, x, x + 1] for x in percents])

assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(OUTPUT_DIR)
assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)
assert os.path.exists(REP1_CPG), "{} does not exist".format(REP1_CPG)
assert os.path.exists(REP1_CHG), "{} does not exist".format(REP1_CHG)
assert os.path.exists(REP1_CHH), "{} does not exist".format(REP1_CHH)
assert os.path.exists(REP2_CPG), "{} does not exist".format(REP2_CPG)
assert os.path.exists(REP2_CHG), "{} does not exist".format(REP2_CHG)
assert os.path.exists(REP2_CHH), "{} does not exist".format(REP2_CHH)

# reference handler and reverse complement handler
rh = ReferenceHandler(REFERENCE)
rc = ReverseComplement()
chromosome_strings = rh.fasta.references[:25]
chromosome_data = {
Example #13
 def test_merge_lists(self):
     with captured_output() as (_, _):
         a = [[1, 2, 3], [4, 5, 6]]
         self.assertEqual(merge_lists(a), [1, 2, 3, 4, 5, 6])
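
The test pins down what merge_lists does throughout these examples: it flattens one level of nesting while preserving order. A minimal implementation consistent with the test (the library's actual version may differ):

from itertools import chain

def merge_lists(lists):
    # Flatten a list of lists by one level, preserving order (sketch consistent with the test above).
    return list(chain.from_iterable(lists))

# merge_lists([[1, 2, 3], [4, 5, 6]]) == [1, 2, 3, 4, 5, 6]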
Example #14
def main():
    args = parse_args()
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(
        args.output_dir)
    assert os.path.exists(args.bam), "{} does not exist".format(args.bam)
    assert os.path.exists(args.positions_file), "{} does not exist".format(
        args.positions_file)

    output_dir = args.output_dir
    bam = args.bam
    positions_file = args.positions_file
    reference = args.reference
    alphabet = args.alphabet
    kmer_length = args.kmer_length
    n_processes = args.threads
    # output_dir = "/home/ubuntu/mount/download/FAB39088"
    # bam = "/home/ubuntu/mount/download/FAB39088/fastq/canonical_cpg_FAB39088.2308.sorted.bam"
    # output_dir = "/home/ubuntu/mount/download/FAF01169"
    # bam = "/home/ubuntu/mount/download/FAF01169/Bham/fastq/canonical_cpg_FAF01169.2308.sorted.bam"
    #
    # positions_file = "/home/ubuntu/bisulfite_methylation_analysis/positions/canonical_added_cxx.positions"
    # reference = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # alphabet = "ACGT"
    # kmer_length = 6

    fasta_handle = None
    if reference is not None:
        assert os.path.exists(reference), "{} does not exist".format(reference)
        fasta_handle = ReferenceHandler(reference)

    rc = ReverseComplement()
    positions_data = pd.read_csv(
        positions_file,
        names=["chr", "start", "strand", "find", "replace"],
        sep="\t")
    km = KmerMap(alphabet, kmer_length)
    counter = 0

    def get_kmer(sequence, pos, start_pos, strand, replace):
        try:
            base = sequence[(pos - (kmer_length - 1)) -
                            start_pos:(pos + kmer_length) - start_pos]
            base = base[:(kmer_length - 1)] + replace + base[kmer_length:]
            if strand == "-":
                return rc.complement(base)
            return base
        except Exception as e:
            print(e, sequence, pos, start_pos)

    # def get_ref_base(chromosome, start_pos, strand):
    #     try:
    #         base = fasta_handle.get_sequence(chromosome_name=chromosome, start=start_pos, stop=start_pos + 1)
    #         if strand == "-":
    #             return rc.complement(base)
    #         return base
    #     except Exception as e:
    #         print(e, fasta_handle, chromosome, start_pos, strand)
    #
    # def get_base(sequence, pos, start_pos, reversed):
    #     try:
    #         base = sequence[pos - start_pos]
    #         if reversed:
    #             return rc.complement(base)
    #         return base
    #     except Exception as e:
    #         print(e, sequence, pos, start_pos)

    def get_covered_kmers(positions_data1, read_name1, ref_sequence1,
                          ref_name1, strand1, ref_start1, ref_end1):
        this_positions_data = positions_data1.loc[
            (positions_data1["chr"] == ref_name1)
            & (positions_data1["strand"] == strand1) &
            (positions_data1["start"] >= ref_start1) &
            (positions_data1["start"] <= ref_end1)]
        if this_positions_data.empty:
            return None
        kmer_lists = np.vectorize(get_kmer)(ref_sequence1,
                                            this_positions_data['start'],
                                            ref_start1, strand1,
                                            this_positions_data["replace"])
        kmer_subset_lists1 = merge_lists([[
            kmer[i:i + kmer_length] for i in range(kmer_length)
            if len(kmer[i:i + kmer_length]) == kmer_length
            and set(kmer[i:i + kmer_length]) <= set(alphabet)
        ] for kmer in kmer_lists])
        return read_name1, kmer_subset_lists1

    def meta_get_covered_kmers(positions, all_args1):
        data_to_return = []
        for args1 in all_args1:
            data = get_covered_kmers(positions, *args1)
            if data is not None:
                data_to_return.append(data)
        return data_to_return

    all_args = []
    with closing(pysam.AlignmentFile(
            bam, 'rb' if bam.endswith("bam") else 'r')) as aln:
        for aligned_segment in aln.fetch(until_eof=True):
            try:
                if not aligned_segment.has_tag('MD'):
                    if fasta_handle is None:
                        raise Exception(
                            "Need to specify --reference if MD flag is not set"
                        )
                    else:
                        ref_sequence = fasta_handle.get_sequence(
                            chromosome_name=aligned_segment.reference_name,
                            start=aligned_segment.reference_start,
                            stop=aligned_segment.reference_end)
                else:
                    ref_sequence = aligned_segment.get_reference_sequence(
                    ).upper()
                read_name = aligned_segment.qname.split("_")[0]
                ref_name = aligned_segment.reference_name
                ref_start = aligned_segment.reference_start
                ref_end = aligned_segment.reference_end
                reversed_read = aligned_segment.is_reverse
                if reversed_read:
                    strand = "-"
                else:
                    strand = "+"
                all_args.append([
                    read_name, ref_sequence, ref_name, strand, ref_start,
                    ref_end
                ])
                counter += 1
            except Exception as e:
                print(e, file=sys.stderr)

    print("starting on {} reads".format(len(all_args)))
    list_of_args = [all_args[x::n_processes] for x in range(n_processes)]
    # extra_args = {"positions": positions_data}
    # data = get_covered_kmers(positions_data, *list_of_args[0][0])
    # print(data)
    service = BasicService2(meta_get_covered_kmers,
                            positions_data,
                            service_name="multiprocess_meta_get_covered_kmers")
    total, failure, messages, output = run_service(service.run, list_of_args,
                                                   {}, ["all_args1"],
                                                   n_processes)
    # print(pd.concat(output, ignore_index=True))
    km = KmerMap(alphabet, kmer_length)

    all_data = merge_lists(output)
    print("number of reads: ", len(all_data))
    for read_name, kmer_subset_lists in all_data:
        # print(read_name, kmer_subset_lists)
        r = Read(read_name)
        for kmer in kmer_subset_lists:
            r.add_kmer(kmer)
        km.add_read(r)

    kmer_counts_file_path = os.path.join(output_dir,
                                         "all_reads_kmer_counts.txt")
    with open(kmer_counts_file_path, "w") as fh:
        print("\n".join([
            "\t".join([kmer, str(count)])
            for kmer, count in km.kmer_counts.items()
        ]),
              file=fh)

    keep_kmer_map = KmerMap(alphabet, kmer_length)

    print("number of zero covered kmers: ", len(km.get_zero_kmers()))
    curr_threshold = 1
    iteration = 0
    increase_threshold = True
    while increase_threshold:
        curr_threshold += 1
        find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(
            threshold=curr_threshold)
        while len(find_kmers) > 0:
            print(iteration, len(find_kmers))
            next_kmer = km.get_non_zero_min_kmer_in_kmers(find_kmers)
            if next_kmer is None:
                print(
                    "No more reads to cover found kmers: threshold {}".format(
                        curr_threshold))
                increase_threshold = True
                if curr_threshold >= 10:
                    increase_threshold = False
                break
            next_read_index, next_read = km.get_read(next_kmer)
            if next_read is None:
                print("Whoops, something is wrong")
                break
            keep_kmer_map.add_read(next_read)
            km.remove_read(next_read_index)
            find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(
                threshold=curr_threshold)
            iteration += 1
        print("Exited first while")
        if len(find_kmers) == 0:
            print("Found reads covering all kmers at threshold {}".format(
                curr_threshold))
        file_path = os.path.join(
            output_dir, "{}_reads_covering_kmers_with_threshold_{}.txt".format(
                "all" if increase_threshold else "some", curr_threshold))
        with open(file_path, "w") as fh:
            print("\n".join([read.read_id for read in keep_kmer_map.reads]),
                  file=fh)
        kmer_counts_file_path = os.path.join(
            output_dir, "{}_kmer_counts_with_threshold_{}.txt".format(
                "all" if increase_threshold else "some", curr_threshold))
        with open(kmer_counts_file_path, "w") as fh:
            print("\n".join([
                "\t".join([kmer, str(count)])
                for kmer, count in keep_kmer_map.kmer_counts.items()
            ]),
                  file=fh)
Example #15
    def get_data(self):
        """Calculate the normalized probability of variant for each nucleotide and across the read"""
        # final location of per position data and per read data
        data = []
        per_read_data = []
        if self.forward_mapped:
            mapping_strands = ["+", "-"]
        else:
            mapping_strands = ["-", "+"]

        if len(self.variant_data) > 0:
            kmer_len_1 = len(self.variant_data["reference_kmer"].iloc[0]) - 1
            mapping_index = 0
            for read_strand in ("t", "c"):
                read_strand_specifc_data = self.variant_data[self.variant_data["strand"] == read_strand]
                # read_strand = read_strand.decode("utf-8")
                if len(read_strand_specifc_data) == 0:
                    continue
                # get positions on strand
                positions = sorted(set(read_strand_specifc_data["reference_index"]))

                if mapping_strands[mapping_index] == "-":
                    positions = positions[::-1]

                strand_read_nuc_data = [0] * len(self.variants)

                # marginalize probabilities for each position
                n_positions = 0
                for pos in positions:
                    pos_data = read_strand_specifc_data[read_strand_specifc_data["reference_index"] == pos]
                    if pos_data["aligned_kmer"].iloc[0][kmer_len_1] != "X":
                        continue
                    n_positions += 1
                    total_prob = 0
                    position_nuc_dict = {x: 0.0 for x in self.variants}
                    # Get total probability for each nucleotide
                    for nuc in self.variants:
                        # kmer_len_1 = pos_data["reference_kmer"].iloc[0].find("X")
                        # print(pos_data["reference_kmer"].iloc[0])
                        nuc_data = pos_data[[nuc == kmer[kmer_len_1] for kmer in pos_data["path_kmer"]]]
                        nuc_prob = sum(nuc_data["posterior_probability"])
                        total_prob += nuc_prob
                        position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                    # normalize probabilities over each position
                    nuc_data = [0] * len(self.variants)
                    for index, nuc in enumerate(self.variants):
                        assert total_prob > 0, "Check 'variants' parameter. There seems to be no kmers with those " \
                                               "variant characters"
                        nuc_data[index] = position_nuc_dict[nuc] / total_prob
                        strand_read_nuc_data[index] += nuc_data[index]
                    data.append(merge_lists([[self.read_name, self.contig, pos, read_strand,
                                              mapping_strands[mapping_index]], nuc_data]))
                if n_positions > 0:
                    per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand,
                                                       mapping_strands[mapping_index], n_positions],
                                                      [prob / n_positions for prob in strand_read_nuc_data]]))
                mapping_index += 1
            self.position_probs = pd.DataFrame(data, columns=self.columns)
            self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
            self.has_data = True

        else:
            self.has_data = False

        return self.position_probs
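
A usage sketch for this get_data variant (not from the source), assuming the enclosing object was constructed as in Example #5; 'caller' is a placeholder name for that instance:

# Hypothetical usage; 'caller' stands for an instance of the enclosing class.
position_probs = caller.get_data()
if caller.has_data:
    print(position_probs.head())           # per-position normalized probabilities
    print(caller.per_read_calls.head())    # per-read averages over the called positions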