def test_rt_reaction_counting_HAMMING_0(self):
        """Compare RT reaction counts at cut site 164834865 to curated values.

        Molecules are assembled with a UMI hamming distance of 1 and their
        RT reactions are counted with ``max_N_distance=0``. Only molecules
        whose cut site position equals 164834865 (chr1 according to the
        original author's note) are taken into account.
        """
        # sample -> expected amount of RT reactions at the cut site
        hand_curated_truth = {
            # hd: 0:
            # hard example with N in random primer and N in UMI:
            'APKS2-P8-2-2_52': {
                'rt_count': 3
            },

            # Simple examples:
            'APKS2-P18-1-1_318': {
                'rt_count': 1
            },
            'APKS2-P18-1-1_369': {
                'rt_count': 2
            },
            'APKS2-P18-2-1_66': {
                'rt_count': 1
            },
            # This key used to be listed twice; a dict literal silently
            # keeps only the last duplicate, so one entry is sufficient.
            'APKS2-P18-2-1_76': {
                'rt_count': 1
            },
        }

        obtained_rt_count = {}
        # The context manager makes sure the BAM handle is closed, also when
        # the iteration raises.
        with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
            it = singlecellmultiomics.molecule.MoleculeIterator(
                alignments=f,
                molecule_class=singlecellmultiomics.molecule.Molecule,
                fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                fragment_class_args={'umi_hamming_distance': 1})

            for molecule in it:
                site = molecule.get_cut_site()
                if site is not None and site[1] == 164834865:
                    obtained_rt_count[molecule.get_sample()] = len(
                        molecule.get_rt_reaction_fragment_sizes(
                            max_N_distance=0))

        # Validate: a sample which was not observed counts as 0 reactions
        for sample, truth in hand_curated_truth.items():
            self.assertEqual(obtained_rt_count.get(sample, 0),
                             truth.get('rt_count', -1),
                             msg=f'RT reaction count mismatch for {sample}')
 def test_get_match_mismatch_frequency(self):
     """Test if the matches and mismatches of reads in a molecule are counted properly"""
     target_sample = 'APKS3-P19-1-1_91'
     with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
         it = singlecellmultiomics.molecule.MoleculeIterator(
             alignments=f,
             molecule_class=singlecellmultiomics.molecule.Molecule,
             fragment_class_args={
                 'R1_primer_length': 0,
                 'R2_primer_length': 0,
             },
             fragment_class=singlecellmultiomics.fragment.NlaIIIFragment)
         # Stop at the first molecule of the target sample; `molecule` keeps
         # pointing at it after the loop.
         for molecule in it:
             if molecule.get_sample() == target_sample:
                 break
         else:
             # Without this guard the assertion below would be evaluated on
             # whatever molecule happened to be last in the file.
             self.fail(f'No molecule found for sample {target_sample}')
     self.assertEqual((628, 13), molecule.get_match_mismatch_frequency())
    def test_get_consensus_gc_ratio(self):
        """Test if the gc ratio of a molecule is properly determined"""
        target_sample = 'APKS3-P19-1-1_91'
        with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
            it = singlecellmultiomics.molecule.MoleculeIterator(
                alignments=f,
                molecule_class=singlecellmultiomics.molecule.Molecule,
                fragment_class=singlecellmultiomics.fragment.Fragment,
                fragment_class_args={
                    'R1_primer_length': 0,
                    'R2_primer_length': 0,
                })
            # Stop at the first molecule of the target sample; `molecule`
            # keeps pointing at it after the loop.
            for molecule in it:
                if molecule.get_sample() == target_sample:
                    break
            else:
                # Without this guard the assertion below would be evaluated
                # on whatever molecule happened to be last in the file.
                self.fail(f'No molecule found for sample {target_sample}')

        self.assertAlmostEqual(0.23113207547169812,
                               molecule.get_consensus_gc_ratio())
    def _pool_test(self, pooling_method=0, hd=0):
        """Check the amount of molecules found for two samples.

        Args:
            pooling_method: passed on to the MoleculeIterator.
            hd (int): UMI hamming distance passed to the fragment class.

        With ``hd == 0`` two molecules are expected per sample, otherwise
        a single molecule is expected.
        """
        # Each sample has one fragment with a sequencing error in the UMI,
        # which ends up in a second molecule when hd is 0.
        expected_molecule_count = 2 if hd == 0 else 1

        for sample in ['AP1-P22-1-1_318', 'APKS2-P8-2-2_52']:
            # Open the file per sample, the context manager closes the
            # handle again so no handles leak between iterations.
            with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
                it = singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=f,
                    molecule_class=singlecellmultiomics.molecule.Molecule,
                    fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                    fragment_class_args={'umi_hamming_distance': hd},
                    pooling_method=pooling_method)

                molecule_count = sum(
                    1 for molecule in it if molecule.get_sample() == sample)

            self.assertEqual(molecule_count, expected_molecule_count)
    def test_consensus(self):
        """Test if a right consensus sequence can be produced from a noisy molecule"""
        target_sample = 'APKS3-P19-1-1_91'
        with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
            it = singlecellmultiomics.molecule.MoleculeIterator(
                alignments=f,
                molecule_class=singlecellmultiomics.molecule.Molecule,
                fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                fragment_class_args={
                    'R1_primer_length': 0,
                    'R2_primer_length': 6,
                })
            # Stop at the first molecule of the target sample; `molecule`
            # keeps pointing at it after the loop.
            for molecule in it:
                if molecule.get_sample() == target_sample:
                    break
            else:
                # Without this guard the assertion below would be evaluated
                # on whatever molecule happened to be last in the file.
                self.fail(f'No molecule found for sample {target_sample}')

            # The consensus is a mapping, its values are joined in
            # iteration order to obtain the sequence.
            self.assertEqual(
                ''.join(molecule.get_consensus().values()),
                'CATGAGTTAGATATGGACTCTTCTTCAGACACTTTGTTTAAATTTTAAATTTTTTTCTGATTGCATATTACTAAAAATGTGTTATGAATATTTTCCATATCATTAAACATTCTTCTCAAGCATAACTTTAAATAACTGCATTATAGAAAATTTACGCTACTTTTGTTTTTGTTTTTTTTTTTTTTTTTTTACTATTATTAATAACACGGTGG'
            )
def _add_fractional_counts(counts_per_cell, sample, identifiers, allele,
                           count_to_add, gene_set):
    """Add ``count_to_add`` for every identifier to the counts of ``sample``.

    When ``allele`` is not None the identifier is prefixed with
    ``{allele}_``. Every (prefixed) identifier is also recorded in
    ``gene_set``. Nothing is touched when ``identifiers`` is empty.
    """
    for identifier in identifiers:
        key = identifier if allele is None else f'{allele}_{identifier}'
        counts_per_cell[sample][key] += count_to_add
        gene_set.add(key)


def count_transcripts(cargs):
    """Count genes, exons, introns and junctions per cell for one contig.

    Args:
        cargs (tuple): ``(args, contig)``. ``args`` is the parsed command
            line namespace, the attributes read here are ``alleles``,
            ``loadAllelesToMem``, ``contigmapping``, ``gtfexon``,
            ``gtfintron``, ``hf``, ``method``, ``producebam``, ``o``,
            ``alignmentfiles``, ``ref``, ``minmq``, ``umi_hamming_distance``,
            ``annotmethod`` and ``head``. ``contig`` is the contig to
            process.

    Returns:
        tuple: ``(gene_set, sample_set, gene_counts_per_cell,
        junction_counts_per_cell, exon_counts_per_cell,
        intron_counts_per_cell, annotated_molecules, read_molecules,
        contig)``. The ``*_counts_per_cell`` objects map
        cell -> feature -> count; every molecule contributes
        ``1 / len(molecule.genes)`` per feature it hits.

    Raises:
        ValueError: when ``args.method`` is not 'nla', 'vasa' or 'cs'.
    """
    args, contig = cargs

    allele_resolver = None
    if args.alleles is not None:
        allele_resolver = alleleTools.AlleleResolver(
            args.alleles, lazyLoad=(not args.loadAllelesToMem))

    contig_mapping = None
    if args.contigmapping == 'danio':
        # Chromosomes '1' .. '25' map to the consecutive accessions
        # CM002885.2 .. CM002909.2
        contig_mapping = {
            str(chromosome): f'CM{2884 + chromosome:06d}.2'
            for chromosome in range(1, 26)
        }

    # Load features
    # NOTE: the mapping used to be reset to None at this point, which made
    # the 'danio' mapping selected above dead code.
    features = singlecellmultiomics.features.FeatureContainer()
    if contig_mapping is not None:
        features.remapKeys = contig_mapping
    features.loadGTF(
        args.gtfexon,
        select_feature_type=['exon'],
        identifierFields=(
            'exon_id',
            'transcript_id'),
        store_all=True,
        head=args.hf,
        contig=contig)
    features.loadGTF(
        args.gtfintron,
        select_feature_type=['intron'],
        identifierFields=['transcript_id'],
        store_all=True,
        head=args.hf,
        contig=contig)

    # What is used for assignment of molecules?
    if args.method == 'nla':
        moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment
        pooling_method = 1  # all data from the same cell can be dealt with separately
        stranded = None  # data is not stranded
    elif args.method == 'vasa' or args.method == 'cs':
        moleculeClass = singlecellmultiomics.molecule.VASA
        fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript
        pooling_method = 1
        stranded = 1  # data is stranded, mapping to other strand
    else:
        raise ValueError("Supply a valid method")

    # COUNT: all containers are cell -> feature -> umiCount
    exon_counts_per_cell = collections.defaultdict(collections.Counter)
    intron_counts_per_cell = collections.defaultdict(collections.Counter)
    junction_counts_per_cell = collections.defaultdict(collections.Counter)
    gene_counts_per_cell = collections.defaultdict(collections.Counter)

    gene_set = set()
    sample_set = set()
    annotated_molecules = 0
    read_molecules = 0

    ref = None
    if args.ref is not None:
        ref = pysamiterators.iterators.CachedFasta(pysam.FastaFile(args.ref))

    output_bam = None
    bam_path_produced = None
    if args.producebam:
        bam_path_produced = f'{args.o}/output_bam_{contig}.unsorted.bam'
        # The header of the first alignment file is used for the output
        with pysam.AlignmentFile(args.alignmentfiles[0]) as alignments:
            output_bam = pysam.AlignmentFile(
                bam_path_produced, "wb", header=alignments.header)

    try:
        for alignmentfile_path in args.alignmentfiles:

            # Amount of molecules yielded for this file, valid or not
            molecules_in_file = 0
            with pysam.AlignmentFile(alignmentfile_path) as alignments:
                molecule_iterator = MoleculeIterator(
                    alignments=alignments,
                    check_eject_every=5000,
                    moleculeClass=moleculeClass,
                    molecule_class_args={
                        'features': features,
                        'stranded': stranded,
                        'min_max_mapping_quality': args.minmq,
                        'reference': ref,
                        'allele_resolver': allele_resolver
                    },

                    fragmentClass=fragmentClass,
                    fragment_class_args={
                        'umi_hamming_distance': args.umi_hamming_distance,
                        'R1_primer_length': 4,
                        'R2_primer_length': 6},
                    perform_qflag=True,
                    # when the reads have not been tagged yet, this flag is
                    # very much required
                    pooling_method=pooling_method,
                    contig=contig
                )

                for i, molecule in enumerate(molecule_iterator):
                    molecules_in_file = i + 1

                    if not molecule.is_valid():
                        # Invalid molecules are written but never counted
                        if args.producebam:
                            molecule.write_tags()
                            molecule.write_pysam(output_bam)
                        continue

                    molecule.annotate(args.annotmethod)
                    molecule.set_intron_exon_features()

                    if args.producebam:
                        molecule.write_tags()
                        molecule.write_pysam(output_bam)

                    allele = None
                    if allele_resolver is not None:
                        allele = molecule.allele
                        if allele is None:
                            allele = 'noAllele'

                    # The count of a molecule is distributed over the genes
                    # it hits so the gene counts of a molecule sum to 1
                    total_count_for_molecule = len(molecule.genes)
                    if total_count_for_molecule == 0:
                        continue  # we didn't find any gene counts

                    count_to_add = 1 / total_count_for_molecule
                    sample = molecule.sample
                    sample_set.add(molecule.get_sample())

                    # Genes, followed by introns/exons/splice junctions:
                    for counts_per_cell, identifiers in (
                            (gene_counts_per_cell, molecule.genes),
                            (intron_counts_per_cell, molecule.introns),
                            (exon_counts_per_cell, molecule.exons),
                            (junction_counts_per_cell, molecule.junctions)):
                        _add_fractional_counts(
                            counts_per_cell, sample, identifiers, allele,
                            count_to_add, gene_set)

                    annotated_molecules += 1
                    if args.head and (i + 1) > args.head:
                        print(
                            f"-head was supplied, {i} molecules discovered, stopping")
                        break

            # The last enumerate index is one lower than the amount of
            # molecules seen, so the explicit count is added instead.
            read_molecules += molecules_in_file
    finally:
        # Always release the output handle, also when iteration failed
        if output_bam is not None:
            output_bam.close()

    if args.producebam:
        final_bam_path = bam_path_produced.replace('.unsorted', '')
        sort_and_index(bam_path_produced, final_bam_path, remove_unsorted=True)

    return (
        gene_set,
        sample_set,
        gene_counts_per_cell,
        junction_counts_per_cell,
        exon_counts_per_cell,
        intron_counts_per_cell,
        annotated_molecules,
        read_molecules,
        contig
    )
                    genomeString.append(call['reference_base'])
                else:
                    unmethylated_hits += 1
# NEED TO REMOVE THIS CODE ENTIRELY!! <- VB
                if args.table is not None:
                    for binIdx in singlecellmultiomics.utils.coordinate_to_bins(
                            location, args.bin_size, args.sliding_increment):
                        bin_start, bin_end = binIdx
                        if bin_start < 0 or bin_end > ref_lengths[
                                molecule.chromosome]:
                            continue

                        if args.stranded:
                            binned_data[(chromosome,
                                         molecule.get_strand_repr(),
                                         binIdx)][molecule.get_sample()][
                                             call['context'].isupper()] += 1
                            cell_count[molecule.get_sample()] += 1
                        else:
                            binned_data[(chromosome,
                                         binIdx)][molecule.get_sample()][
                                             call['context'].isupper()] += 1
                            cell_count[molecule.get_sample()] += 1
###
                if args.bed is not None:
                    # Skip non-selected contexts only for table
                    if contexts is not None and call['context'] not in contexts:
                        continue
                    else:
                        # name = cell barcode + context
                        name = ":".join(
Exemple #8
0
def obtain_variant_statistics(alignment_file_paths, cell_obs, statistics,
                              cell_call_data, reference, chromosome,
                              ssnv_position, gsnv_position, haplotype_scores,
                              WINDOW_RADIUS, out, min_read_obs, read_groups,
                              umi_hamming_distance, args):
    """
    Obtain statistics from known gsnv-phased variant location

    All molecules in a window around the sSNV are collected, the observed
    (sSNV, gSNV) base tuples are tallied, the alternative sSNV base and its
    phased gSNV allele are determined by voting, cells are scored and
    finally all molecules of the window are tagged and written to ``out``.

    Args:
        alignment_file_paths (list) : Paths of the alignment files from which to extract molecules, every path is opened using pysam.AlignmentFile

        cell_obs ( collections.defaultdict(lambda: collections.defaultdict( collections.Counter) ) ) : updated in place, (chromosome, position) -> sample -> (sSNV base, gSNV base) -> amount of molecules

        statistics ( collections.defaultdict(lambda: collections.defaultdict( collections.Counter) ) ) : updated in place, (chromosome, position) -> statistic name -> value -> observations

        cell_call_data(collections.defaultdict(dict) ) : updated in place, (chromosome, position) -> cell -> call score

        haplotype_scores(dict) : updated in place, (chromosome, position) -> dictionary of phasing scores

        reference (pysamiterators.CachedFasta)

        chromosome (str)

        ssnv_position (int) : zero based ssnv position

        gsnv_position (int) : zero based gsnv position

        WINDOW_RADIUS(int) : molecules are fetched from ssnv_position - WINDOW_RADIUS to ssnv_position + WINDOW_RADIUS

        out(pysam.AlignmentFile ) : handle the tagged molecules are written to

        min_read_obs(int) : not used by this function

        read_groups(set) : updated in place with the read groups of all written fragments

        umi_hamming_distance(int) : passed on to the fragment class

        args : parsed command line arguments, the attributes realign, indelvcf, gatk3_path, realign_mem and ignore_bam_issues are read

    Returns:
        None. All results are stored in the supplied containers. When no
        variant can be called or phased the function returns early, in
        that case no molecules are written to ``out``.
    """

    sSNV_ref_base = reference.fetch(chromosome, ssnv_position,
                                    ssnv_position + 1)
    gSNV_ref_base = reference.fetch(chromosome, gsnv_position,
                                    gsnv_position + 1)

    # (molecule, sSNV state, gSNV state) for every molecule with a consensus
    window_molecules = []

    cell_read_obs = collections.defaultdict(
        collections.Counter)  # sample -> tuple -> amount of reads

    region_start = ssnv_position - WINDOW_RADIUS
    region_end = ssnv_position + WINDOW_RADIUS

    # Shorthands for the containers belonging to this variant
    chrom, pos = chromosome, ssnv_position
    site_statistics = statistics[(chrom, pos)]
    obs_for_cells = cell_obs[(chrom, pos)]

    for alignment_path in alignment_file_paths:

        # Perform re-alignment:
        if args.realign:
            # NOTE(review): the name of the realigned file does not depend
            # on alignment_path, when multiple alignment files are supplied
            # they all resolve to the same realigned file. Confirm if this
            # is intended.
            target_bam = f'align_{chromosome}_{region_start}_{region_end}.bam'

            if not os.path.exists(target_bam):
                # Write to a temporary path and rename when done, so a
                # partially written file is never picked up as finished
                temp_target_bam = f'{target_bam}.temp.bam'
                temp_target_bai = f'{target_bam}.temp.bai'
                GATK_indel_realign(
                    alignment_path,
                    temp_target_bam,
                    chromosome,
                    region_start,
                    region_end,
                    args.indelvcf,
                    gatk_path=args.gatk3_path,
                    interval_path=None,
                    java_cmd=
                    f'java -jar -Xmx{args.realign_mem}G -Djava.io.tmpdir=./gatk_tmp',
                    reference=reference.handle.filename.decode('utf8'),
                    interval_write_path=
                    f'./align_{chromosome}_{region_start}_{region_end}.intervals'
                )

                print(f'Renaming {temp_target_bam} > {target_bam}')
                os.rename(temp_target_bam, target_bam)
                os.rename(temp_target_bai, target_bam.replace('.bam', '.bai'))
            alignment_path = target_bam

        with pysam.AlignmentFile(
                alignment_path,
                ignore_truncation=args.ignore_bam_issues) as alignments:

            for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                    alignments,
                    fragment_class_args={
                        'umi_hamming_distance': umi_hamming_distance,
                    },
                    molecule_class_args={'reference': reference},
                    moleculeClass=singlecellmultiomics.molecule.
                    NlaIIIMolecule,
                    fragmentClass=singlecellmultiomics.fragment.
                    NLAIIIFragment,
                    start=region_start,
                    end=region_end,
                    contig=chromosome):

                # For every molecule obtain the consensus from which to extract
                # the gSNV and sSNV:
                try:
                    consensus = molecule.get_consensus()
                except Exception as e:
                    if str(
                            e
                    ) == 'Could not extract any safe data from molecule':
                        site_statistics['R2_unmapped'][True] += 1
                    else:
                        print(e)
                    continue

                # Extract the gSNV and sSNV:
                ssnv_state = consensus.get((chromosome, ssnv_position))
                gsnv_state = consensus.get((chromosome, gsnv_position))

                # Store all used molecules in the window for inspection:
                window_molecules.append((molecule, ssnv_state, gsnv_state))

                # If both the ssnv and gsnv are none there is no information we
                # can use.
                if ssnv_state is None and gsnv_state is None:
                    continue

                # Store the observation
                # the amount of reads of evidence is len(molecule)
                sample = molecule.get_sample()
                obs_for_cells[sample][(ssnv_state, gsnv_state)] += 1
                cell_read_obs[sample][(ssnv_state,
                                       gsnv_state)] += len(molecule)

                # Store statistics
                site_statistics['max_mapping_quality'][
                    molecule.get_max_mapping_qual()] += 1
                site_statistics['fragment_size'][
                    molecule.get_safely_aligned_length()] += 1
                site_statistics['ivt_dups'][len(
                    molecule.get_rt_reactions())] += 1
                site_statistics['undigested'][
                    molecule.get_undigested_site_count()] += 1
                site_statistics['reads'][len(molecule)] += 1
                site_statistics['molecules'][1] += 1

                # Store alignment statistics:
                for operation, per_bp in molecule.get_alignment_stats().items(
                ):
                    site_statistics[operation][per_bp] += 1

                # The base qualities are best effort: a molecule for which
                # a quality can not be obtained is skipped for that
                # statistic. Only Exception is caught so interrupts and
                # interpreter exits are not swallowed.
                try:
                    site_statistics['ssnv_ref_phred'][
                        molecule.get_mean_base_quality(chromosome,
                                                       ssnv_position,
                                                       sSNV_ref_base)] += 1
                except Exception:
                    pass

                try:
                    site_statistics['ssnv_alt_phred'][
                        molecule.get_mean_base_quality(
                            chromosome, ssnv_position,
                            not_base=sSNV_ref_base)] += 1
                except Exception:
                    pass

                try:
                    site_statistics['gsnv_ref_phred'][
                        molecule.get_mean_base_quality(chromosome,
                                                       gsnv_position,
                                                       gSNV_ref_base)] += 1
                except Exception:
                    pass

                try:
                    site_statistics['gsnv_any_alt_phred'][
                        molecule.get_mean_base_quality(
                            chromosome,
                            gsnv_position,
                            not_base=gSNV_ref_base)] += 1
                except Exception:
                    pass

    # After finishing iteration over all molecules assign genotypes
    genotype_obs = collections.Counter()
    complete_genotype_obs = collections.Counter()
    sSNV_obs_phased = collections.Counter()
    gSNV_obs_phased = collections.Counter()

    sSNV_obs = collections.Counter()
    gSNV_obs = collections.Counter()

    # Every distinct tuple counts once per cell in which it was observed
    for cell_data in obs_for_cells.values():
        for ssnv, gsnv in cell_data:
            genotype_obs[(ssnv, gsnv)] += 1

            gSNV_obs[gsnv] += 1
            sSNV_obs[ssnv] += 1

            if ssnv is not None and gsnv is not None:
                complete_genotype_obs[(ssnv, gsnv)] += 1
                # Only count these when the germline variant is detected
                gSNV_obs_phased[gsnv] += 1
                sSNV_obs_phased[ssnv] += 1

    print(Style.BRIGHT + f'Genotype observations for variant {chrom}:{pos}' +
          Style.RESET_ALL)
    print('som\tgerm\tobs')
    for (ssnv, gsnv), obs in complete_genotype_obs.most_common():
        print(f' {ssnv}\t{gsnv}\t{obs}')

    # This is only a notification, processing continues regardless
    if len(complete_genotype_obs) <= 2:
        print('not enough genotype observations for a variant call (<=2)')

    ### Conbase algorithm : ###
    #
    # determine if there is an alternative base in the first place
    # a fraction of the reads in a cell need to vote for a tuple,
    # this fraction is stored in the alpha parameter , or a minimum amount of reads, stored in the beta parameter
    # determine tp*, the alleles we expect observe
    α = 0.2  # minimum relative abundance of sSNV voting reads in single sample
    β = 3  # minimum amount of sSSNV reads in cell, or in total if α is exceeded
    γ = 0.9  # minimum amount of votes for sSNV
    ε = 2  # minimum amount of cells voting for sSNV
    ω = 0.9  # gsnv majority

    sSNV_votes = collections.Counter()  # { sSNV_alt_base : votes }
    total_samples_which_voted = 0
    for sample, observed_tuples in cell_read_obs.items():
        # First we create a Counter just counting the amount of evidence per
        # base for this sample :
        evidence_total_reads = collections.Counter()
        for (sSNV_state, gSNV_state), reads in observed_tuples.most_common():
            if sSNV_state is None:
                continue
            evidence_total_reads[sSNV_state] += reads

        # this is the amount of reads which contain evidence for the reference
        # base
        ref_sSNV_reads = evidence_total_reads[sSNV_ref_base]
        for sSNV_state, sSNV_supporting_reads in evidence_total_reads.most_common(
        ):
            # The reference base does not vote.
            if sSNV_state == sSNV_ref_base or sSNV_state is None:
                continue

            # ratio of alternative over reference reads, 0 when there are
            # no reference reads
            alpha_value = 0 if ref_sSNV_reads == 0 else sSNV_supporting_reads / ref_sSNV_reads

            vote = ((alpha_value >= α and
                     (sSNV_supporting_reads + ref_sSNV_reads) >= β) or
                    (alpha_value < α and sSNV_supporting_reads >= β))

            # The reference column reports the reference base followed by
            # its read count, mirroring the alternative column. It used
            # to print the read count twice.
            print(
                f'{sample}\tsSNV alt:{sSNV_state}\t{sSNV_supporting_reads}\tsSNV ref:{sSNV_ref_base}\t{ref_sSNV_reads}\tα:{alpha_value}\t{"votes" if vote else "no vote"}'
            )

            if vote:
                sSNV_votes[sSNV_state] += 1
                total_samples_which_voted += 1

    # done voting.
    # the most probable variant base is at least 90% voted for (γ parameter)
    # and at least ε cells need to vote for it
    site_statistics['total_samples_voted'] = total_samples_which_voted

    if total_samples_which_voted < ε:
        # We don't have enough votes
        print(f'Not enough votes {total_samples_which_voted} < ε:{ε}')
        return
    else:
        print(f'Enough votes {total_samples_which_voted} >= ε:{ε}')

    sSNV_alt_base, sSNV_alt_obs = sSNV_votes.most_common()[0]
    site_statistics['sSNV_alt_vote_ratio'] = (
        sSNV_alt_obs / total_samples_which_voted)
    if (sSNV_alt_obs / total_samples_which_voted) < γ:
        # The ratio of votes is below threshold
        print(
            f'sSNV alt is {sSNV_alt_base}, ratio threshold γ:{γ} , not met with {sSNV_alt_obs / total_samples_which_voted}'
        )
        return

    print(
        f'sSNV alt is {sSNV_alt_base}, γ: {sSNV_alt_obs / total_samples_which_voted} >= {γ}'
    )

    ### Here the "Stats" part of Conbase ends ###
    #############################################

    # Now we determined the sSNV alt base,
    # now determine the linked gSNV: the most observed phased base which is
    # not the reference base
    gSNV_alt_base = None
    for basecall, obs in gSNV_obs_phased.most_common():
        if basecall != gSNV_ref_base:
            gSNV_alt_base = basecall
            break

    if sSNV_alt_base is None or gSNV_alt_base is None:
        # No phased alt base found ...
        print('No phased allele found')
        return

    # Determine the phase (most common genotypes)
    sSNV_phased_votes = sum(
        (obs for (sSNV_state,
                  gSNV_state), obs in complete_genotype_obs.most_common()
         if sSNV_state == sSNV_alt_base and gSNV_state is not None))

    if sSNV_phased_votes == 0:
        print('No votes cast for phasing the selected alternative allele')
        return
    # Find the phased germline variant. As there is at least one phased
    # vote the loop always ends at the most observed tuple carrying the
    # alternative sSNV base, sSNV_state and gSNV_state keep the values of
    # that tuple after the loop.
    for (sSNV_state,
         gSNV_state), this_phase_obs in complete_genotype_obs.most_common():
        if sSNV_state != sSNV_alt_base or gSNV_state is None:
            continue

        print(
            f'There are {sSNV_phased_votes} votes for the haplotype {sSNV_state} {gSNV_state}, ratio:{this_phase_obs / sSNV_phased_votes} '
        )

        if (this_phase_obs / sSNV_phased_votes) < ω:
            print(f'This does not pass the threshold ω {ω} ')
            return
        else:
            print(f'This does pass the threshold ω {ω} ')
        break

    sSNV_phase = (sSNV_state, gSNV_state)
    phased_gSNV = gSNV_state
    if gSNV_state == gSNV_ref_base:
        # the sSNV is phased with the gSNV reference base
        wt_allele_gSNV = gSNV_alt_base
        snv_allele_gSNV = gSNV_ref_base
    else:
        # the sSNV is phased with the gSNV alternative base
        wt_allele_gSNV = gSNV_ref_base
        snv_allele_gSNV = gSNV_alt_base

    site_statistics['sSNV_gSNV_phase'] = snv_allele_gSNV

    if snv_allele_gSNV is None:
        print("No germline variant was phased")
        return

    # The valid tuples are thus:
    uninformative_allele = (sSNV_ref_base, wt_allele_gSNV)
    informative_allele_wt = (sSNV_ref_base, snv_allele_gSNV)

    valid_tuples = [
        sSNV_phase,  # mutated
        informative_allele_wt,  # wt
        uninformative_allele
    ]

    # As we have umi's we just have a threshold for the least amount of reads
    # we want to observe for a molecule to be taken into account
    # Count how often we found valid and invalid genotypes
    valid = 0
    invalid = 0
    valid_var = 0
    invalid_var = 0
    for (ssnv, gsnv), tuple_obs in complete_genotype_obs.most_common():
        tuple_is_valid = (ssnv, gsnv) in valid_tuples
        if ssnv == sSNV_alt_base:  # variant:
            if tuple_is_valid:
                valid_var += tuple_obs
            else:
                invalid_var += tuple_obs

        if tuple_is_valid:
            valid += tuple_obs
        else:
            invalid += tuple_obs

    phase_ratio = 0
    if valid_var + invalid_var > 0:
        phase_ratio = valid_var / (valid_var + invalid_var)

    # Score Tuples with evidence for variant
    haplotype_scores[(chrom, pos)] = {
        'valid_tuples':
        valid,
        'invalid_tuples':
        invalid,
        'valid_var_tuples':
        valid_var,
        'invalid_var_tuples':
        invalid_var,
        'phasing_ratio':
        phase_ratio,
        'gSNV_allelic_bias':
        gSNV_obs[gSNV_ref_base] /
        (gSNV_obs[gSNV_ref_base] + gSNV_obs[gSNV_alt_base])
    }

    print(f'Germline variant obs: {gSNV_ref_base} {gSNV_alt_base}')
    print(f'sSNV obs: {sSNV_ref_base} {sSNV_alt_base}')
    if sSNV_phase is not None:
        print(f'sSNV variant is phased with {phased_gSNV}')
    print(Style.BRIGHT + 'Valid tuples:' + Style.RESET_ALL)
    for g, s in valid_tuples:
        print(f' {g}\t{s}')

    print(Style.BRIGHT + 'Scores:' + Style.RESET_ALL)
    for name, obs in haplotype_scores[(chrom, pos)].items():
        print(f' {name}\t{obs}')

    # Create the cell call dictionary

    uninformative_obs = 0  # This is important, otherwise we might use a SNP ..
    for cell, observed_tuples in cell_read_obs.items():
        print(cell, observed_tuples)
        total_reads = 0
        phased_variant_support_reads = 0
        unphased_variant_support_reads = 0
        variant_neg_support_reads = 0
        uninformative_reads = 0
        conflict_reads = 0
        for (sSNV_state, gSNV_state), reads in observed_tuples.items():
            if sSNV_state is None:
                continue
            total_reads += reads
            if sSNV_state == sSNV_alt_base:
                if gSNV_state == wt_allele_gSNV:
                    # reads containing the sSNV on the unexpected allele
                    conflict_reads += reads
                elif gSNV_state == snv_allele_gSNV:
                    # reads containing the sSNV and gSNV as expected
                    phased_variant_support_reads += reads
                elif gSNV_state is None:
                    # reads containing sSNV but not overlapping with gSNV
                    unphased_variant_support_reads += reads

            elif sSNV_state == sSNV_ref_base:
                if gSNV_state == snv_allele_gSNV:
                    # reads on informative allele where we found evidence
                    # of the sSNV not being present
                    variant_neg_support_reads += reads
                elif gSNV_state == wt_allele_gSNV:
                    uninformative_reads += reads
                    uninformative_obs += 1

        if total_reads == 0:
            continue

        # The checks below are ordered by priority, a later check
        # overwrites the call of an earlier one.
        if variant_neg_support_reads > 0:
            cell_call_data[(chrom, pos)][cell] = 0

        variant_support_reads = (unphased_variant_support_reads +
                                 phased_variant_support_reads)
        if variant_support_reads / total_reads > 0.1:
            cell_call_data[(chrom, pos)][cell] = 1
            if variant_support_reads >= 3:
                cell_call_data[(chrom, pos)][cell] = 10

        if phased_variant_support_reads / total_reads > 0.1:
            cell_call_data[(chrom, pos)][cell] = 2
            if phased_variant_support_reads >= 3:
                cell_call_data[(chrom, pos)][cell] = 20

        if conflict_reads / total_reads > 0.2:
            cell_call_data[(chrom, pos)][cell] = -1  # invalid

    haplotype_scores[(chrom, pos)]['uninformative_obs'] = uninformative_obs

    # Annotate every molecule...
    for molecule_id, (m, ssnv_state,
                      gsnv_state) in enumerate(window_molecules):
        m.set_meta('mi', molecule_id)
        m.set_meta('gv', '?' if gsnv_state is None else gsnv_state)
        m.set_meta('sv', '?' if ssnv_state is None else ssnv_state)

        # The first matching condition determines the VD tag
        if ssnv_state is None:
            m.set_meta('VD', 'NO_SNV_OVERLAP')
            continue

        if gsnv_state is not None and (ssnv_state,
                                       gsnv_state) not in valid_tuples:
            m.set_meta('VD', 'INVALID_PHASE')
            continue
        if ssnv_state == sSNV_alt_base:
            m.set_meta('VD', 'SNV_ALT')
            continue

        if ssnv_state == sSNV_ref_base and gsnv_state == phased_gSNV:
            m.set_meta('VD', 'SNV_REF')
            continue
        if gsnv_state != phased_gSNV:
            m.set_meta('VD', 'UNINFORMATIVE_ALLELE')
            continue

        m.set_meta('VD', 'REJECTED')

    # write
    for m, _, _ in window_molecules:
        m.write_tags()
        m.write_pysam(out)

        # Update read groups
        for fragment in m:
            read_groups.add(fragment.get_read_group())