def test_rt_reaction_counting_HAMMING_0(self): # This is a dictionary containing molecules and the amount of rt reactions for location chr1:164834865 f = pysam.AlignmentFile('./data/mini_nla_test.bam') it = singlecellmultiomics.molecule.MoleculeIterator( alignments=f, molecule_class=singlecellmultiomics.molecule.Molecule, fragment_class=singlecellmultiomics.fragment.NlaIIIFragment, fragment_class_args={'umi_hamming_distance': 1}) hand_curated_truth = { # hd: 0: # hard example with N in random primer and N in UMI: 'APKS2-P8-2-2_52': { 'rt_count': 3 }, # Simple examples: 'APKS2-P18-1-1_318': { 'rt_count': 1 }, 'APKS2-P18-1-1_369': { 'rt_count': 2 }, 'APKS2-P18-2-1_66': { 'rt_count': 1 }, 'APKS2-P18-2-1_76': { 'rt_count': 1 }, 'APKS2-P18-2-1_76': { 'rt_count': 1 }, } obtained_rt_count = {} for molecule in it: site = molecule.get_cut_site() if site is not None and site[1] == 164834865: obtained_rt_count[molecule.get_sample()] = len( molecule.get_rt_reaction_fragment_sizes(max_N_distance=0)) # Validate: for sample, truth in hand_curated_truth.items(): self.assertEqual(obtained_rt_count.get(sample, 0), truth.get('rt_count', -1))
def test_get_match_mismatch_frequency(self): """Test if the matches and mismatches of reads in a molecule are counted properly""" with pysam.AlignmentFile('./data/mini_nla_test.bam') as f: it = singlecellmultiomics.molecule.MoleculeIterator( alignments=f, molecule_class=singlecellmultiomics.molecule.Molecule, fragment_class_args={ 'R1_primer_length': 0, 'R2_primer_length': 0, }, fragment_class=singlecellmultiomics.fragment.NlaIIIFragment) for molecule in iter(it): #print(molecule.get_sample()) if molecule.get_sample() == 'APKS3-P19-1-1_91': break #print(molecule.get_match_mismatch_frequency()) self.assertEqual((628, 13), molecule.get_match_mismatch_frequency())
def test_get_consensus_gc_ratio(self): """Test if the gc ratio of a molecule is properly determined""" with pysam.AlignmentFile('./data/mini_nla_test.bam') as f: it = singlecellmultiomics.molecule.MoleculeIterator( alignments=f, molecule_class=singlecellmultiomics.molecule.Molecule, fragment_class=singlecellmultiomics.fragment.Fragment, fragment_class_args={ 'R1_primer_length': 0, 'R2_primer_length': 0, }) for molecule in iter(it): #print(molecule.get_sample()) if molecule.get_sample() == 'APKS3-P19-1-1_91': break self.assertAlmostEqual(0.23113207547169812, molecule.get_consensus_gc_ratio())
def _pool_test(self, pooling_method=0, hd=0): for sample in ['AP1-P22-1-1_318', 'APKS2-P8-2-2_52']: f = pysam.AlignmentFile('./data/mini_nla_test.bam') it = singlecellmultiomics.molecule.MoleculeIterator( alignments=f, molecule_class=singlecellmultiomics.molecule.Molecule, fragment_class=singlecellmultiomics.fragment.NlaIIIFragment, fragment_class_args={'umi_hamming_distance': hd}, pooling_method=pooling_method) molecule_count = 0 for molecule in iter(it): if molecule.get_sample() == sample: molecule_count += 1 if hd == 0: # it has one fragment with a sequencing error in the UMI self.assertEqual(molecule_count, 2) else: self.assertEqual(molecule_count, 1)
def test_consensus(self): """Test if a right consensus sequence can be produced from a noisy molecule""" with pysam.AlignmentFile('./data/mini_nla_test.bam') as f: it = singlecellmultiomics.molecule.MoleculeIterator( alignments=f, molecule_class=singlecellmultiomics.molecule.Molecule, fragment_class=singlecellmultiomics.fragment.NlaIIIFragment, fragment_class_args={ 'R1_primer_length': 0, 'R2_primer_length': 6, }) for molecule in iter(it): #print(molecule.get_sample()) if molecule.get_sample() == 'APKS3-P19-1-1_91': break self.assertEqual( ''.join(list(molecule.get_consensus().values())), 'CATGAGTTAGATATGGACTCTTCTTCAGACACTTTGTTTAAATTTTAAATTTTTTTCTGATTGCATATTACTAAAAATGTGTTATGAATATTTTCCATATCATTAAACATTCTTCTCAAGCATAACTTTAAATAACTGCATTATAGAAAATTTACGCTACTTTTGTTTTTGTTTTTTTTTTTTTTTTTTTACTATTATTAATAACACGGTGG' )
def count_transcripts(cargs): args, contig = cargs if args.alleles is not None: allele_resolver = alleleTools.AlleleResolver( args.alleles, lazyLoad=(not args.loadAllelesToMem)) else: allele_resolver = None contig_mapping = None if args.contigmapping == 'danio': contig_mapping = { '1': 'CM002885.2', '2': 'CM002886.2', '3': 'CM002887.2', '4': 'CM002888.2', '5': 'CM002889.2', '6': 'CM002890.2', '7': 'CM002891.2', '8': 'CM002892.2', '9': 'CM002893.2', '10': 'CM002894.2', '11': 'CM002895.2', '12': 'CM002896.2', '13': 'CM002897.2', '14': 'CM002898.2', '15': 'CM002899.2', '16': 'CM002900.2', '17': 'CM002901.2', '18': 'CM002902.2', '19': 'CM002903.2', '20': 'CM002904.2', '21': 'CM002905.2', '22': 'CM002906.2', '23': 'CM002907.2', '24': 'CM002908.2', '25': 'CM002909.2', } # Load features contig_mapping = None #conversion_table = get_gene_id_to_gene_name_conversion_table(args.gtfexon) features = singlecellmultiomics.features.FeatureContainer() if contig_mapping is not None: features.remapKeys = contig_mapping features.loadGTF( args.gtfexon, select_feature_type=['exon'], identifierFields=( 'exon_id', 'transcript_id'), store_all=True, head=args.hf, contig=contig) features.loadGTF( args.gtfintron, select_feature_type=['intron'], identifierFields=['transcript_id'], store_all=True, head=args.hf, contig=contig) # What is used for assignment of molecules? if args.method == 'nla': moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment pooling_method = 1 # all data from the same cell can be dealt with separately stranded = None # data is not stranded elif args.method == 'vasa' or args.method == 'cs': moleculeClass = singlecellmultiomics.molecule.VASA fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript pooling_method = 1 stranded = 1 # data is stranded, mapping to other strand else: raise ValueError("Supply a valid method") # COUNT: exon_counts_per_cell = collections.defaultdict( collections.Counter) # cell->gene->umiCount intron_counts_per_cell = collections.defaultdict( collections.Counter) # cell->gene->umiCount junction_counts_per_cell = collections.defaultdict( collections.Counter) # cell->gene->umiCount gene_counts_per_cell = collections.defaultdict( collections.Counter) # cell->gene->umiCount gene_set = set() sample_set = set() annotated_molecules = 0 read_molecules = 0 if args.producebam: bam_path_produced = f'{args.o}/output_bam_{contig}.unsorted.bam' with pysam.AlignmentFile(args.alignmentfiles[0]) as alignments: output_bam = pysam.AlignmentFile( bam_path_produced, "wb", header=alignments.header) ref = None if args.ref is not None: ref = pysamiterators.iterators.CachedFasta(pysam.FastaFile(args.ref)) for alignmentfile_path in args.alignmentfiles: i = 0 with pysam.AlignmentFile(alignmentfile_path) as alignments: molecule_iterator = MoleculeIterator( alignments=alignments, check_eject_every=5000, moleculeClass=moleculeClass, molecule_class_args={ 'features': features, 'stranded': stranded, 'min_max_mapping_quality': args.minmq, 'reference': ref, 'allele_resolver': allele_resolver }, fragmentClass=fragmentClass, fragment_class_args={ 'umi_hamming_distance': args.umi_hamming_distance, 'R1_primer_length': 4, 'R2_primer_length': 6}, perform_qflag=True, # when the reads have not been tagged yet, this flag is very # much required pooling_method=pooling_method, contig=contig ) for i, molecule in enumerate(molecule_iterator): if not molecule.is_valid(): if args.producebam: molecule.write_tags() molecule.write_pysam(output_bam) continue molecule.annotate(args.annotmethod) molecule.set_intron_exon_features() if args.producebam: molecule.write_tags() molecule.write_pysam(output_bam) allele = None if allele_resolver is not None: allele = molecule.allele if allele is None: allele = 'noAllele' # Obtain total count introns/exons reduce it so the sum of the # count will be 1: # len(molecule.introns.union( molecule.exons).difference(molecule.junctions))+len(molecule.junctions) total_count_for_molecule = len(molecule.genes) if total_count_for_molecule == 0: continue # we didn't find any gene counts # Distibute count over amount of gene hits: count_to_add = 1 / total_count_for_molecule for gene in molecule.genes: if allele is not None: gene = f'{allele}_{gene}' gene_counts_per_cell[molecule.sample][gene] += count_to_add gene_set.add(gene) sample_set.add(molecule.get_sample()) # Obtain introns/exons/splice junction information: for intron in molecule.introns: gene = intron if allele is not None: gene = f'{allele}_{intron}' intron_counts_per_cell[molecule.sample][gene] += count_to_add gene_set.add(gene) for exon in molecule.exons: gene = exon if allele is not None: gene = f'{allele}_{exon}' exon_counts_per_cell[molecule.sample][gene] += count_to_add gene_set.add(gene) for junction in molecule.junctions: gene = junction if allele is not None: gene = f'{allele}_{junction}' junction_counts_per_cell[molecule.sample][gene] += count_to_add gene_set.add(gene) annotated_molecules += 1 if args.head and (i + 1) > args.head: print( f"-head was supplied, {i} molecules discovered, stopping") break read_molecules += i if args.producebam: output_bam.close() final_bam_path = bam_path_produced.replace('.unsorted', '') sort_and_index(bam_path_produced, final_bam_path, remove_unsorted=True) return ( gene_set, sample_set, gene_counts_per_cell, junction_counts_per_cell, exon_counts_per_cell, intron_counts_per_cell, annotated_molecules, read_molecules, contig )
genomeString.append(call['reference_base']) else: unmethylated_hits += 1 # NEED TO REMOVE THIS CODE ENTIRELY!! <- VB if args.table is not None: for binIdx in singlecellmultiomics.utils.coordinate_to_bins( location, args.bin_size, args.sliding_increment): bin_start, bin_end = binIdx if bin_start < 0 or bin_end > ref_lengths[ molecule.chromosome]: continue if args.stranded: binned_data[(chromosome, molecule.get_strand_repr(), binIdx)][molecule.get_sample()][ call['context'].isupper()] += 1 cell_count[molecule.get_sample()] += 1 else: binned_data[(chromosome, binIdx)][molecule.get_sample()][ call['context'].isupper()] += 1 cell_count[molecule.get_sample()] += 1 ### if args.bed is not None: # Skip non-selected contexts only for table if contexts is not None and call['context'] not in contexts: continue else: # name = cell barcode + context name = ":".join(
def obtain_variant_statistics(alignment_file_paths, cell_obs, statistics, cell_call_data, reference, chromosome, ssnv_position, gsnv_position, haplotype_scores, WINDOW_RADIUS, out, min_read_obs, read_groups, umi_hamming_distance, args): """ Obtain statistics from known gsnv-phased variant location Args: alignment_file_paths (list) : List of handles to Pysam.Alignment files from which to extract molecules cell_obs ( collections.defaultdict(lambda: collections.defaultdict( collections.Counter) ) ) statistics ( collections.defaultdict(lambda: collections.defaultdict( collections.Counter) ) ) cell_call_data(collections.defaultdict(dict) ) haplotype_scores(dict) reference (pysamiterators.CachedFasta) chromosome (str) ssnv_position (int) : zero based ssnv position gsnv_position (int) : zero based gsnv position WINDOW_RADIUS(int) out(pysam.AlignmentFile ) min_read_obs(int) read_groups(set) umi_hamming_distance(int) args """ sSNV_ref_base = reference.fetch(chromosome, ssnv_position, ssnv_position + 1) gSNV_ref_base = reference.fetch(chromosome, gsnv_position, gsnv_position + 1) window_molecules = [] cell_read_obs = collections.defaultdict( collections.Counter) # sample -> tuple -> amount of reads region_start = ssnv_position - WINDOW_RADIUS region_end = ssnv_position + WINDOW_RADIUS for pathi, alignment_path in enumerate(alignment_file_paths): # Perform re-alignment: if args.realign: target_bam = f'align_{chromosome}_{region_start}_{region_end}.bam' if not os.path.exists(target_bam): temp_target_bam = f'{target_bam}.temp.bam' temp_target_bai = f'{target_bam}.temp.bai' GATK_indel_realign( alignment_path, temp_target_bam, chromosome, region_start, region_end, args.indelvcf, gatk_path=args.gatk3_path, interval_path=None, java_cmd= f'java -jar -Xmx{args.realign_mem}G -Djava.io.tmpdir=./gatk_tmp', reference=reference.handle.filename.decode('utf8'), interval_write_path= f'./align_{chromosome}_{region_start}_{region_end}.intervals' ) print(f'Renaming {temp_target_bam} > {target_bam}') os.rename(temp_target_bam, target_bam) os.rename(temp_target_bai, target_bam.replace('.bam', '.bai')) alignment_path = target_bam with pysam.AlignmentFile( alignment_path, ignore_truncation=args.ignore_bam_issues) as alignments: for molecule_id, molecule in enumerate( singlecellmultiomics.molecule.MoleculeIterator( alignments, fragment_class_args={ 'umi_hamming_distance': umi_hamming_distance, }, molecule_class_args={'reference': reference}, moleculeClass=singlecellmultiomics.molecule. NlaIIIMolecule, fragmentClass=singlecellmultiomics.fragment. NLAIIIFragment, start=ssnv_position - WINDOW_RADIUS, end=ssnv_position + WINDOW_RADIUS, contig=chromosome)): # For every molecule obtain the consensus from which to extract # the gSNV and sSNV: try: consensus = molecule.get_consensus() except Exception as e: if str( e ) == 'Could not extract any safe data from molecule': statistics[(chromosome, ssnv_position)]['R2_unmapped'][True] += 1 else: print(e) continue # Extract the gSNV and sSNV: ssnv_state = consensus.get((chromosome, ssnv_position)) gsnv_state = consensus.get((chromosome, gsnv_position)) # Store all used molecules in the window for inspection: window_molecules.append((molecule, ssnv_state, gsnv_state)) # If both the ssnv and gsnv are none there is no information we # can use. if ssnv_state is None and gsnv_state is None: continue # Store the observation # the amount of reads of evidence is len(molecule) cell_obs[(chromosome, ssnv_position)][molecule.get_sample()][( ssnv_state, gsnv_state)] += 1 cell_read_obs[molecule.get_sample()][( ssnv_state, gsnv_state)] += len(molecule) # Store statistics statistics[(chromosome, ssnv_position)]['max_mapping_quality'][ molecule.get_max_mapping_qual()] += 1 statistics[(chromosome, ssnv_position)]['fragment_size'][ molecule.get_safely_aligned_length()] += 1 statistics[(chromosome, ssnv_position)]['ivt_dups'][len( molecule.get_rt_reactions())] += 1 statistics[(chromosome, ssnv_position)]['undigested'][ molecule.get_undigested_site_count()] += 1 statistics[(chromosome, ssnv_position)]['reads'][len(molecule)] += 1 statistics[(chromosome, ssnv_position)]['molecules'][1] += 1 # Store alignment statistics: for operation, per_bp in molecule.get_alignment_stats().items( ): statistics[(chromosome, ssnv_position)][operation][per_bp] += 1 try: statistics[(chromosome, ssnv_position)]['ssnv_ref_phred'][ molecule.get_mean_base_quality(chromosome, ssnv_position, sSNV_ref_base)] += 1 except BaseException: pass try: statistics[(chromosome, ssnv_position)]['ssnv_alt_phred'][ molecule.get_mean_base_quality( chromosome, ssnv_position, not_base=sSNV_ref_base)] += 1 except BaseException: pass try: statistics[(chromosome, ssnv_position)]['gsnv_ref_phred'][ molecule.get_mean_base_quality(chromosome, gsnv_position, gSNV_ref_base)] += 1 except BaseException: pass try: statistics[(chromosome, ssnv_position)]['gsnv_any_alt_phred'][ molecule.get_mean_base_quality( chromosome, gsnv_position, not_base=gSNV_ref_base)] += 1 except BaseException: pass # After finishing iteration over all molecules assign genotypes chrom, pos = chromosome, ssnv_position obs_for_cells = cell_obs[(chrom, pos)] sSNV_alt_base = None gSNV_alt_base = None genotype_obs = collections.Counter() complete_genotype_obs = collections.Counter() sSNV_obs_phased = collections.Counter() gSNV_obs_phased = collections.Counter() sSNV_obs = collections.Counter() gSNV_obs = collections.Counter() for cell, cell_data in obs_for_cells.items(): for ssnv, gsnv in cell_data: genotype_obs[(ssnv, gsnv)] += 1 gSNV_obs[gsnv] += 1 sSNV_obs[ssnv] += 1 if ssnv is not None and gsnv is not None: complete_genotype_obs[(ssnv, gsnv)] += 1 # Only count these when the germline variant is detected gSNV_obs_phased[gsnv] += 1 sSNV_obs_phased[ssnv] += 1 print(Style.BRIGHT + f'Genotype observations for variant {chrom}:{pos}' + Style.RESET_ALL) print('som\tgerm\tobs') for (ssnv, gsnv), obs in complete_genotype_obs.most_common(): print(f' {ssnv}\t{gsnv}\t{obs}') if len(complete_genotype_obs) <= 2: print(f'not enough genotype observations for a variant call (<=2)') ### Conbase algorithm : ### # # determine if there is an alternative base in the first place # a fraction of the reads in a cell need to vote for a tuple, # this fraction is stored in the alpha parameter , or a minimum amount of reads, stored in the beta parameter # determine tp*, the alleles we expect observe # ϴ τ α γ κ λ ν ξ ρ ϕ α = 0.2 # minimum relative abundance of sSNV voting reads in single sample β = 3 # minimum amount of sSSNV reads in cell, or in total if α is exceeded γ = 0.9 # minimum amount of votes for sSNV ε = 2 # minimum amount of cells voting for sSNV ω = 0.9 # gsnv majority sSNV_votes = collections.Counter() # { sSNV_alt_base : votes } total_samples_which_voted = 0 for sample, observed_tuples in cell_read_obs.items(): # First we create a Counter just counting the amount of evidence per # base for this sample : evidence_total_reads = collections.Counter() total_reads = 0 for (sSNV_state, gSNV_state), reads in observed_tuples.most_common(): if sSNV_state is None: continue evidence_total_reads[sSNV_state] += reads total_reads += reads # this is the amount of reads which contain evidence for the reference # base ref_sSNV_reads = evidence_total_reads[sSNV_ref_base] votes_for_this_sample = set( ) # the alternative bases this sample votes for for sSNV_state, sSNV_supporting_reads in evidence_total_reads.most_common( ): # The reference base does not vote. if sSNV_state == sSNV_ref_base or sSNV_state is None: continue # check if at least alpha reads vote for the sSNV alpha_value = 0 if ref_sSNV_reads == 0 else sSNV_supporting_reads / ref_sSNV_reads vote = (1 if (alpha_value >= α and (sSNV_supporting_reads + ref_sSNV_reads) >= β) or (alpha_value < α and sSNV_supporting_reads >= β) else 0) print( f'{sample}\tsSNV alt:{sSNV_state}\t{sSNV_supporting_reads}\tsSNV ref:{ref_sSNV_reads}\t{ref_sSNV_reads}\tα:{alpha_value}\t{"votes" if vote else "no vote"}' ) if vote: votes_for_this_sample.add(sSNV_state) sSNV_votes[sSNV_state] += 1 total_samples_which_voted += 1 # done voting. # the most probable variant base is at least 90% voted for (lambda parameter) # and at least ε cells need to vote for it statistics[( chromosome, ssnv_position)]['total_samples_voted'] = total_samples_which_voted if total_samples_which_voted < ε: # We don't have enough votes print(f'Not enough votes {total_samples_which_voted} < ε:{ε}') return else: print(f'Enough votes {total_samples_which_voted} >= ε:{ε}') sSNV_alt_base, sSNV_alt_obs = sSNV_votes.most_common()[0] statistics[(chromosome, ssnv_position)]['sSNV_alt_vote_ratio'] = ( sSNV_alt_obs / total_samples_which_voted) if (sSNV_alt_obs / total_samples_which_voted) < γ: # The ratio of votes is below threshold print( f'sSNV alt is {sSNV_alt_base}, ratio threshold γ:{γ} , not met with {sSNV_alt_obs / total_samples_which_voted}' ) return print( f'sSNV alt is {sSNV_alt_base}, γ: {sSNV_alt_obs / total_samples_which_voted} >= {γ}' ) ### Here the "Stats" part of Conbase ends ### ############################################# # Now we determined the sSNV alt base, # now determine the linked gSNV gSNV_alt_base = None # Lazy not defined before for basecall, obs in gSNV_obs_phased.most_common(): if basecall != gSNV_ref_base: gSNV_alt_base = basecall break if sSNV_alt_base is None or gSNV_alt_base is None: # No phased alt base found ... print(f'No phased allele found') return # Determine the phase (most common genotypes) sSNV_phase = None wt_allele_gSNV = None snv_allele_gSNV = None sSNV_phased_votes = sum( (obs for (sSNV_state, gSNV_state), obs in complete_genotype_obs.most_common() if sSNV_state == sSNV_alt_base and gSNV_state is not None)) if sSNV_phased_votes == 0: print('No votes cast for phasing the selected alternative allele') return # Find the phased germline variant: for (sSNV_state, gSNV_state), this_phase_obs in complete_genotype_obs.most_common(): if sSNV_state != sSNV_alt_base or gSNV_state is None: continue print( f'There are {sSNV_phased_votes} votes for the haplotype {sSNV_state} {gSNV_state}, ratio:{this_phase_obs / sSNV_phased_votes} ' ) if (this_phase_obs / sSNV_phased_votes) < ω: print(f'This does not pass the threshold ω {ω} ') return else: print(f'This does pass the threshold ω {ω} ') break sSNV_phase = (sSNV_state, gSNV_state) phased_gSNV = gSNV_state snv_allele_gSNV = None if gSNV_state == gSNV_ref_base: # the reference allele is alt wt_allele_gSNV = gSNV_alt_base snv_allele_gSNV = gSNV_ref_base else: wt_allele_gSNV = gSNV_ref_base snv_allele_gSNV = gSNV_alt_base # the reference allele is ref # break ? why? statistics[(chromosome, ssnv_position)]['sSNV_gSNV_phase'] = snv_allele_gSNV if snv_allele_gSNV is None: print("No germline variant was phased") return # The valid tuples are thus: uninformative_allele = (sSNV_ref_base, wt_allele_gSNV) informative_allele_wt = (sSNV_ref_base, snv_allele_gSNV) valid_tuples = [ sSNV_phase, # mutated informative_allele_wt, # wt uninformative_allele ] # As we have umi's we just have a threshold for the least amount of reads # we want to observe for a molecule to be taken into account # Count how often we found valid and invalid genotypes valid = 0 invalid = 0 valid_var = 0 invalid_var = 0 for (ssnv, gsnv), tuple_obs in complete_genotype_obs.most_common(): if ssnv == sSNV_alt_base: # variant: if (ssnv, gsnv) in valid_tuples: valid_var += tuple_obs else: invalid_var += tuple_obs if (ssnv, gsnv) in valid_tuples: valid += tuple_obs else: invalid += tuple_obs phase_ratio = 0 if valid_var + invalid_var > 0: phase_ratio = valid_var / (valid_var + invalid_var) # Score Tuples with evidence for variant haplotype_scores[(chrom, pos)] = { 'valid_tuples': valid, 'invalid_tuples': invalid, 'valid_var_tuples': valid_var, 'invalid_var_tuples': invalid_var, 'phasing_ratio': phase_ratio, 'gSNV_allelic_bias': gSNV_obs[gSNV_ref_base] / (gSNV_obs[gSNV_ref_base] + gSNV_obs[gSNV_alt_base]) } print(f'Germline variant obs: {gSNV_ref_base} {gSNV_alt_base}') print(f'sSNV obs: {sSNV_ref_base} {sSNV_alt_base}') if sSNV_phase is not None: print(f'sSNV variant is phased with {phased_gSNV}') print(Style.BRIGHT + 'Valid tuples:' + Style.RESET_ALL) for g, s in valid_tuples: print(f' {g}\t{s}') print(Style.BRIGHT + 'Scores:' + Style.RESET_ALL) for name, obs in haplotype_scores[(chrom, pos)].items(): print(f' {name}\t{obs}') # Create the cell call dictionary uninformative_obs = 0 # This is important, otherwise we might use a SNP .. for cell, observed_tuples in cell_read_obs.items(): print(cell, observed_tuples) total_reads = 0 phased_variant_support_reads = 0 unphased_variant_support_reads = 0 variant_neg_support_reads = 0 uninformative_reads = 0 conflict_reads = 0 for (sSNV_state, gSNV_state), reads in observed_tuples.items(): if sSNV_state is None: continue total_reads += reads if sSNV_state == sSNV_alt_base: if gSNV_state == wt_allele_gSNV: conflict_reads += reads elif gSNV_state == snv_allele_gSNV: # reads containing the sSNV and gSNV as expected phased_variant_support_reads += reads elif gSNV_state is None: # reads containing sSNV but not overlapping with gSNV unphased_variant_support_reads += reads elif sSNV_state == sSNV_ref_base: if gSNV_state == snv_allele_gSNV: # reads on informative allele where we found evidence # of the sSNV not being present variant_neg_support_reads += reads elif gSNV_state == wt_allele_gSNV: uninformative_reads += reads uninformative_obs += 1 if total_reads == 0: continue if variant_neg_support_reads > 0: cell_call_data[(chrom, pos)][cell] = 0 if (unphased_variant_support_reads + phased_variant_support_reads) / total_reads > 0.1: cell_call_data[(chrom, pos)][cell] = 1 if unphased_variant_support_reads + phased_variant_support_reads >= 3: cell_call_data[(chrom, pos)][cell] = 10 if (phased_variant_support_reads) / total_reads > 0.1: cell_call_data[(chrom, pos)][cell] = 2 if phased_variant_support_reads >= 3: cell_call_data[(chrom, pos)][cell] = 20 #if uninformative_reads / total_reads > 0.1: # # 0.1 for ref allele obs # cell_call_data[(chrom, pos)][cell] += 0.1 if conflict_reads / (total_reads) > 0.2: cell_call_data[(chrom, pos)][cell] = -1 # invalid haplotype_scores[(chrom, pos)]['uninformative_obs'] = uninformative_obs # Annotate every molecule... for molecule_id, (m, ssnv_state, gsnv_state) in enumerate(window_molecules): m.set_meta('mi', molecule_id) if gsnv_state is None: m.set_meta('gv', '?') else: m.set_meta('gv', gsnv_state) if ssnv_state is None: m.set_meta('sv', '?') else: m.set_meta('sv', ssnv_state) if ssnv_state is None: m.set_meta('VD', 'NO_SNV_OVERLAP') continue if gsnv_state is not None and not (ssnv_state, gsnv_state) in valid_tuples: m.set_meta('VD', 'INVALID_PHASE') continue if ssnv_state == sSNV_alt_base: m.set_meta('VD', 'SNV_ALT') continue if ssnv_state == sSNV_ref_base and gsnv_state == phased_gSNV: m.set_meta('VD', 'SNV_REF') continue if gsnv_state != phased_gSNV: m.set_meta('VD', 'UNINFORMATIVE_ALLELE') continue m.set_meta('VD', 'REJECTED') # write for m, ssnv_state, gsnv_state in window_molecules: m.write_tags() m.write_pysam(out) # Update read groups for fragment in m: read_groups.add(fragment.get_read_group())