# NOTE(review): this chunk begins mid-function — the enclosing `def` is not
# visible in this view, so the indentation of the first two statements below
# is reconstructed from syntax; confirm against the full file.
        likely_stutter += nreads[1]
    # Per-locus summary vector:
    #   [ total reads, reads supporting likely stutter, 1 (locus counter),
    #     1 if any likely stutter was observed else 0 ]
    # presumably consumed by an accumulator that sums these vectors — TODO confirm
    return array([ sum(nreads), likely_stutter, 1,
                   1 if likely_stutter else 0 ])


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--error-distn-file', metavar='file', type=str,
        required=True,
        help="File to which STR polymorphism error rates will be written.")
    parser.add_argument('--filter-metrics-file', metavar='file', type=str,
        help='File to store metrics related to locus and read filtering.')
    parser.add_argument('--single-cell', action='store_true', default=False,
        help="Library was generated from a single cell. Disables the " \
             "binomial model for >1 primary alleles.")
    # Locus/read filtering options are contributed by the iterator class
    # itself so they stay in sync with what STRLocusIterator accepts.
    STRLocusIterator.add_parser_args(parser)
    args = parser.parse_args()

    # Many of the command line args are STRLocusWalker parameters.  Copy the
    # parsed namespace and strip the three options this script handles
    # directly, so the remainder can be passed through as keyword args.
    lw_params = dict(vars(args))
    del(lw_params['error_distn_file'])
    del(lw_params['single_cell'])
    del(lw_params['filter_metrics_file'])

    # Step 1. Generate an STR length polymorphism error profile and save it.
    errors = profile_error_distn(lw_params, is_single_cell=args.single_cell)
    save_error_distn(args.error_distn_file, errors)

    # Step 2. Now that we have an empirical distribution of STR polymorphism
    # error rates, genotype the loci. The results are printed to stdout.
    genotype(lw_params, args.filter_metrics_file, errors)
should be haploid and thus deviations from expectation should reflect
experimental errors.
  * maximum mapQ option: --mapq60, only use mapq60 (very high confidence)
    alignments
  * minimum repeat unit option: --min-unit (recommended=3, default=1?).
    Only consider loci where the repeat unit (end-start+1) / unit size is
    greater than the specified value. This could be useful to remove
    questionable loci, like 2~3 repeat units of tri or tetranucleotide
    repeats.
"""
import sys
from argparse import ArgumentParser

from strlocusiterator import STRLocusIterator

# All command-line options for this script come from STRLocusIterator, so the
# filtering behaviour measured here matches the iterator's own configuration.
parser = ArgumentParser()
STRLocusIterator.add_parser_args(parser)
args = parser.parse_args()

# Drive the iterator over every locus.  The loop body is intentionally empty:
# iterating is what makes the iterator accumulate its filter/histogram
# metrics, which are read back out below.
locus_f = STRLocusIterator(**vars(args))
for (chrom, start, end, unit, region, reads) in locus_f:
    # Don't do anything, just accumulate metrics
    continue

# Scalar filter metrics: one "description<TAB>count" line each.
for (description, value) in locus_f.filter_metrics():
    print("%s\t%d" % (description, value))

# Histogram metrics: a description line followed by sorted "key<TAB>count"
# rows for each histogram.
for (description, hist) in locus_f.hist_metrics():
    print(description)
    for k in sorted(hist.keys()):
        print("%s\t%d" % (k, hist[k]))