pos, read_details = ht.pysam_to_hdf(bam_paths[0]) if not bam_input and config.getboolean('behavior', 'deletebam'): os.remove(bam_paths[0]) binary = np.sign(pos) # dtype=np.uint16 # dimensionality reduction and typing alleles_to_keep = list(filter(is_frequent, binary.columns)) binary = binary[alleles_to_keep] if VERBOSE: print("\n", ht.now(), 'temporary pruning of identical rows and columns') unique_col, representing = ht.prune_identical_alleles(binary, report_groups=True) representing_df = pd.DataFrame([[a1, a2] for a1, a_l in representing.items() for a2 in a_l], columns=['representative', 'represented']) temp_pruned = ht.prune_identical_reads(unique_col) if VERBOSE: print("\n", ht.now(), 'Size of mtx with unique rows and columns:', temp_pruned.shape) print(ht.now(), 'determining minimal set of non-overshadowed alleles') minimal_alleles = ht.prune_overshadowed_alleles(temp_pruned) if VERBOSE:
else: print "\nCould not match paired-end pairs. Switching to single-end pipeline." binary = binary1 is_paired = False else: pos, etc, desc = ht.sam_to_hdf(out_dir+"/"+date+"_0.sam", verbosity=args.verbose) binary = pos.applymap(bool).applymap(int) #dimensionality reduction and typing alleles_to_keep = filter(is_frequent, binary.columns) binary = binary[alleles_to_keep] if args.verbose: print "\n", ht.now(), 'temporary pruning of identical rows and columns' unique_col, representing = ht.prune_identical_alleles(binary, report_groups=True) representing_df = pd.DataFrame([[a1, a2] for a1, a_l in representing.iteritems() for a2 in a_l], columns=['representative', 'represented']) temp_pruned = ht.prune_identical_reads(unique_col) if args.verbose: print "\n", ht.now(), 'Size of mtx with unique rows and columns:', temp_pruned.shape print ht.now(), 'determining minimal set of non-overshadowed alleles' minimal_alleles = ht.prune_overshadowed_alleles(temp_pruned) if args.verbose: print "\n", ht.now(), 'Keeping only the minimal number of required alleles', minimal_alleles.shape binary = binary[minimal_alleles]