import glob
import multiprocessing
import os
import traceback
from collections import defaultdict

import distributed
import numpy as np
import pandas as pd
import pysam
from termcolor import colored

# Project-local imports (Genome, BinParams, MappableBin); the exact module
# paths are assumed from usage and may need adjusting to the repository layout.
from .genome import Genome
from .utils import BinParams, MappableBin


class VarbinPipeline(object):

    def __init__(self, config):
        self.config = config
        self.genome = Genome(config)

    def find_bin_index(self, bins, abspos):
        # Vectorized equivalent of find_bin_index_binsearch: index of the
        # rightmost bin whose start boundary is <= abspos.
        index = np.searchsorted(bins, abspos, side='right')
        return index - 1

    def mapping_all_filenames(self):
        pattern = os.path.join(
            self.config.mapping.mapping_dir,
            "*{}".format(self.config.mapping.mapping_suffix))
        filenames = glob.glob(pattern)
        return filenames

    def find_bin_index_binsearch(self, bins, abspos):
        # Binary search for the rightmost bin start <= abspos.
        index_up = len(bins)
        index_down = 0
        index_mid = int((index_up - index_down) / 2.0)

        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid
                index_mid = int((index_up - index_down) / 2.0) + index_mid
            else:
                index_up = index_mid
                index_mid = int((index_up - index_down) / 2.0) + index_down
            if index_up - index_down < 2:
                break

        return index_down

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.genome.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.genome.chrom_sizes()
            chroms = set(self.genome.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue
                # keep only reads that map exactly, without indels or clipping
                if seg.cigarstring != f'{seg.reference_length}M':
                    print("non exact mapping:", seg, seg.cigarstring)
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)
                bin_counts[index] += 1
                prev_pos = abspos

            result = []
            for index, row in bins_df.iterrows():
                bin_count = bin_counts[index]
                result.append([
                    row['bin.chrom'],
                    row['bin.start'],
                    row['bin.start.abspos'],
                    bin_count,
                ])
            df = pd.DataFrame.from_records(
                result,
                columns=['chrom', 'chrompos', 'abspos', 'bincount'])
            df.sort_values(by=['abspos'], inplace=True)

            total_count = df.bincount.sum()
            total_reads_per_bin = float(total_count) / len(bins_df)
            df['ratio'] = df.bincount / total_reads_per_bin
            return df
        except Exception:
            traceback.print_exc()
            raise

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(colored(
            "processing cell {}; reading from {}; writing to {}".format(
                cellname, mapping_filename, outfile),
            "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(colored(
                "output file {} exists; add --force to overwrite".format(
                    outfile),
                "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self, dask_client):
        mapping_filenames = self.mapping_all_filenames()
        print(colored(
            "processing files: {}".format(mapping_filenames),
            "green"))
        if self.config.dry_run:
            return

        assert dask_client is not None

        os.makedirs(self.config.varbin.varbin_dir, exist_ok=True)
        delayed_tasks = dask_client.map(self.run_once, mapping_filenames)
        distributed.wait(delayed_tasks)
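
# A minimal sanity sketch, not part of the pipeline: for sorted bin start
# positions, find_bin_index_binsearch(bins, abspos) should agree with the
# vectorized np.searchsorted(bins, abspos, side='right') - 1 used by
# find_bin_index. The toy boundaries below are illustrative only; real
# values come from Genome.bins_boundaries().
def _check_bin_index_lookup():
    bins = np.array([0, 100, 250, 400, 900])
    for abspos in (0, 99, 100, 399, 899, 950):
        expected = int(np.searchsorted(bins, abspos, side='right') - 1)
        # self is unused by find_bin_index_binsearch, so None is fine here
        actual = VarbinPipeline.find_bin_index_binsearch(None, bins, abspos)
        assert actual == expected, (abspos, actual, expected)
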
class BinsPipeline(object):

    def __init__(self, config):
        self.config = config
        self.genome = Genome(self.config)

    def calc_bins_gc_content(self, chroms, bins_df):
        result = []
        for chrom in chroms:
            chrom_df = bins_df[bins_df['bin.chrom'] == chrom]
            gc_df = chrom_df.copy()
            gc_df.reset_index(inplace=True, drop=True)
            gc_series = pd.Series(index=gc_df.index, dtype=float)

            chrom_seq = self.genome.load_chrom(chrom)
            for index, row in gc_df.iterrows():
                start = row['bin.start']
                end = row['bin.end']
                seq = chrom_seq.seq[start:end]
                # GC fraction over counted bases only; 'N' and soft-masked
                # lowercase bases are excluded by the exact-case counts
                counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
                total_counts = sum(counts)
                if total_counts == 0:
                    gc = 0.0
                else:
                    gc = float(sum(counts[0:2])) / total_counts
                gc_series.iloc[index] = gc
            gc_df['gc.content'] = gc_series
            result.append(gc_df)

        assert len(result) > 0
        if len(result) == 1:
            return result[0]
        df = pd.concat(result)
        return df

    def bins_boundaries_generator(self, chroms, mappable_regions_df):
        chrom_sizes = self.genome.chrom_sizes()
        chrom_bins = self.genome.calc_chrom_bins()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            params = BinParams.build(
                chrom_size=chrom_sizes[chrom],
                chrom_bin=chrom_bins[chrom])
            mappable_bin = None
            current_excess = 0
            bins_count = params.bins_count

            for row in chrom_df.to_dict(orient="records"):
                if mappable_bin is None:
                    mappable_bin = MappableBin.from_start(params, start_pos=0)
                    current_excess = mappable_bin.adapt_excess(current_excess)

                if not mappable_bin.check_extend(row):
                    next_bin = mappable_bin.split_extend(row)
                    bins_count -= 1
                    if bins_count == 0:
                        # last bin of the chromosome
                        mappable_bin.end_pos = chrom_sizes[chrom].size
                    yield mappable_bin

                    if next_bin.is_overfill():
                        current_excess, mappable_bins = \
                            next_bin.overfill_split(current_excess)
                        assert len(mappable_bins) > 1
                        for mb in mappable_bins[:-1]:
                            bins_count -= 1
                            yield mb
                        mappable_bin = mappable_bins[-1]
                    else:
                        mappable_bin = next_bin
                        current_excess = \
                            mappable_bin.adapt_excess(current_excess)

            mappable_bin = None

    def calc_bins_boundaries(self, chroms=None, regions_df=None):
        if chroms is None:
            chroms = self.genome.version.CHROMS

        bin_rows = []
        for mbin in self.bins_boundaries_generator(chroms, regions_df):
            bin_rows.append(
                (mbin.chrom, mbin.start_pos, mbin.start_abspos,
                 mbin.end_pos, mbin.end_pos - mbin.start_pos,
                 mbin.bin_size))

        df = pd.DataFrame.from_records(
            bin_rows,
            columns=[
                'bin.chrom',
                'bin.start',
                'bin.start.abspos',
                'bin.end',
                'bin.length',
                'mappable.positions',
            ])
        df.sort_values(by=['bin.start.abspos'], inplace=True)
        return df

    def load_mappable_regions(self, chrom=None):
        filename = self.config.mappable_regions_filename(chrom=chrom)
        df = pd.read_csv(
            filename,
            names=['chrom', 'start_pos', 'end_pos'],
            sep='\t')
        df = df.sort_values(by=['chrom', 'start_pos', 'end_pos'])
        assert len(df) > 0
        return df

    def run_once(self, chrom):
        print(colored(
            f"started calculating bins for chromosome {chrom}",
            "green"))

        regions_df = self.load_mappable_regions(chrom=chrom)
        bins_df = self.calc_bins_boundaries([chrom], regions_df)
        df = self.calc_bins_gc_content([chrom], bins_df)

        outfilename = self.config.bins_boundaries_filename(chrom)
        print(colored(
            f"saving bins for chromosome {chrom} into {outfilename}",
            "green"))
        df.to_csv(outfilename, sep='\t', index=False)
        return outfilename

    def concatenate_all_chroms(self):
        outfilename = self.config.bins_boundaries_filename()
        if os.path.exists(outfilename) and not self.config.force:
            print(colored(
                "destination bins boundaries file already exists; "
                "use --force to overwrite",
                "red"))
            raise ValueError("destination file exists... use --force")
        if self.config.dry_run:
            return

        dataframes = []
        for chrom in self.genome.version.CHROMS:
            srcfilename = self.config.bins_boundaries_filename(chrom)
            df = pd.read_csv(srcfilename, sep='\t')
            dataframes.append(df)

        outdf = pd.concat(dataframes, ignore_index=True)
        outdf.sort_values(
            by=['bin.start.abspos', 'bin.start', 'bin.end'],
            inplace=True)
        outdf.to_csv(outfilename, sep='\t', index=False)

    def run(self, dask_client):
        outfilename = self.config.bins_boundaries_filename()
        os.makedirs(os.path.dirname(outfilename), exist_ok=True)

        print(colored(
            "going to compute bin boundaries from mappable regions: {} "
            "into bins boundaries file {}".format(
                self.config.mappable_regions_filename(), outfilename),
            "green"))

        if os.path.exists(outfilename) and not self.config.force:
            print(colored(
                "output file {} already exists; "
                "use --force to overwrite".format(outfilename),
                "red"))
            raise ValueError("output file already exists")
        if self.config.dry_run:
            return

        assert self.genome.chrom_sizes() is not None

        delayed_tasks = dask_client.map(
            self.run_once, self.genome.version.CHROMS)
        print(len(delayed_tasks), delayed_tasks)
        print(dask_client.scheduler_info())
        distributed.wait(delayed_tasks)

        for task in delayed_tasks:
            outfile = task.result()
            print(outfile, os.path.exists(outfile))

        self.concatenate_all_chroms()
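
# A small, self-contained illustration of the GC computation performed by
# calc_bins_gc_content: GC fraction = (#G + #C) / (#G + #C + #A + #T), with
# 'N' and soft-masked lowercase bases excluded by the exact-case counting.
# The sequences below are toy values; real ones come from Genome.load_chrom().
def _example_gc_fraction(seq):
    counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
    total = sum(counts)
    return float(counts[0] + counts[1]) / total if total else 0.0


def _check_gc_fraction():
    assert _example_gc_fraction("GGCCAT") == 4.0 / 6.0
    # bins with no countable bases get gc.content == 0.0 by convention
    assert _example_gc_fraction("NNNN") == 0.0
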
class VarbinPipeline(object):
    # Alternative variant of the VarbinPipeline above: same binning logic,
    # but parallelized with multiprocessing instead of a dask cluster.

    def __init__(self, config):
        self.config = config
        self.hg = Genome(config)

    def find_bin_index(self, bins, abspos):
        # Vectorized bin lookup; see find_bin_index_binsearch.
        index = np.searchsorted(bins, abspos, side='right')
        return index - 1

    def find_bin_index_binsearch(self, bins, abspos):
        # Binary search for the rightmost bin start <= abspos.
        index_up = len(bins)
        index_down = 0
        index_mid = int((index_up - index_down) / 2.0)

        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid
                index_mid = int((index_up - index_down) / 2.0) + index_mid
            else:
                index_up = index_mid
                index_mid = int((index_up - index_down) / 2.0) + index_down
            if index_up - index_down < 2:
                break

        return index_down

    def varbin(self, filename):
        try:
            assert os.path.exists(filename), os.path.abspath(filename)

            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.hg.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.hg.chrom_sizes()
            chroms = set(self.hg.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0

            prev_pos = 0
            bin_counts = defaultdict(int)

            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue

                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)
                bin_counts[index] += 1
                prev_pos = abspos
        except Exception:
            traceback.print_exc()
            raise

        number_of_reads_per_bin = float(count) / len(bins_df)
        result = []
        for index, row in bins_df.iterrows():
            bin_count = bin_counts[index]
            ratio = float(bin_count) / number_of_reads_per_bin
            result.append([
                row['bin.chrom'],
                row['bin.start'],
                row['bin.start.abspos'],
                bin_count,
                ratio,
            ])
        df = pd.DataFrame.from_records(
            result,
            columns=['chrom', 'chrompos', 'abspos', 'bincount', 'ratio'])
        df.sort_values(by=['abspos'], inplace=True)
        return df

    def run_once(self, mapping_filename):
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(colored(
            "processing cell {}; reading from {}; writing to {}".format(
                cellname, mapping_filename, outfile),
            "green"))

        if os.path.exists(outfile) and not self.config.force:
            print(colored(
                "output file {} exists; add --force to overwrite".format(
                    outfile),
                "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self):
        mapping_filenames = self.config.mapping_filenames()
        print(colored(
            "processing files: {}".format(mapping_filenames),
            "green"))
        pool = multiprocessing.Pool(processes=self.config.parallel)
        pool.map(self.run_once, mapping_filenames)
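
# A toy illustration of the per-bin ratio normalization used in varbin():
# each bin count is divided by the mean count per bin, so a bin with an
# average number of reads gets ratio == 1.0. Counts here are made up.
def _example_bincount_ratio():
    counts = pd.Series([10, 20, 30, 40])
    reads_per_bin = counts.sum() / len(counts)   # mean count per bin: 25.0
    ratio = counts / reads_per_bin
    assert list(ratio) == [0.4, 0.8, 1.2, 1.6]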