def __init__(self, config, aligner=None):
    """Store the configuration and resolve the aligner to use.

    When no aligner is passed explicitly, fall back to the one carried
    by the genome; either way an aligner must end up configured.
    """
    self.config = config
    self.genome = Genome(self.config)
    if aligner is None:
        # No explicit aligner supplied: the genome must provide one.
        assert self.genome.aligner is not None
        self.aligner = self.genome.aligner
    else:
        self.aligner = aligner
    assert self.aligner is not None
def run(self):
    """Stream every generated record to the aligner input as FASTA, then
    flush and close the input channel."""
    sink = self.aligner_input
    for record in self.input_function_generator:
        sink.write(Genome.to_fasta_string(record))
    sink.flush()
    sink.close()
def __init__(self, config):
    """Load the 10x cell summary table and verify BAM/BAI inputs exist.

    Builds ``self.barcodes`` mapping each cell barcode to its cell id.
    """
    self.config = config
    data_10x = self.config.data_10x
    self.summary_filename = data_10x.data_10x_cell_summary
    self.bam_filename = data_10x.data_10x_bam
    self.bai_filename = data_10x.data_10x_bai
    # Fail fast (in the original order) if any input file is missing.
    for required in (
            self.summary_filename, self.bam_filename, self.bai_filename):
        assert os.path.exists(required), required
    self.summary_df = pd.read_csv(self.summary_filename, sep=',')
    # Map each cell barcode to its numeric cell id.
    self.barcodes = {
        barcode: cell_id
        for barcode, cell_id in
        self.summary_df[['barcode', 'cell_id']].to_records(index=False)
    }
    self.genome = Genome(self.config)
    assert self.genome is not None
class BinsPipeline(object):
    """Computes genome bin boundaries (from mappable regions) and per-bin
    GC content, fanning the per-chromosome work out over a dask cluster."""

    def __init__(self, config):
        self.config = config
        self.genome = Genome(self.config)

    def calc_bins_gc_content(self, chroms, bins_df):
        """Return *bins_df* restricted to *chroms* with a 'gc.content'
        column added (fraction of G+C among counted A/C/G/T bases)."""
        result = []
        for chrom in chroms:
            chrom_df = bins_df[bins_df['bin.chrom'] == chrom]
            gc_df = chrom_df.copy()
            gc_df.reset_index(inplace=True, drop=True)
            gc_series = pd.Series(index=gc_df.index)
            chrom_seq = self.genome.load_chrom(chrom)
            for index, row in gc_df.iterrows():
                start = row['bin.start']
                end = row['bin.end']
                seq = chrom_seq.seq[start:end]
                counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
                total_counts = sum(counts)
                if total_counts == 0:
                    # Bin has no A/C/G/T at all (e.g. all 'N'): define GC as 0.
                    gc = 0.0
                else:
                    gc = float(sum(counts[0:2])) / total_counts
                gc_series.iloc[index] = gc
            gc_df['gc.content'] = gc_series
            result.append(gc_df)
        assert len(result) > 0
        if len(result) == 1:
            return result[0]
        return pd.concat(result)

    def bins_boundaries_generator(self, chroms, mappable_regions_df):
        """Yield MappableBin boundary objects covering each chromosome.

        Walks the sorted mappable regions of a chromosome, extending the
        current bin until it is full, splitting (and, on overfill, multi-
        splitting) as needed; the last bin is stretched to the chromosome
        end.
        """
        chrom_sizes = self.genome.chrom_sizes()
        chrom_bins = self.genome.calc_chrom_bins()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            params = BinParams.build(
                chrom_size=chrom_sizes[chrom],
                chrom_bin=chrom_bins[chrom])
            mappable_bin = None
            current_excess = 0
            bins_count = params.bins_count

            for row in chrom_df.to_dict(orient="records"):
                if mappable_bin is None:
                    mappable_bin = MappableBin.from_start(params, start_pos=0)
                    current_excess = mappable_bin.adapt_excess(current_excess)
                if not mappable_bin.check_extend(row):
                    next_bin = mappable_bin.split_extend(row)
                    bins_count -= 1
                    if bins_count == 0:
                        # Last bin of the chromosome: stretch to chrom end.
                        mappable_bin.end_pos = chrom_sizes[chrom].size
                    yield mappable_bin
                    if next_bin.is_overfill():
                        current_excess, mappable_bins = \
                            next_bin.overfill_split(current_excess)
                        assert len(mappable_bins) > 1
                        for mb in mappable_bins[:-1]:
                            bins_count -= 1
                            yield mb
                        mappable_bin = mappable_bins[-1]
                    else:
                        mappable_bin = next_bin
                        current_excess = \
                            mappable_bin.adapt_excess(current_excess)
            # Reset state before processing the next chromosome.
            mappable_bin = None

    def calc_bins_boundaries(self, chroms=None, regions_df=None):
        """Build the bins-boundaries DataFrame for *chroms* (defaults to
        all chromosomes of the genome version)."""
        if chroms is None:
            chroms = self.genome.version.CHROMS
        bin_rows = []
        for mbin in self.bins_boundaries_generator(chroms, regions_df):
            bin_rows.append(
                (mbin.chrom, mbin.start_pos, mbin.start_abspos,
                 mbin.end_pos, mbin.end_pos - mbin.start_pos,
                 mbin.bin_size))
        df = pd.DataFrame.from_records(
            bin_rows,
            columns=[
                'bin.chrom', 'bin.start', 'bin.start.abspos',
                'bin.end', 'bin.length', 'mappable.positions'])
        df.sort_values(by=['bin.start.abspos'], inplace=True)
        return df

    def load_mappable_regions(self, chrom=None):
        """Load the mappable-regions table (for a single chromosome when
        *chrom* is given, for the whole genome otherwise)."""
        filename = self.config.mappable_regions_filename(chrom=chrom)
        # BUG FIX: the per-chromosome filename was previously computed but
        # the whole-genome file was read instead; now the requested file
        # is actually used.
        df = pd.read_csv(
            filename,
            names=['chrom', 'start_pos', 'end_pos'],
            sep='\t')
        df = df.sort_values(by=['chrom', 'start_pos', 'end_pos'])
        assert len(df) > 0
        return df

    def run_once(self, chrom):
        """Compute and save the bins table for one chromosome; returns the
        output filename."""
        print(colored(
            f"started calculating bins for chromosome {chrom}", "green"))
        regions_df = self.load_mappable_regions(chrom=chrom)
        bins_df = self.calc_bins_boundaries([chrom], regions_df)
        df = self.calc_bins_gc_content([chrom], bins_df)
        outfilename = self.config.bins_boundaries_filename(chrom)
        print(colored(
            f"saving bins for chromosome {chrom} into {outfilename}",
            "green"))
        df.to_csv(outfilename, sep='\t', index=False)
        return outfilename

    def concatenate_all_chroms(self):
        """Merge per-chromosome bins files into the combined boundaries
        file, sorted by absolute position."""
        outfilename = self.config.bins_boundaries_filename()
        if os.path.exists(outfilename) and not self.config.force:
            print(colored(
                "destination bins boundaries file already exists; "
                "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")
        if self.config.dry_run:
            return
        dataframes = []
        for chrom in self.genome.version.CHROMS:
            srcfilename = self.config.bins_boundaries_filename(chrom)
            dataframes.append(pd.read_csv(srcfilename, sep='\t'))
        outdf = pd.concat(dataframes, ignore_index=True)
        outdf.sort_values(
            by=['bin.start.abspos', 'bin.start', 'bin.end'], inplace=True)
        outdf.to_csv(outfilename, sep='\t', index=False)

    def run(self, dask_client):
        """Compute bins for all chromosomes on the dask cluster, wait for
        completion, then merge the per-chromosome outputs."""
        outfilename = self.config.bins_boundaries_filename()
        os.makedirs(os.path.dirname(outfilename), exist_ok=True)
        print(colored(
            "going to compute bin boundaries from mappable regions: {} "
            "into bins boundaries file {}".format(
                self.config.mappable_regions_filename(), outfilename),
            "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(colored(
                "output file {} already exists; "
                "use --force to overwrite".format(outfilename),
                "red"))
            raise ValueError("output file already exists")
        if self.config.dry_run:
            return
        assert self.genome.chrom_sizes() is not None
        delayed_tasks = dask_client.map(
            self.run_once, self.genome.version.CHROMS)
        print(len(delayed_tasks), delayed_tasks)
        print(dask_client.scheduler_info())
        distributed.wait(delayed_tasks)
        for task in delayed_tasks:
            outfile = task.result()
            print(outfile, os.path.exists(outfile))
        self.concatenate_all_chroms()
class MappableRegionsPipeline(object):
    # Generates "mappable regions" -- intervals where fixed-length synthetic
    # reads align uniquely -- by streaming reads through an external aligner
    # process, one chromosome per dask task.

    def __init__(self, config, aligner=None):
        # Use the explicitly supplied aligner; otherwise fall back to the
        # genome's own aligner, which must then exist.
        self.config = config
        self.genome = Genome(self.config)
        if aligner is not None:
            self.aligner = aligner
        else:
            assert self.genome.aligner is not None
            self.aligner = self.genome.aligner
        assert self.aligner is not None

    def mappable_regions_check(self, chroms, mappable_regions_df):
        # Sanity check: log an error for chromosomes whose regions share a
        # start position (duplicates).
        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        # Yield every read_length-long window of each chromosome as a
        # SeqRecord with id "<chrom>.<1-based start>".
        try:
            for chrom in chroms:
                seq_record = self.genome.load_chrom(chrom)
                for i in range(len(seq_record) - read_length + 1):
                    seq = seq_record.seq[i:i + read_length]
                    out_record = SeqRecord(
                        seq,
                        id="{}.{}".format(chrom, i + 1),
                        description="generated_read")
                    # if 'N' in seq:
                    #     print('skipping: ', out_record)
                    #     continue
                    yield out_record
        finally:
            pass

    def generate_mappable_regions(self, chroms, read_length,
                                  outfile=None, aligner_options=[]):
        # Pipe generated reads into the aligner subprocess on one thread and
        # write its output lines to *outfile* from another, coordinating via
        # a control queue.
        # NOTE(review): mutable default aligner_options=[] is a known Python
        # pitfall; it is only read here, but worth confirming no caller
        # mutates it.
        if outfile is None:
            outfile = sys.stdout
        reads_generator = self.generate_reads(chroms, read_length)

        def aligner_output_process_function(line):
            outfile.write(str(line))
            outfile.write("\n")

        aligner_command = self.aligner.build_mappable_regions_command(
            options=aligner_options)
        print('aligner command:', ' '.join(aligner_command))
        with Popen(aligner_command, stdout=PIPE, stdin=PIPE) as proc:
            control_queue = queue.Queue()
            input_thread = InputGeneratorThread(
                control_queue, proc.stdin, reads_generator)
            output_thread = AlignerOutputProcessingThread(
                control_queue, proc.stdout, aligner_output_process_function)
            input_thread.start()
            output_thread.start()
            while True:
                msg = None
                try:
                    # NOTE(review): a blocking get() without a timeout never
                    # raises queue.Empty, so the except branch looks dead --
                    # confirm whether get(timeout=...) was intended.
                    msg = control_queue.get()
                except queue.Empty:
                    print("timeout - queue empty")
                    msg = None
                if msg == 'out_done':
                    print("output done")
                    break
                if msg == 'in_done':
                    print('input done')
            input_thread.join()
            output_thread.join()

    def mappable_regions_chrom_filename(self, chrom):
        # Per-chromosome output path: "<chrom>_<mappable_file>" in
        # the configured mappable directory.
        mname = "{}_{}".format(
            chrom, self.config.mappable_regions.mappable_file)
        filename = os.path.join(
            self.config.mappable_regions.mappable_dir, mname)
        return filename

    def mappable_regions_filename(self):
        # Combined whole-genome output path.
        mname = self.config.mappable_regions.mappable_file
        filename = os.path.join(
            self.config.mappable_regions.mappable_dir, mname)
        return filename

    def run_once(self, chrom):
        # Compute mappable regions for one chromosome and return the file.
        # NOTE(review): read_length is hardcoded to 50 here, while run()
        # reports config.mappable_regions.mappable_read_length -- confirm
        # which value is authoritative.
        outfilename = self.mappable_regions_chrom_filename(chrom)
        with open(outfilename, "w") as outfile:
            self.generate_mappable_regions(
                [chrom], read_length=50, outfile=outfile)
        return outfilename

    def concatenate_all_chroms(self):
        # Append every per-chromosome file into the combined file.
        dst = self.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    # NOTE(review): adjacent string literals join without a
                    # separator ("...existsuse --force...").
                    "destination mappable regions file already exists"
                    "use --force to overwrite",
                    "red"))
            raise ValueError("destination file exists... use --force")
        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.genome.version.CHROMS:
                    src = self.mappable_regions_chrom_filename(chrom)
                    print(
                        colored("appending {} to {}".format(src, dst),
                                "green"))
                    # NOTE(review): 'src' (the path) is shadowed by the
                    # file object below.
                    with open(src, 'rb') as src:
                        if not self.config.dry_run:
                            shutil.copyfileobj(src, output, 1024 * 1024 * 10)

    def run(self, dask_client):
        # Orchestrate per-chromosome jobs on the dask cluster, then combine
        # results; refuses to overwrite existing output unless forced.
        outfilename = self.mappable_regions_filename()
        print(
            colored(
                "going to generate mappable regions with length {} "
                "from genome {} into {}".format(
                    self.config.mappable_regions.mappable_read_length,
                    self.config.genome.genome_dir,
                    outfilename),
                "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename),
                    "red"))
            raise ValueError("output file already exists")
        genome_index_filenames = self.aligner.genome_index_filenames
        if not os.path.exists(genome_index_filenames[0]):
            print(
                colored(
                    "genome index file {} not found".format(
                        genome_index_filenames),
                    "red"))
            raise ValueError("genome index file not found")
        if self.config.dry_run:
            return
        os.makedirs(
            self.config.mappable_regions.mappable_dir, exist_ok=True)
        assert dask_client
        delayed_tasks = dask_client.map(
            self.run_once, self.genome.version.CHROMS)
        distributed.wait(delayed_tasks)
        for fut in delayed_tasks:
            print("fut done:", fut.done())
            print("fut exception:", fut.exception())
            print("fut traceback:", fut.traceback())
            print("fut result:", fut.result())
            # if fut.traceback() is not None:
            #     traceback.print_tb(fut.traceback())
            # if fut.exception() is None:
            #     print(fut.result())
        self.concatenate_all_chroms()
def __init__(self, config):
    """Keep the configuration and build the reference-genome helper."""
    self.config = config
    self.hg = Genome(config)
def hg(tests_config):
    """Genome fixture built from the test configuration."""
    genome = Genome(tests_config)
    return genome
class GenomeIndexPipeline(object):
    # Prepares a working copy of the reference genome (chrY masked, single
    # concatenated genome.fa) and builds the aligner's genome index.

    def __init__(self, config):
        self.config = config
        # assert self.config.genome.version == 'hg19'
        self.genome = Genome(self.config)
        assert self.genome.aligner is not None

    def copy_chromes_files(self):
        # Copy every pristine chromosome FASTA into the working directory,
        # skipping chrY (it gets a masked copy via mask_pars()).
        self.config.check_nonempty_workdir(self.config.genome.genome_dir)
        for chrom in self.genome.version.CHROMS_ALL:
            if chrom == 'chrY':
                continue
            src = os.path.join(
                self.config.genome.genome_pristine_dir,
                "{}.fa".format(chrom)
            )
            dst = os.path.join(
                self.config.genome.genome_dir,
                "{}.fa".format(chrom)
            )
            print(colored(
                "copying chromosome {} from {} into "
                "working directory {}".format(chrom, src, dst),
                "green"))
            if not self.config.dry_run:
                shutil.copy(src, dst)

    def mask_pars(self):
        # Write a chrY copy with pseudoautosomal regions masked.
        dst = self.genome.chrom_filename('chrY')
        print(colored(
            "masking pseudoautosomal regions in chrY", "green"))
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                "destination file for masked chrY already exists", "red"))
            raise ValueError("dst file already exists")
        if not self.config.dry_run:
            masked = self.genome.mask_chrY_pars()
            self.genome.save_chrom(masked, 'chrY')

    def concatenate_all_chroms(self):
        # Concatenate all per-chromosome FASTAs into one genome.fa.
        dirname = self.config.genome.genome_dir
        dst = os.path.join(
            dirname, 'genome.fa'
        )
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                # NOTE(review): adjacent literals join without a separator
                # ("...existsuse --force...").
                "destination genome file already exists"
                "use --force to overwrite",
                "red"))
            raise ValueError("destination file exists... use --force")
        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.genome.version.CHROMS_ALL:
                    src = self.genome.chrom_filename(chrom, pristine=False)
                    print(colored(
                        "appending {} to {}".format(src, dst), "green"))
                    # NOTE(review): 'src' (the path) is shadowed by the
                    # file object below.
                    with open(src, 'rb') as src:
                        if not self.config.dry_run:
                            shutil.copyfileobj(src, output, 1024 * 1024 * 10)

    def build_aligner_index(self):
        # Run the aligner's own index-build command on the combined FASTA.
        print(colored(
            f"building genome index of {self.genome.sequence_filename} "
            f"into {self.genome.index_prefix}",
            "green"))
        command = " ".join(self.genome.aligner.build_index_command(
            self.genome.sequence_filename, self.genome.index_prefix
        ))
        print(colored(
            f"going to execute aligner genome index build: {command}",
            "green"))
        test_filename = self.genome.aligner.genome_index_filenames[0]
        print(colored(f"checking for index file: {test_filename}", "green"))
        if os.path.exists(test_filename) and not self.config.force:
            print(colored(
                "output genome index {} already exists".format(test_filename),
                "red"))
            raise ValueError("destination file already exists")
        if not self.config.dry_run:
            subprocess.check_call(command, shell=True)

    def run(self, **kwargs):
        # Full pipeline: copy chromosomes, mask chrY, concatenate, index.
        self.copy_chromes_files()
        self.mask_pars()
        self.concatenate_all_chroms()
        self.build_aligner_index()
def __init__(self, config):
    """Remember *config* and build a Genome that must carry an aligner."""
    self.config = config
    # assert self.config.genome.version == 'hg19'
    self.genome = Genome(self.config)
    # The pipeline cannot work without an aligner attached to the genome.
    assert self.genome.aligner is not None
class GenomeIndexPipeline(object):
    # Older bowtie-specific variant: copies chromosomes into a work dir,
    # masks chrY, concatenates genome.fa and runs bowtie-build on it.

    def __init__(self, config):
        self.config = config
        # assert self.config.genome.version == 'hg19'
        self.hg = Genome(self.config)

    def copy_chromes_files(self):
        # Copy every chromosome FASTA except chrY (masked separately).
        self.config.check_nonempty_workdir(
            self.config.abspath(self.config.genome.work_dir))
        for chrom in self.hg.version.CHROMS_ALL:
            if chrom == 'chrY':
                continue
            src = os.path.join(
                self.config.genome.data_dir,
                "{}.fa".format(chrom)
            )
            dst = os.path.join(
                self.config.genome.work_dir,
                "{}.fa".format(chrom)
            )
            print(colored(
                "copying chromosome {} from {} into "
                "working directory {}".format(chrom, src, dst),
                "green"))
            if not self.config.dry_run:
                shutil.copy(src, dst)

    def mask_pars(self):
        # Write a chrY copy with pseudoautosomal regions masked.
        dst = self.config.chrom_filename('chrY')
        print(colored(
            "masking pseudoautosomal regions in chrY", "green")
        )
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                "destination file for masked chrY already exists", "red"
            ))
            raise ValueError("dst file already exists")
        if not self.config.dry_run:
            masked = self.hg.mask_chrY_pars()
            self.hg.save_chrom(masked, 'chrY')

    def concatenate_all_chroms(self):
        # Concatenate all per-chromosome FASTAs into one genome.fa.
        dirname = self.config.genome.work_dir
        dst = os.path.join(
            dirname, 'genome.fa'
        )
        if os.path.exists(dst) and not self.config.force:
            print(colored(
                # NOTE(review): adjacent literals join without a separator
                # ("...existsuse --force...").
                "destination genome file already exists"
                "use --force to overwrite",
                "red"))
            raise ValueError("destination file exists... use --force")
        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.hg.version.CHROMS_ALL:
                    src = self.config.chrom_filename(chrom, pristine=False)
                    print(colored(
                        "appending {} to {}".format(src, dst), "green"))
                    # NOTE(review): 'src' (the path) is shadowed by the
                    # file object below.
                    with open(src, 'rb') as src:
                        if not self.config.dry_run:
                            shutil.copyfileobj(src, output, 1024 * 1024 * 10)

    def build_bowtie_index(self):
        # Run bowtie-build on genome.fa, refusing to clobber an existing
        # index unless --force is given.
        src = os.path.join(
            self.config.genome.work_dir, 'genome.fa'
        )
        dst = os.path.join(
            self.config.genome.work_dir, self.config.genome.index
        )
        print(colored(
            "building bowtie index of {} into {}".format(src, dst),
            "green"))
        command = "bowtie-build -f {} {}".format(src, dst)
        print(colored(
            "executing bowtie-build: {}".format(command), "green"))
        test_filename = "{}.1.bt2".format(dst)
        if os.path.exists(test_filename) and not self.config.force:
            print(colored(
                "output bowtie index {} already exists".format(test_filename),
                "red"))
            raise ValueError("destination file already exists")
        if not self.config.dry_run:
            subprocess.check_call(command, shell=True)

    def run(self):
        # Full pipeline: copy chromosomes, mask chrY, concatenate, index.
        self.copy_chromes_files()
        self.mask_pars()
        self.concatenate_all_chroms()
        self.build_bowtie_index()
def __init__(self, config):
    """Store the configuration and create the genome accessor."""
    self.config = config
    # assert self.config.genome.version == 'hg19'
    self.hg = Genome(self.config)
async def async_write_fasta(outfile, rec):
    """Write *rec* to *outfile* as FASTA and wait for the buffer to drain."""
    payload = Genome.to_fasta_string(rec)
    outfile.write(payload)
    await outfile.drain()
class MappableRegionsPipeline(object):
    # asyncio-based variant: streams synthetic reads through a bowtie
    # subprocess and collapses uniquely-mapped reads into mappable regions,
    # one chromosome per worker process.

    def __init__(self, config):
        self.config = config
        self.hg = Genome(self.config)

    def mappable_regions_check(self, chroms, mappable_regions_df):
        # Sanity check: log an error for chromosomes whose regions share a
        # start position (duplicates).
        # if mappable_regions_df is None:
        #     mappable_regions_df = self.load_mappable_regions()
        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        # Yield every read_length-long window of each chromosome as a
        # SeqRecord with id "<chrom>.<1-based start>".
        try:
            for chrom in chroms:
                seq_record = self.hg.load_chrom(chrom)
                for i in range(len(seq_record) - read_length + 1):
                    out_record = SeqRecord(
                        seq_record.seq[i:i + read_length],
                        id="{}.{}".format(chrom, i + 1),
                        description="generated_read")
                    yield out_record
        finally:
            pass

    async def async_start_bowtie(self, bowtie_opts=""):
        # Launch bowtie (SAM output, unique hits only: -v 0 -m 1) reading
        # FASTA from stdin; returns the subprocess handle.
        genomeindex = self.config.genome_index_filename()
        if bowtie_opts:
            command = [
                'bowtie', '-S', '-t', '-v', '0', '-m', '1',
                *bowtie_opts.split(' '),
                '-f', genomeindex, '-',
            ]
        else:
            command = [
                'bowtie', '-S', '-t', '-v', '0', '-m', '1',
                '-f', genomeindex, '-',
            ]
        print(
            colored("going to execute bowtie: {}".format(" ".join(command)),
                    "green"))
        create = asyncio.create_subprocess_exec(
            *command,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
        )
        proc = await create
        return proc

    @staticmethod
    async def async_write_fasta(outfile, rec):
        # Write one record as FASTA and wait for the stream buffer to drain.
        out = Genome.to_fasta_string(rec)
        outfile.write(out)
        await outfile.drain()

    async def async_write_reads_generator(self, out, reads_generator):
        # Feed every generated read to the subprocess stdin, then close it
        # so bowtie sees EOF.
        for rec in reads_generator:
            await self.async_write_fasta(out, rec)
        out.close()

    async def async_mappings_generator(self, reads_generator, bowtie):
        # Concurrently write reads to bowtie and yield its decoded output
        # lines; waits for process exit and the writer task at the end.
        writer = asyncio.Task(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))
        while True:
            line = await bowtie.stdout.readline()
            if not line:
                break
            yield line.decode()
        await bowtie.wait()
        await writer

    async def async_generate_mappings(self, chroms, read_length, outfile=None):
        # Dump raw bowtie mappings for *chroms* to *outfile* (stdout default).
        if outfile is None:
            outfile = sys.stdout
        bowtie = await self.async_start_bowtie()
        reads_generator = self.generate_reads(chroms, read_length)
        async for mappings in self.async_mappings_generator(
                reads_generator, bowtie):
            outfile.write(mappings)

    async def async_generate_mappable_regions(self, chroms, read_length,
                                              outfile=None, bowtie_opts=""):
        # Stream reads through bowtie and write one mappable region per line.
        bowtie = await self.async_start_bowtie(bowtie_opts=bowtie_opts)
        reads_generator = self.generate_reads(chroms, read_length)
        writer = asyncio.Task(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))
        if outfile is None:
            outfile = sys.stdout
        async for mapping in self.async_mappable_regions_generator(
                bowtie.stdout):
            outfile.write(str(mapping))
            outfile.write('\n')
        await bowtie.wait()
        await writer

    async def async_mappable_regions_generator(self, infile):
        # State machine over SAM lines: contiguous uniquely-mapped reads
        # (flag == 0) on the same chromosome are merged into one
        # MappableRegion; a non-unique read or chromosome change closes the
        # current region.
        prev = None
        state = MappableState.OUT
        while True:
            line = await infile.readline()
            if not line:
                break
            line = line.decode()
            if line[0] == '@':
                # comment
                continue
            mapping = Mapping.parse_sam(line)
            if state == MappableState.OUT:
                if mapping.flag == 0:
                    # unique hit starts a new region
                    prev = MappableRegion(mapping)
                    state = MappableState.IN
            else:
                if mapping.flag == 0:
                    if mapping.chrom == prev.chrom:
                        prev.extend(mapping.start)
                    else:
                        yield prev
                        prev = MappableRegion(mapping)
                else:
                    yield prev
                    state = MappableState.OUT
        if state == MappableState.IN:
            # flush the region still open at EOF
            yield prev

    def run_once(self, chrom):
        # Run the async region generation for a single chromosome inside
        # this worker's event loop.
        event_loop = asyncio.get_event_loop()
        # LOG.info('enabling debugging')
        # Enable debugging
        # event_loop.set_debug(True)
        outfilename = self.config.mappable_regions_filename(chrom)
        with open(outfilename, "w") as outfile:
            event_loop.run_until_complete(
                self.async_generate_mappable_regions(
                    [chrom],
                    self.config.mappable_regions.length,
                    outfile=outfile,
                    bowtie_opts=self.config.mappable_regions.bowtie_opts))

    def concatenate_all_chroms(self):
        # Append every per-chromosome regions file into the combined file.
        dst = self.config.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    # NOTE(review): adjacent literals join without a
                    # separator ("...existsuse --force...").
                    "destination mappable regions file already exists"
                    "use --force to overwrite",
                    "red"))
            raise ValueError("destination file exists... use --force")
        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.hg.version.CHROMS:
                    src = self.config.mappable_regions_filename(chrom)
                    print(
                        colored("appending {} to {}".format(src, dst),
                                "green"))
                    # NOTE(review): 'src' (the path) is shadowed by the
                    # file object below.
                    with open(src, 'rb') as src:
                        if not self.config.dry_run:
                            shutil.copyfileobj(src, output, 1024 * 1024 * 10)

    def run(self):
        # Validate inputs/outputs, then fan chromosomes out over a
        # multiprocessing pool and merge the results.
        outfilename = self.config.mappable_regions_filename()
        print(
            colored(
                "going to generate mappable regions with length {} "
                "from genome {} into {}".format(
                    self.config.mappable_regions.length,
                    self.config.genome.work_dir,
                    outfilename),
                "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename),
                    "red"))
            raise ValueError("output file already exists")
        if not self.config.genome_index_filename_exists():
            print(
                colored(
                    "genome index file {} not found".format(
                        self.config.genome_index_filename()),
                    "red"))
            raise ValueError("genome index file not found")
        if self.config.dry_run:
            return
        if not os.path.exists(self.config.mappable_regions.work_dir):
            os.makedirs(self.config.mappable_regions.work_dir)
        pool = multiprocessing.Pool(processes=self.config.parallel)
        pool.map(self.run_once, self.hg.version.CHROMS)
        self.concatenate_all_chroms()
class VarbinPipeline(object):
    """Counts uniquely-placed reads per genome bin ("varbin") for each
    cell's mapping BAM, distributing cells over a dask cluster."""

    def __init__(self, config):
        self.config = config
        self.genome = Genome(config)

    def find_bin_index(self, abspos, bins):
        """Vectorized bin lookup: index of the last bin start <= each value.

        NOTE(review): not called inside this class; the first argument is
        the sorted array searched by np.searchsorted -- verify the argument
        order against any external callers.
        """
        index = np.searchsorted(abspos, bins, side='right')
        return index - 1

    def mapping_all_filenames(self):
        """Return every mapping file matching the configured suffix."""
        pattern = os.path.join(
            self.config.mapping.mapping_dir,
            "*{}".format(self.config.mapping.mapping_suffix))
        return glob.glob(pattern)

    def find_bin_index_binsearch(self, bins, abspos):
        """Binary search: index i such that bins[i] <= abspos < bins[i+1]."""
        index_up = len(bins)
        index_down = 0
        index_mid = int((index_up - index_down) / 2.0)
        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_mid
            else:
                index_up = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_down
            if index_up - index_down < 2:
                break
        return index_down

    def varbin(self, filename):
        """Count exact, non-duplicate mapped reads of *filename* per bin.

        Returns a DataFrame with chrom/chrompos/abspos/bincount/ratio,
        sorted by absolute position; ratio normalizes each bin count by
        the mean count per bin.
        """
        try:
            assert os.path.exists(filename), os.path.abspath(filename)
            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.genome.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.genome.chrom_sizes()
            chroms = set(self.genome.version.CHROMS)

            count = 0
            dups = 0
            total_reads = 0
            prev_pos = 0
            bin_counts = defaultdict(int)
            bins = bins_df['bin.start.abspos'].values

            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue
                # Keep only exact full-length matches (CIGAR "<len>M").
                # (The original also asserted this after the guard -- that
                # assert was always true and has been removed as dead code.)
                if seg.cigarstring != f'{seg.reference_length}M':
                    print("non exact mapping:", seg, seg.cigarstring)
                    continue
                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    # Consecutive read at the same absolute position is
                    # treated as a PCR duplicate.
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)
                bin_counts[index] += 1
                prev_pos = abspos

            result = []
            for index, row in bins_df.iterrows():
                bin_count = bin_counts[index]
                result.append([
                    row['bin.chrom'],
                    row['bin.start'],
                    row['bin.start.abspos'],
                    bin_count,
                ])
            df = pd.DataFrame.from_records(
                result,
                columns=['chrom', 'chrompos', 'abspos', 'bincount'])
            df.sort_values(by=['abspos'], inplace=True)
            total_count = df.bincount.sum()
            total_reads_per_bin = float(total_count) / len(bins_df)
            df['ratio'] = df.bincount / total_reads_per_bin
            return df
        except Exception:
            # Log the full traceback, then propagate the original exception
            # with its context intact (bare raise instead of `raise ex`;
            # the unreachable `return None` after it has been dropped).
            traceback.print_exc()
            raise

    def run_once(self, mapping_filename):
        """Process one cell: compute its varbin table and save it, unless
        the output exists and --force is not given."""
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile),
                "green"))
        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile),
                    "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self, dask_client):
        """Distribute per-cell varbin computation over the dask cluster."""
        mapping_filenames = self.mapping_all_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames),
                    "green"))
        if self.config.dry_run:
            return
        assert dask_client
        os.makedirs(self.config.varbin.varbin_dir, exist_ok=True)
        delayed_tasks = dask_client.map(self.run_once, mapping_filenames)
        distributed.wait(delayed_tasks)
def __init__(self, config):
    """Capture the configuration and instantiate the genome accessor."""
    self.config = config
    self.genome = Genome(self.config)
def tests_genome(tests_config):
    """Fixture: a Genome built from the test config, verified to be hg19."""
    genome = Genome(tests_config)
    # The test suite relies on the hg19 reference version.
    assert genome is not None
    assert genome.version.VERSION == 'hg19'
    return genome
def hg():
    """Genome fixture loaded from the on-disk test configuration file."""
    test_config = Config.load(
        "tests/data/scpipe_tests.yml", use_config_dir=True)
    return Genome(test_config)
class VarbinPipeline(object):
    # Multiprocessing variant: counts non-duplicate mapped reads per genome
    # bin for each cell's mapping BAM.

    def __init__(self, config):
        self.config = config
        self.hg = Genome(config)

    def find_bin_index(self, abspos, bins):
        # Vectorized bin lookup via np.searchsorted.
        # NOTE(review): not called inside this class; verify argument order
        # (sorted array first) against any external callers.
        index = np.searchsorted(abspos, bins, side='right')
        index = index - 1
        return index

    def find_bin_index_binsearch(self, bins, abspos):
        # Binary search: returns i such that bins[i] <= abspos < bins[i+1].
        index_up = len(bins)
        index_down = 0
        index_mid = int((index_up - index_down) / 2.0)
        while True:
            if abspos >= int(bins[index_mid]):
                index_down = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_mid
            else:
                index_up = index_mid + 0
                index_mid = int((index_up - index_down) / 2.0) + index_down
            if index_up - index_down < 2:
                break
        return index_down

    def varbin(self, filename):
        # Count mapped, non-duplicate reads of the BAM per bin, then build
        # the chrom/chrompos/abspos/bincount/ratio table.
        try:
            assert os.path.exists(filename), os.path.abspath(filename)
            infile = pysam.AlignmentFile(filename, 'rb')
            bins_df = self.hg.bins_boundaries()
            assert bins_df is not None
            chrom_sizes = self.hg.chrom_sizes()
            chroms = set(self.hg.version.CHROMS)
            count = 0
            dups = 0
            total_reads = 0
            prev_pos = 0
            bin_counts = defaultdict(int)
            bins = bins_df['bin.start.abspos'].values
            for seg in infile:
                total_reads += 1
                if seg.is_unmapped:
                    continue
                chrom = seg.reference_name
                if chrom not in chroms:
                    continue
                abspos = chrom_sizes[chrom].abspos + seg.reference_start
                if prev_pos == abspos:
                    # consecutive read at the same absolute position is
                    # treated as a duplicate
                    dups += 1
                    continue
                count += 1
                index = self.find_bin_index_binsearch(bins, abspos)
                bin_counts[index] += 1
                prev_pos = abspos
        except Exception:
            # NOTE(review): the exception is only printed; execution then
            # falls through to code that uses names bound inside the try
            # (count, bins_df, bin_counts) -- if the failure happened before
            # those were assigned this raises a confusing NameError, and a
            # zero count divides by zero below. Confirm whether re-raising
            # was intended (the sibling dask variant does re-raise).
            traceback.print_exc()
        number_of_reads_per_bin = float(count) / len(bins_df)
        result = []
        for index, row in bins_df.iterrows():
            bin_count = bin_counts[index]
            ratio = float(bin_count) / number_of_reads_per_bin
            result.append([
                row['bin.chrom'],
                row['bin.start'],
                row['bin.start.abspos'],
                bin_count,
                ratio
            ])
        df = pd.DataFrame.from_records(result, columns=[
            'chrom',
            'chrompos',
            'abspos',
            'bincount',
            'ratio',
        ])
        df.sort_values(by=['abspos'], inplace=True)
        return df

    def run_once(self, mapping_filename):
        # Process one cell; skip (with a message) if the output exists and
        # --force is not given.
        cellname = self.config.cellname(mapping_filename)
        outfile = self.config.varbin_filename(cellname)
        print(
            colored(
                "processing cell {}; reading from {}; writing to {}".format(
                    cellname, mapping_filename, outfile),
                "green"))
        if os.path.exists(outfile) and not self.config.force:
            print(
                colored(
                    "output file {} exists; add --force to overwrite".format(
                        outfile),
                    "red"))
        else:
            if not self.config.dry_run:
                df = self.varbin(mapping_filename)
                df.to_csv(outfile, index=False, sep='\t')

    def run(self):
        # Fan the per-cell work out over a multiprocessing pool.
        mapping_filenames = self.config.mapping_filenames()
        print(
            colored("processing files: {}".format(mapping_filenames),
                    "green"))
        pool = multiprocessing.Pool(processes=self.config.parallel)
        pool.map(self.run_once, mapping_filenames)