import asyncio
import logging
import multiprocessing
import os
import queue
import shutil
import sys
from subprocess import PIPE, Popen

import distributed
import pandas as pd
from Bio.SeqRecord import SeqRecord
from termcolor import colored

# Project-local dependencies; the exact module paths are assumptions,
# since only the names are visible in this file:
# from .genome import Genome
# from .aligner import InputGeneratorThread, AlignerOutputProcessingThread
# from .bins import BinParams, MappableBin
# from .mappings import Mapping, MappableRegion, MappableState

LOG = logging.getLogger(__name__)


class MappableRegionsPipeline(object):
    """Thread/dask-based pipeline that computes mappable regions by
    aligning every read-length window of the genome back to its index."""

    def __init__(self, config, aligner=None):
        self.config = config
        self.genome = Genome(self.config)
        if aligner is not None:
            self.aligner = aligner
        else:
            assert self.genome.aligner is not None
            self.aligner = self.genome.aligner
        assert self.aligner is not None

    def mappable_regions_check(self, chroms, mappable_regions_df):
        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        # Yield every read_length-long window of each chromosome as a
        # FASTA record named "<chrom>.<1-based position>".
        try:
            for chrom in chroms:
                seq_record = self.genome.load_chrom(chrom)
                for i in range(len(seq_record) - read_length + 1):
                    seq = seq_record.seq[i:i + read_length]
                    out_record = SeqRecord(
                        seq,
                        id="{}.{}".format(chrom, i + 1),
                        description="generated_read")
                    # if 'N' in seq:
                    #     print('skipping: ', out_record)
                    #     continue
                    yield out_record
        finally:
            pass

    def generate_mappable_regions(
            self, chroms, read_length,
            outfile=None, aligner_options=None):
        if outfile is None:
            outfile = sys.stdout
        if aligner_options is None:
            aligner_options = []

        reads_generator = self.generate_reads(chroms, read_length)

        def aligner_output_process_function(line):
            outfile.write(str(line))
            outfile.write("\n")

        aligner_command = self.aligner.build_mappable_regions_command(
            options=aligner_options)
        print('aligner command:', ' '.join(aligner_command))

        with Popen(aligner_command, stdout=PIPE, stdin=PIPE) as proc:
            control_queue = queue.Queue()
            input_thread = InputGeneratorThread(
                control_queue, proc.stdin, reads_generator)
            output_thread = AlignerOutputProcessingThread(
                control_queue, proc.stdout, aligner_output_process_function)

            input_thread.start()
            output_thread.start()

            while True:
                msg = None
                try:
                    # A timeout is required for queue.Empty to ever be
                    # raised; the 10s value here is an assumption.
                    msg = control_queue.get(timeout=10)
                except queue.Empty:
                    print("timeout - queue empty")
                    msg = None
                if msg == 'out_done':
                    print("output done")
                    break
                if msg == 'in_done':
                    print('input done')

            input_thread.join()
            output_thread.join()

    def mappable_regions_chrom_filename(self, chrom):
        mname = "{}_{}".format(
            chrom, self.config.mappable_regions.mappable_file)
        filename = os.path.join(
            self.config.mappable_regions.mappable_dir, mname)
        return filename

    def mappable_regions_filename(self):
        mname = self.config.mappable_regions.mappable_file
        filename = os.path.join(
            self.config.mappable_regions.mappable_dir, mname)
        return filename

    def run_once(self, chrom):
        outfilename = self.mappable_regions_chrom_filename(chrom)
        with open(outfilename, "w") as outfile:
            # use the configured read length, which run() reports
            self.generate_mappable_regions(
                [chrom],
                read_length=self.config.mappable_regions.mappable_read_length,
                outfile=outfile)
        return outfilename

    def concatenate_all_chroms(self):
        dst = self.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    "destination mappable regions file already exists; "
                    "use --force to overwrite",
                    "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.genome.version.CHROMS:
                    src = self.mappable_regions_chrom_filename(chrom)
                    print(
                        colored(
                            "appending {} to {}".format(src, dst),
                            "green"))
                    with open(src, 'rb') as srcfile:
                        shutil.copyfileobj(srcfile, output, 1024 * 1024 * 10)

    def run(self, dask_client):
        outfilename = self.mappable_regions_filename()
        print(
            colored(
                "going to generate mappable regions with length {} "
                "from genome {} into {}".format(
                    self.config.mappable_regions.mappable_read_length,
                    self.config.genome.genome_dir,
                    outfilename),
                "green"))

        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename),
                    "red"))
            raise ValueError("output file already exists")

        genome_index_filenames = self.aligner.genome_index_filenames
        if not os.path.exists(genome_index_filenames[0]):
            print(
                colored(
                    "genome index file {} not found".format(
                        genome_index_filenames),
                    "red"))
            raise ValueError("genome index file not found")

        if self.config.dry_run:
            return

        os.makedirs(self.config.mappable_regions.mappable_dir, exist_ok=True)

        assert dask_client

        delayed_tasks = dask_client.map(
            self.run_once, self.genome.version.CHROMS)
        distributed.wait(delayed_tasks)

        for fut in delayed_tasks:
            print("fut done:", fut.done())
            print("fut exception:", fut.exception())
            print("fut traceback:", fut.traceback())
            print("fut result:", fut.result())
            # if fut.traceback() is not None:
            #     traceback.print_tb(fut.traceback())
            # if fut.exception() is None:
            #     print(fut.result())

        self.concatenate_all_chroms()
class BinsPipeline(object):
    """Pipeline that partitions mappable regions into bins of roughly
    equal mappable length and annotates each bin with its GC content."""

    def __init__(self, config):
        self.config = config
        self.genome = Genome(self.config)

    def calc_bins_gc_content(self, chroms, bins_df):
        result = []
        for chrom in chroms:
            chrom_df = bins_df[bins_df['bin.chrom'] == chrom]
            gc_df = chrom_df.copy()
            gc_df.reset_index(inplace=True, drop=True)
            gc_series = pd.Series(index=gc_df.index, dtype=float)
            chrom_seq = self.genome.load_chrom(chrom)
            for index, row in gc_df.iterrows():
                start = row['bin.start']
                end = row['bin.end']
                seq = chrom_seq.seq[start:end]
                counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
                total_counts = sum(counts)
                if total_counts == 0:
                    gc = 0.0
                else:
                    # GC fraction over unambiguous bases only
                    gc = float(sum(counts[0:2])) / total_counts
                gc_series.iloc[index] = gc
            gc_df['gc.content'] = gc_series
            result.append(gc_df)

        assert len(result) > 0
        if len(result) == 1:
            return result[0]
        df = pd.concat(result)
        return df

    def bins_boundaries_generator(self, chroms, mappable_regions_df):
        chrom_sizes = self.genome.chrom_sizes()
        chrom_bins = self.genome.calc_chrom_bins()
        # if mappable_regions_df is None:
        #     mappable_regions_df = self.load_mappable_regions()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            params = BinParams.build(
                chrom_size=chrom_sizes[chrom],
                chrom_bin=chrom_bins[chrom])
            mappable_bin = None
            current_excess = 0
            bins_count = params.bins_count
            for row in chrom_df.to_dict(orient="records"):
                if mappable_bin is None:
                    mappable_bin = MappableBin.from_start(params, start_pos=0)
                    current_excess = mappable_bin.adapt_excess(current_excess)
                if not mappable_bin.check_extend(row):
                    next_bin = mappable_bin.split_extend(row)
                    bins_count -= 1
                    if bins_count == 0:
                        # last bin of the chromosome
                        mappable_bin.end_pos = chrom_sizes[chrom].size
                    yield mappable_bin
                    if next_bin.is_overfill():
                        current_excess, mappable_bins = \
                            next_bin.overfill_split(current_excess)
                        assert len(mappable_bins) > 1
                        for mb in mappable_bins[:-1]:
                            bins_count -= 1
                            yield mb
                        mappable_bin = mappable_bins[-1]
                    else:
                        mappable_bin = next_bin
                        current_excess = \
                            mappable_bin.adapt_excess(current_excess)
            # print("mappable_bin:", row, mappable_bin)
            mappable_bin = None

    def calc_bins_boundaries(self, chroms=None, regions_df=None):
        if chroms is None:
            chroms = self.genome.version.CHROMS

        bin_rows = []
        for mbin in self.bins_boundaries_generator(chroms, regions_df):
            # print("mbin:", mbin)
            bin_rows.append(
                (mbin.chrom,
                 mbin.start_pos,
                 mbin.start_abspos,
                 mbin.end_pos,
                 mbin.end_pos - mbin.start_pos,
                 mbin.bin_size))

        df = pd.DataFrame.from_records(
            bin_rows,
            columns=[
                'bin.chrom',
                'bin.start',
                'bin.start.abspos',
                'bin.end',
                'bin.length',
                'mappable.positions',
            ])
        df.sort_values(by=['bin.start.abspos'], inplace=True)
        return df

    def load_mappable_regions(self, chrom=None):
        filename = self.config.mappable_regions_filename(chrom=chrom)
        df = pd.read_csv(
            filename,
            names=['chrom', 'start_pos', 'end_pos'],
            sep='\t')
        df = df.sort_values(by=['chrom', 'start_pos', 'end_pos'])
        assert len(df) > 0
        return df

    def run_once(self, chrom):
        print(
            colored(
                f"started calculating bins for chromosome {chrom}",
                "green"))
        regions_df = self.load_mappable_regions(chrom=chrom)
        bins_df = self.calc_bins_boundaries([chrom], regions_df)
        df = self.calc_bins_gc_content([chrom], bins_df)

        outfilename = self.config.bins_boundaries_filename(chrom)
        print(
            colored(
                f"saving bins for chromosome {chrom} into {outfilename}",
                "green"))
        df.to_csv(outfilename, sep='\t', index=False)
        return outfilename

    def concatenate_all_chroms(self):
        outfilename = self.config.bins_boundaries_filename()
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "destination bins boundaries file already exists; "
                    "use --force to overwrite",
                    "red"))
            raise ValueError("destination file exists... use --force")
        if self.config.dry_run:
            return

        dataframes = []
        for chrom in self.genome.version.CHROMS:
            srcfilename = self.config.bins_boundaries_filename(chrom)
            df = pd.read_csv(srcfilename, sep='\t')
            dataframes.append(df)

        outdf = pd.concat(dataframes, ignore_index=True)
        outdf.sort_values(
            by=['bin.start.abspos', 'bin.start', 'bin.end'],
            inplace=True)
        outdf.to_csv(outfilename, sep='\t', index=False)

    def run(self, dask_client):
        outfilename = self.config.bins_boundaries_filename()
        os.makedirs(os.path.dirname(outfilename), exist_ok=True)
        print(
            colored(
                "going to compute bin boundaries from mappable regions: {} "
                "into bins boundaries file {}".format(
                    self.config.mappable_regions_filename(),
                    outfilename),
                "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename),
                    "red"))
            raise ValueError("output file already exists")
        if self.config.dry_run:
            return

        assert self.genome.chrom_sizes() is not None

        delayed_tasks = dask_client.map(
            self.run_once, self.genome.version.CHROMS)
        print(len(delayed_tasks), delayed_tasks)
        print(dask_client.scheduler_info())

        distributed.wait(delayed_tasks)
        for task in delayed_tasks:
            outfile = task.result()
            print(outfile, os.path.exists(outfile))

        self.concatenate_all_chroms()
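# Usage sketch (not part of the original module): compute bin boundaries
# for a couple of chromosomes in-process, without a dask cluster. The
# mappable regions file must already exist; `config` is assumed to
# provide mappable_regions_filename() and bins_boundaries_filename().
def _example_calc_bins(config, chroms=('chr21', 'chr22')):
    pipeline = BinsPipeline(config)
    regions_df = pipeline.load_mappable_regions()
    bins_df = pipeline.calc_bins_boundaries(list(chroms), regions_df)
    # annotate each bin with its GC fraction before saving
    bins_df = pipeline.calc_bins_gc_content(list(chroms), bins_df)
    bins_df.to_csv(config.bins_boundaries_filename(), sep='\t', index=False)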
class MappableRegionsPipeline(object):
    """Asyncio/bowtie-based variant of the mappable regions pipeline,
    parallelized over chromosomes with a multiprocessing pool."""

    def __init__(self, config):
        self.config = config
        self.hg = Genome(self.config)

    def mappable_regions_check(self, chroms, mappable_regions_df):
        # if mappable_regions_df is None:
        #     mappable_regions_df = self.load_mappable_regions()
        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        try:
            for chrom in chroms:
                seq_record = self.hg.load_chrom(chrom)
                for i in range(len(seq_record) - read_length + 1):
                    out_record = SeqRecord(
                        seq_record.seq[i:i + read_length],
                        id="{}.{}".format(chrom, i + 1),
                        description="generated_read")
                    yield out_record
        finally:
            pass

    async def async_start_bowtie(self, bowtie_opts=""):
        genomeindex = self.config.genome_index_filename()
        if bowtie_opts:
            command = [
                'bowtie', '-S', '-t', '-v', '0', '-m', '1',
                *bowtie_opts.split(' '),
                '-f', genomeindex, '-',
            ]
        else:
            command = [
                'bowtie', '-S', '-t', '-v', '0', '-m', '1',
                '-f', genomeindex, '-',
            ]
        print(
            colored(
                "going to execute bowtie: {}".format(" ".join(command)),
                "green"))
        create = asyncio.create_subprocess_exec(
            *command,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
        )
        proc = await create
        return proc

    @staticmethod
    async def async_write_fasta(outfile, rec):
        out = Genome.to_fasta_string(rec)
        outfile.write(out)
        await outfile.drain()

    async def async_write_reads_generator(self, out, reads_generator):
        for rec in reads_generator:
            await self.async_write_fasta(out, rec)
        out.close()

    async def async_mappings_generator(self, reads_generator, bowtie):
        writer = asyncio.Task(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))
        while True:
            line = await bowtie.stdout.readline()
            if not line:
                break
            yield line.decode()
        await bowtie.wait()
        await writer

    async def async_generate_mappings(
            self, chroms, read_length, outfile=None):
        if outfile is None:
            outfile = sys.stdout
        bowtie = await self.async_start_bowtie()
        reads_generator = self.generate_reads(chroms, read_length)
        async for mappings in self.async_mappings_generator(
                reads_generator, bowtie):
            outfile.write(mappings)

    async def async_generate_mappable_regions(
            self, chroms, read_length,
            outfile=None, bowtie_opts=""):
        bowtie = await self.async_start_bowtie(bowtie_opts=bowtie_opts)
        reads_generator = self.generate_reads(chroms, read_length)
        writer = asyncio.Task(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))
        if outfile is None:
            outfile = sys.stdout
        async for mapping in self.async_mappable_regions_generator(
                bowtie.stdout):
            outfile.write(str(mapping))
            outfile.write('\n')
        await bowtie.wait()
        await writer

    async def async_mappable_regions_generator(self, infile):
        # Scan bowtie's SAM output and merge consecutive uniquely mapped
        # reads into contiguous mappable regions.
        prev = None
        state = MappableState.OUT

        while True:
            line = await infile.readline()
            if not line:
                break
            line = line.decode()
            if line[0] == '@':
                # SAM header line
                continue

            mapping = Mapping.parse_sam(line)
            if state == MappableState.OUT:
                if mapping.flag == 0:
                    prev = MappableRegion(mapping)
                    state = MappableState.IN
            else:
                if mapping.flag == 0:
                    if mapping.chrom == prev.chrom:
                        prev.extend(mapping.start)
                    else:
                        yield prev
                        prev = MappableRegion(mapping)
                else:
                    yield prev
                    state = MappableState.OUT

        if state == MappableState.IN:
            yield prev

    def run_once(self, chrom):
        event_loop = asyncio.get_event_loop()
        # LOG.info('enabling debugging')
        # event_loop.set_debug(True)

        outfilename = self.config.mappable_regions_filename(chrom)
        with open(outfilename, "w") as outfile:
            event_loop.run_until_complete(
                self.async_generate_mappable_regions(
                    [chrom],
                    self.config.mappable_regions.length,
                    outfile=outfile,
                    bowtie_opts=self.config.mappable_regions.bowtie_opts))

    def concatenate_all_chroms(self):
        dst = self.config.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    "destination mappable regions file already exists; "
                    "use --force to overwrite",
                    "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.hg.version.CHROMS:
                    src = self.config.mappable_regions_filename(chrom)
                    print(
                        colored(
                            "appending {} to {}".format(src, dst),
                            "green"))
                    with open(src, 'rb') as srcfile:
                        shutil.copyfileobj(srcfile, output, 1024 * 1024 * 10)

    def run(self):
        outfilename = self.config.mappable_regions_filename()
        print(
            colored(
                "going to generate mappable regions with length {} "
                "from genome {} into {}".format(
                    self.config.mappable_regions.length,
                    self.config.genome.work_dir,
                    outfilename),
                "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename),
                    "red"))
            raise ValueError("output file already exists")
        if not self.config.genome_index_filename_exists():
            print(
                colored(
                    "genome index file {} not found".format(
                        self.config.genome_index_filename()),
                    "red"))
            raise ValueError("genome index file not found")
        if self.config.dry_run:
            return

        os.makedirs(self.config.mappable_regions.work_dir, exist_ok=True)

        pool = multiprocessing.Pool(processes=self.config.parallel)
        pool.map(self.run_once, self.hg.version.CHROMS)

        self.concatenate_all_chroms()
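# Usage sketch (not part of the original module): stream mappable regions
# for a single chromosome to stdout with the asyncio-based pipeline. The
# read length and bowtie options mirror what run_once() takes from the
# configuration; bowtie must be on PATH and the genome index built.
def _example_stream_mappable_regions(config, chrom='chr21'):
    pipeline = MappableRegionsPipeline(config)
    asyncio.get_event_loop().run_until_complete(
        pipeline.async_generate_mappable_regions(
            [chrom],
            config.mappable_regions.length,
            outfile=sys.stdout,
            bowtie_opts=config.mappable_regions.bowtie_opts))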