def __init__(self, args=None, region=None, name='hg19'):
    """Initialize a genomic region, either from parsed CLI args or a region string."""
    # Region state; filled in by parse_sites / parse_region below.
    self.genome_name = name
    self.chrom = None
    self.sites = None
    self.region_str = None
    self.bp_tuple = None
    self.chrs_sz = None  # DataFrame of chromosomes sizes (in number of sites)
    self.name = name
    self.args = args
    # todo: this could be prettier
    if args is not None:
        self.name = args.genome
        self.genome = GenomeRefPaths(self.name)
        if args.sites:
            self.parse_sites(args.sites)
        elif args.region:
            self.parse_region(args.region)
    elif region is not None:
        self.genome = GenomeRefPaths(self.name)
        self.parse_region(region)
    else:
        raise IllegalArgumentError('Invalid GR init {}'.format(region))
    # number of CpG sites covered; None means the whole genome
    span = None if self.sites is None else self.sites[1] - self.sites[0]
    self.nr_sites = span
    self.annotation = self.add_anno()
def __init__(self, args, betas):
    """Store segmentation inputs and build the shared worker parameter dict."""
    self.betas = betas
    # cap block size both by CpG count and by bp (a CpG spans >= 2bp)
    cpg_cap = min(args.max_cpg, args.max_bp // 2)
    assert (cpg_cap > 1)
    self.genome = GenomeRefPaths(args.genome)
    self.param_dict = {
        'betas': betas,
        'pcount': args.pcount,
        'max_cpg': cpg_cap,
        'max_bp': args.max_bp,
        'revdict': self.genome.revdict_path,
        'genome': self.genome,
    }
    self.args = args
def set_lists(self):
    """Resolve blacklist/whitelist paths from CLI flags and validate them.

    A literal True flag means "use the genome's default list file";
    a string value is a user-supplied path.
    """
    blacklist = self.args.blacklist
    whitelist = self.args.whitelist
    if blacklist == True:
        blacklist = GenomeRefPaths(self.args.genome).blacklist
    elif whitelist == True:
        whitelist = GenomeRefPaths(self.args.genome).whitelist
    # validate whichever list is in effect (blacklist takes precedence)
    if blacklist:
        validate_single_file(blacklist)
    elif whitelist:
        validate_single_file(whitelist)
    if self.verbose:
        eprint(f'[wt bam2pat] blacklist: {blacklist}')
        eprint(f'[wt bam2pat] whitelist: {whitelist}')
    return blacklist, whitelist
def pat2beta(pat_path, out_dir, args, force=True):
    """Convert a pat file to a beta/lbeta file.

    :param pat_path: path to a .pat or .pat.gz file
    :param out_dir: output directory for the beta file
    :param args: parsed CLI args (lbeta, threads, genome)
    :param force: overwrite an existing output file
    :return: path of the generated beta file, or None if skipped
    :raises IllegalArgumentError: on an unrecognized pat suffix
    """
    validate_single_file(pat_path)
    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')
    suff = '.lbeta' if args.lbeta else '.beta'
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + suff)
    if not delete_or_skip(out_beta, force):
        return
    # parallel path requires an indexed (.csi) bgzipped pat file
    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(
            pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd += f' {pat_path} | {pat2beta_tool} 1 {nr_sites + 1}'
        x = subprocess.check_output(cmd, shell=True).decode()
        # np.fromstring text mode (sep=' ') is deprecated; parse explicitly
        arr = np.array(x.split(), dtype=int).reshape((-1, 2))
    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
def __init__(self, args):
    """Load the bed file and cumulative chromosome CpG sizes, then process."""
    self.args = args
    self.out_path = args.out_path
    self.debug = args.debug
    if not delete_or_skip(self.out_path, self.args.force):
        return
    # load bed file (truncated in debug mode):
    row_cap = 100000 if self.debug else None
    self.df = load_bed(args.bed_path, row_cap)
    self.genome = GenomeRefPaths(args.genome)
    # load chromosomes sizes (in CpGs), as cumulative borders:
    self.cf = self.genome.get_chrom_cpg_size_table()
    self.cf['size'] = np.cumsum(self.cf['size'])
    self.proc_bed()
def add_cpgs_to_bed(bed_file, genome, drop_empty, threads, add_anno):
    """Annotate a bed file with CpG-index columns, chromosome by chromosome.

    :param bed_file: path to input bed file
    :param genome: genome name (passed to GenomeRefPaths)
    :param drop_empty: drop regions containing no CpGs
    :param threads: number of worker processes
    :param add_anno: also merge in genomic annotations
    :return: annotated DataFrame (original row order preserved)
    """
    # load bed file:
    df = load_bed(bed_file)
    # load chromosomes sizes (in CpGs), as cumulative borders:
    cf = GenomeRefPaths(genome).get_chrom_cpg_size_table()
    cf['size'] = np.cumsum(cf['size'])
    chroms = sorted(set(cf.chr) & set(df.chr))
    params = [(df[df.chr == chrom].copy(), cf, genome) for chrom in chroms]
    # context manager guarantees worker cleanup even if a chr_thread raises
    with Pool(threads) as p:
        arr = p.starmap(chr_thread, params)
    # concat chromosomes
    r = pd.concat(arr)
    # merge with original table, to keep the order and the empty regions
    r = df.merge(r[COORDS_COLS5], how='left', on=COORDS_COLS3)
    r = r[COORDS_COLS5 + list(r.columns)[3:-2]]
    # add annotations:
    if add_anno:
        r = get_anno(r, genome, bed_file)
    # drop regions w/o CpGs
    if drop_empty:
        r.dropna(inplace=True, subset=['startCpG', 'endCpG'])
    return r
def load_blocks_file(self):
    """Load the blocks table, filter by minimal CpG count, and return it.

    Falls back to the genome's default blocks file when no path was given.
    """
    if self.verbose:
        eprint('loading blocks...')
    names = ['chr', 'start', 'end', 'startCpG', 'endCpG']
    cols = range(len(names))
    nrows = DEBUG_NR if self.args.debug else None
    blocks_path = self.args.blocks_path
    if blocks_path is None:
        blocks_path = GenomeRefPaths().blocks
    # bug fix: read from the resolved blocks_path — the default fallback was
    # previously computed but ignored (read_csv used self.args.blocks_path)
    df = pd.read_csv(blocks_path,
                     sep='\t',
                     header=None,
                     names=names,
                     nrows=nrows,
                     usecols=cols)
    df['lenCpG'] = df['endCpG'] - df['startCpG']
    self.keepinds = df['lenCpG'] >= self.args.min_cpg
    self.orig_nr_blocks = df.shape[0]
    df = df[self.keepinds].reset_index(drop=True)
    self.nr_blocks = df.shape[0]
    if self.verbose:
        eprint(f'loaded {self.orig_nr_blocks:,}')
        if self.nr_blocks != self.orig_nr_blocks:
            # bug fix: typo in message ('droppd')
            eprint(
                f'dropped to {self.nr_blocks:,} with >={self.args.min_cpg} CpGs'
            )
    return df
def add_bed_to_cpgs(site_file, genome, out_path=None):
    """Pipe a CpG-site file through the add_loci tool to append genomic loci."""
    validate_local_exe(add_loci_tool)
    ref = GenomeRefPaths(genome)
    cmd = f'cat {site_file} | {add_loci_tool} {ref.dict_path} {ref.chrom_cpg_sizes}'
    # redirect to a file unless printing to stdout
    if (out_path is not None) and out_path != sys.stdout:
        cmd += f' > {out_path}'
    subprocess.check_call(cmd, shell=True)
def __init__(self, args):
    """Resolve the genomic region and output locations for this command."""
    self.args = args
    self.gr = GenomicRegion(args)
    self.outdir = args.outdir
    self.name = ''
    # fail early on a bad output directory
    if not op.isdir(self.outdir):
        raise IllegalArgumentError('Invalid output directory: ' + self.outdir)
    self.chrom_sizes = GenomeRefPaths(args.genome).chrom_sizes
def __init__(self, args):
    """Resolve region, validate the output directory, and preload the CpG dict."""
    self.args = args
    self.gr = GenomicRegion(args)
    self.debug = args.debug
    self.outdir = args.outdir
    # fail early on a bad output directory
    if not op.isdir(self.outdir):
        raise IllegalArgumentError('Invalid output directory: ' + self.outdir)
    self.chrom_sizes = GenomeRefPaths(args.genome).chrom_sizes
    self.ref_dict = self.load_dict()
def __init__(self, args=None, region=None, sites=None, genome_name=None):
    """Build a GenomicRegion from CLI args, a region string, or a sites tuple."""
    self.genome_name = get_genome_name(genome_name)
    self.chrom = None
    self.sites = sites
    self.region_str = region
    self.bp_tuple = None
    self.args = args
    # todo: this could be prettier
    if args is not None:
        self.genome_name = get_genome_name(args.genome)
        self.genome = GenomeRefPaths(self.genome_name)
        if args.sites:
            self.parse_sites(args.sites)
        elif args.region:
            self.parse_region(args.region)
    elif region is not None:
        self.genome = GenomeRefPaths(self.genome_name)
        self.parse_region(region)
    elif sites is not None:
        self.genome = GenomeRefPaths(self.genome_name)
        self.parse_sites(sites)
    else:
        raise IllegalArgumentError(f'Invalid GR init {region}')
    # number of CpG sites covered; None means the whole genome
    span = None if self.sites is None else self.sites[1] - self.sites[0]
    self.nr_sites = span
    self.annotation = self.add_anno()
def get_anno(df, genome, bed_file):
    """Best-effort merge of type/gene annotation columns into df via bedtools.

    On any failure, warns and returns df unchanged.
    """
    try:
        anno_path = GenomeRefPaths(genome).annotations
        if anno_path is None:
            return df
        cmd = (f'cut -f1-3 {bed_file} | sort -k1,1 -k2,2n -u | '
               f'bedtools intersect -a - -b {anno_path} -wao | '
               f'bedtools merge -i - -c 7,8 -o distinct,distinct')
        names = COORDS_COLS3 + ['type', 'gene']
        rf = read_shell(cmd, names=names)
        return df.merge(rf, how='left', on=COORDS_COLS3)
    except Exception as e:
        eprint(f'[wt convert] WARNING: No annotations added.')
        eprint(e)
        return df
def mult_pat2beta(pat_path, args):
    """Parallel pat->beta conversion: one worker per chromosome, concatenated."""
    processes = []
    with Pool(args.threads) as p:
        ct = GenomeRefPaths(args.genome).get_chrom_cpg_size_table()
        # cumulative CpG borders, 1-based: x[i]..x[i+1] covers chromosome i
        x = np.cumsum([0] + list(ct['size'])) + 1
        for i, chrom in enumerate(list(ct['chr'])):
            job = p.apply_async(chr_thread, (pat_path, chrom, x[i], x[i + 1]))
            processes.append(job)
        p.close()
        p.join()
    beta_files = [pr.get() for pr in processes]
    return np.concatenate(beta_files, axis=0)
def mult_pat2beta(pat_path, out_beta, nr_sites, args):
    """Parallel pat->beta conversion: per-chromosome partial betas summed up."""
    jobs = []
    with Pool(args.threads) as p:
        table = GenomeRefPaths(args.genome).get_chrom_cpg_size_table()
        for chrom in sorted(list(table['chr'])):
            beta = '{}.{}.beta'.format(op.splitext(out_beta)[0], chrom)
            jobs.append(p.apply_async(chr_thread, (chrom, pat_path, beta, nr_sites)))
        p.close()
        p.join()
    # sum the per-chromosome partial counts, deleting each temp file
    res = np.zeros((nr_sites, 2), dtype=np.uint8)
    for bpath in [j.get() for j in jobs]:
        res += load_beta_data(bpath)
        os.remove(bpath)
    res.tofile(out_beta)
    return out_beta
class AddCpGsToBed:
    """Append CpG-index columns to a bed file, one chromosome per worker."""

    def __init__(self, args):
        self.args = args
        self.out_path = args.out_path
        self.debug = args.debug
        if not delete_or_skip(self.out_path, self.args.force):
            return
        # load bed file (truncated in debug mode):
        row_cap = 100000 if self.debug else None
        self.df = load_bed(args.bed_path, row_cap)
        self.genome = GenomeRefPaths(args.genome)
        # load chromosomes sizes (in CpGs), as cumulative borders:
        self.cf = self.genome.get_chrom_cpg_size_table()
        self.cf['size'] = np.cumsum(self.cf['size'])
        self.proc_bed()

    def proc_bed(self):
        """Fan out per-chromosome work, then concat and dump the result."""
        jobs = []
        with Pool(self.args.threads) as p:
            present = self.df.chr.unique()
            chroms = [x for x in self.cf.chr if x in present]
            for chrom in sorted(chroms):
                subdf = self.df[self.df.chr == chrom]
                jobs.append(p.apply_async(chr_thread, (subdf, chrom, self.cf)))
            p.close()
            p.join()
        r = pd.concat([j.get() for j in jobs])
        if self.out_path is None:
            self.out_path = sys.stdout
        # r = self.reorder_to_original(r)
        r.to_csv(self.out_path,
                 sep='\t',
                 header=None,
                 index=None,
                 na_rep='NaN',
                 mode='a')

    def reorder_to_original(self, res):
        """Restore the input bed's row order after the per-chromosome shuffle."""
        return self.df[COORDS_COLS].merge(res, how='left', on=COORDS_COLS)
def pat2beta(pat_path, out_dir, args, force=True):
    """Convert a pat file to a beta file (multi-threaded when indexed)."""
    validate_single_file(pat_path)
    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + '.beta')
    if not delete_or_skip(out_beta, force):
        return
    nr_sites = GenomeRefPaths(args.genome).nr_sites
    # parallel path requires an indexed (.csi) bgzipped pat file
    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(pat_path + '.csi'):
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)
    cmd += ' {} | {} {} {}'.format(pat_path, PAT2BETA_TOOL, out_beta, nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
def bedtools_conversion(bed_file, genome, drop_empty, add_anno, debug):
    """Annotate a bed file with CpG-index columns using a tabix/bedtools pipeline.

    :param bed_file: path to input bed file
    :param genome: genome name (passed to GenomeRefPaths)
    :param drop_empty: drop regions containing no CpGs
    :param add_anno: also merge in genomic annotations
    :param debug: print the shell command and keep the temp file
    :return: annotated DataFrame
    :raises IllegalArgumentError: if the pipeline produced no output
    """
    df = load_bed(bed_file)
    # dump sorted, deduplicated 3-column regions into a temp file for tabix
    tmp_name = tempfile.NamedTemporaryFile().name
    df.sort_values(by=['chr', 'start', 'end']).iloc[:, :3].\
        drop_duplicates().to_csv(tmp_name, sep='\t', header=None, index=None)
    ref = GenomeRefPaths(genome).dict_path
    cmd = f"tabix -R {tmp_name} {ref} | "
    cmd += "awk -v OFS='\t' '{print $1,$2,$2+1,$3}' | "
    cmd += " sort -k1,1 -k2,2n -u | "  # sort is required for cases of overlapping blocks
    cmd += f"bedtools intersect -sorted -b - -a {tmp_name} -loj | "
    cmd += f"bedtools groupby -g 1,2,3 -c 7,7 -o first,last | "
    # endCpG is exclusive, hence the +1 on the last site index
    cmd += " awk -v OFS='\t' '{print $1,$2,$3,$4,$5+1;}' "
    cmd += "| sed 's/\.\t1/NA\tNA/g'"  # replace missing values with (NA)s
    if debug:
        eprint(cmd.replace('\t', '\\t'))
    rf = read_shell(cmd, names=COORDS_COLS5)
    if not debug:
        os.unlink(tmp_name)
    # if there are missing values, the CpG columns' type
    # will be float or object. Change it to Int64
    if rf.empty:
        raise IllegalArgumentError(
            '[wt convert] Error: failed with bedtools wrapping')
    if rf['startCpG'].dtype != int:
        rf = rf.astype({'startCpG': 'Int64', 'endCpG': 'Int64'})
    # merge back into the original table, preserving order and extra columns
    df = df.merge(rf, how='left', on=COORDS_COLS3)
    df = df[COORDS_COLS5 + list(df.columns)[3:-2]]
    if drop_empty:
        df.dropna(inplace=True, subset=['startCpG', 'endCpG'])
    # add annotations:
    if add_anno:
        df = get_anno(df, genome, bed_file)
    return df
def test_multiple_regions(tabixed_bed_file, pat_file, num_threads, out_file,
                          is_strict, min_len, verbose):
    """Run the bimodality test per chromosome, FDR-correct, and write results.

    :param tabixed_bed_file: indexed blocks file with CpG columns
    :param pat_file: pat file to test
    :param num_threads: worker process count
    :param out_file: output path, or "-" for stdout
    """
    peek_df = pd.read_csv(tabixed_bed_file,
                          sep='\t',
                          nrows=1,
                          header=None,
                          comment='#')
    names = COORDS_COLS5
    if len(peek_df.columns) < len(names):
        # bug fix: messages referenced an undefined name (blocks_path),
        # raising NameError instead of the intended IllegalArgumentError
        msg = f'Invalid blocks file: {tabixed_bed_file}. less than {len(names)} columns.\n'
        msg += f'Run wgbstools convert -L {tabixed_bed_file} -o OUTPUT_REGION_FILE to add the CpG columns'
        raise IllegalArgumentError(msg)
    chroms = GenomeRefPaths().get_chroms()
    params_list = ((tabixed_bed_file, c, pat_file, is_strict, min_len, verbose)
                   for c in chroms)
    p = Pool(num_threads)
    arr = p.starmap(read_blocks_and_test, params_list)
    p.close()
    p.join()
    region_p_val_list = [p_reg for x in arr for p_reg in x]  # flatten
    if not region_p_val_list:
        if verbose:
            eprint('[wt bimodal] empty list')
        return
    region_p_val_list = sorted(region_p_val_list, key=lambda elem: elem[1])
    [block_lines, p_vals] = zip(*region_p_val_list)
    accepted_blocks, corrected_p_vals = choose_blocks_by_fdr_bh(
        p_vals, block_lines)
    # bug fix: the previous `with ... else sys.stdout` closed sys.stdout on
    # exit; only close the handle when we actually opened a file
    f_out = open(out_file, "w") if out_file != "-" else sys.stdout
    try:
        for accepted_block, corrected_p_val in zip(accepted_blocks,
                                                   corrected_p_vals):
            f_out.write(f"{accepted_block}\t{corrected_p_val:,.1e}\n")
    finally:
        if f_out is not sys.stdout:
            f_out.close()
class GenomicRegion:
    """A genomic region in two coordinate systems: a locus range
    (chrom:start-end) and a CpG-site index range. Built from parsed CLI
    args or from a region string."""

    def __init__(self, args=None, region=None, name='hg19'):
        self.genome_name = name
        self.chrom = None
        self.sites = None
        self.region_str = None
        self.bp_tuple = None
        self.chrs_sz = None  # DataFrame of chromosomes sizes (in number of sites)
        self.name = name
        self.args = args
        # todo: this could be prettier
        if args is not None:
            self.name = args.genome
            self.genome = GenomeRefPaths(self.name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is not None:
            self.genome = GenomeRefPaths(self.name)
            self.parse_region(region)
        else:
            raise IllegalArgumentError('Invalid GR init {}'.format(region))
        # number of CpG sites covered; None means the whole genome
        self.nr_sites = None if self.sites is None else self.sites[
            1] - self.sites[0]
        self.annotation = self.add_anno()

    def add_anno(self):
        """Fetch annotation text for this region via tabix, if available.

        Returns None when annotations don't apply (whole genome, --no_anno,
        no annotation file) or on tabix failure."""
        if self.args is None or self.is_whole():
            return
        if self.args.no_anno:
            return
        anno_path = self.genome.annotations
        if anno_path is None:
            return
        try:
            cmd = 'tabix {} {} | cut -f4- | uniq'.format(
                anno_path, self.region_str)
            res = subprocess.check_output(cmd, shell=True).decode().strip()
            return res
        except subprocess.CalledProcessError:
            eprint('Failed to retrieve annotation for reagion ',
                   self.region_str)
            return

    def parse_sites(self, sites_str):
        """ Parse input of the type -s / --sites (e.g 15-25) """
        # Parse sites string:
        s1, s2 = self._sites_str_to_tuple(sites_str)
        # Translate sites indexes to genomic loci:
        self.chrom, region_from = self.index2locus(s1)
        chrom2, region_to = self.index2locus(s2 - 1)  # non-inclusive
        region_to += 1  # include the whole last site (C and G)
        if self.chrom != chrom2:
            eprint('ERROR: sites range cross chromosomes! \n({}, {})'.format(
                s1, s2))
            raise IllegalArgumentError('Invalid sites input')
        # Update GR fields:
        self.sites = (s1, s2)
        self.region_str = "{}:{}-{}".format(self.chrom, region_from, region_to)
        self.bp_tuple = (region_from, region_to)

    def _chrome_size(self):
        # size (in bp) of the current chromosome, from the chrom.sizes file
        df = pd.read_csv(self.genome.chrom_sizes,
                         sep='\t',
                         header=None,
                         names=['chr', 'size'])
        return int(df[df['chr'] == self.chrom]['size'])

    def parse_region(self, region):
        """ Parse input of the type -r / --region (e.g chr11:200-300) """
        region = region.replace(',', '')  # remove commas
        chrome_match = re.match(r'^chr([\d]+|[XYM])$', region)
        region_match = re.match(r'chr([\d]+|[XYM]):([\d]+)-([\d]+)', region)
        # In case region is a whole chromosome
        if chrome_match:
            self.chrom = 'chr' + chrome_match.group(1)
            region_from = 1
            region_to = self._chrome_size()
        # match region string to format chrom:from-to
        elif region_match:
            self.chrom = 'chr' + region_match.group(1)
            region_from = int(region_match.group(2))
            region_to = int(region_match.group(3))
            if region_to <= region_from:
                raise IllegalArgumentError(
                    'Invalid genomic region: {}. end before start'.format(
                        region))
            if region_to > self._chrome_size() or region_from < 1:
                raise IllegalArgumentError(
                    'Invalid genomic region: {}. \nOut of range'.format(region))
        else:
            raise IllegalArgumentError(
                'Invalid genomic region: {}'.format(region))
        # Update GR fields:
        self.region_str = region
        self.sites = self._region_str2sites()
        self.bp_tuple = (region_from, region_to)

    def _region_str2sites(self):
        # find CpG indexes in range of the region:
        # todo: find start and end separately (in case they are far apart)
        cmd = 'tabix {} {} | '.format(self.genome.dict_path, self.region_str)
        # cmd += 'awk \'{if (NR == 1) {first=substr($4,4)}}END{print first"-"substr($4,4)}\''
        cmd += 'awk \'{if (NR == 1) {first=$3}}END{print first"-"$3+1}\''
        # eprint(cmd)
        res = subprocess.check_output(cmd, shell=True).decode()
        # throw error if there are no CpGs in range
        if res.strip() == '-1':
            raise IllegalArgumentError(
                'Invalid genomic region: {}. No CpGs in range'.format(
                    self.region_str))
        s1, s2 = self._sites_str_to_tuple(res)
        # s2 += 1  # non-inclusive
        return s1, s2

    def _sites_str_to_tuple(self, sites_str):
        """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """
        if sites_str:
            sites_str = sites_str.replace(',', '')
            matchObj = re.match(r'([\d]+)-([\d]+)', sites_str)
            if matchObj:
                site1 = int(matchObj.group(1))
                site2 = int(matchObj.group(2))
                # sites must satisfy 1 <= site1 < site2 <= nr_sites + 1
                if not self.genome.nr_sites + 1 >= site2 > site1 >= 1:
                    msg = 'sites violate the constraints: '
                    msg += '{} >= {} > {} >= 1'.format(
                        self.genome.nr_sites + 1, site2, site1)
                    raise IllegalArgumentError(msg)
                return site1, site2
        raise IllegalArgumentError(
            'sites must be of format: ([\d])-([\d]).\nGot: {}'.format(
                sites_str))

    def index2chrom(self, site):
        """Map a CpG site index to its chromosome via cumulative borders."""
        if self.chrs_sz is None:
            # lazily build cumulative chromosome borders
            self.chrs_sz = self.genome.get_chrom_cpg_size_table()
            self.chrs_sz['borders'] = np.cumsum(self.chrs_sz['size'])
        cind = np.searchsorted(
            np.array(self.chrs_sz['borders']).flatten(), site)
        return self.chrs_sz['chr'].loc[cind]

    def index2locus(self, index):
        """
        translate CpG index to genomic locus. e.g, CpG1 -> (chr1, 10469)
        :param index: a site index in range [1, NR_SITES]
        :return: chromosome, locus
        """
        index = int(index)
        # validate input
        if not self.genome.nr_sites + 1 >= index >= 1:
            print('Invalid site index:', index)
            raise IllegalArgumentError('Out of range site index:', index)
        # find locus: seek into the reverse dictionary (one int32 per site)
        with open(self.genome.revdict_path, 'rb') as f:
            f.seek((index - 1) * 4)
            loc = np.fromfile(f, dtype=np.int32, count=1)[0] - 1
        return self.index2chrom(index), loc

    def __str__(self):
        if self.sites is None:
            return 'Whole genome'
        s1, s2 = self.sites
        f, t = self.bp_tuple
        # res = '{} ({:,} sites, {:,} bp, sites {}-{})'.format(self.region_str, s2 - s1, t - f + 1, s1, s2)
        res = '{} - {:,}bp, {:,}CpGs: {}-{}'.format(self.region_str,
                                                    t - f + 1, s2 - s1, s1, s2)
        if self.annotation:
            res += '\n' + self.annotation
        return res

    def is_whole(self):
        """
        :return: True iff no filters (-r, -s) were applied. i.e, this gr is the whole genome.
        """
        return self.sites is None
class GenomicRegion:
    """A genomic region in two coordinate systems: a locus range
    (chrom:start-end) and a CpG-site index range. Built from parsed CLI
    args, a region string, or a sites tuple."""

    def __init__(self, args=None, region=None, sites=None, genome_name=None):
        self.genome_name = get_genome_name(genome_name)
        self.chrom = None
        self.sites = sites
        self.region_str = region
        self.bp_tuple = None
        self.args = args
        # todo: this could be prettier
        if args is not None:
            self.genome_name = get_genome_name(args.genome)
            self.genome = GenomeRefPaths(self.genome_name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_region(region)
        elif sites is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_sites(sites)
        else:
            raise IllegalArgumentError(f'Invalid GR init {region}')
        # number of CpG sites covered; None means the whole genome
        self.nr_sites = None if self.sites is None else self.sites[
            1] - self.sites[0]
        self.annotation = self.add_anno()

    def add_anno(self):
        """Fetch annotation text for this region via tabix, if available.

        Returns None when annotations don't apply; 'no_anno' may be absent
        from older argument namespaces, hence the membership test."""
        if self.args is None or self.is_whole() or 'no_anno' not in self.args:
            return
        elif self.args.no_anno:
            return
        anno_path = self.genome.annotations
        if anno_path is None:
            return
        try:
            cmd = f'tabix {anno_path} {self.region_str} | cut -f4- | uniq'
            return subprocess.check_output(cmd, shell=True).decode().strip()
        except subprocess.CalledProcessError:
            eprint(
                f'Failed to retrieve annotation for reagion {self.region_str}')

    def parse_sites(self, sites_str):
        """ Parse input of the type -s / --sites (e.g 15-25) """
        # Parse sites string:
        s1, s2 = self._sites_str_to_tuple(sites_str)
        # Translate sites indexes to genomic loci:
        self.chrom, region_from = self.index2locus(s1)
        chrom2, region_to = self.index2locus(s2 - 1)  # non-inclusive
        region_to += 1  # include the whole last site (C and G)
        if self.chrom != chrom2:
            eprint(f'ERROR: sites range cross chromosomes! \n({s1}, {s2})')
            raise IllegalArgumentError('Invalid sites input')
        # Update GR fields:
        self.sites = (s1, s2)
        self.region_str = f'{self.chrom}:{region_from}-{region_to}'
        self.bp_tuple = (region_from, region_to)

    def _chrome_size(self):
        # size (in bp) of the current chromosome
        df = self.genome.get_chrom_size_table()
        return int(df[df['chr'] == self.chrom]['size'])

    def find_region_format(self, region):
        """Normalize a region string to chrom:from-to and return its parts."""
        region = region.replace(',', '')  # remove commas
        # In case region is a whole chromosome
        chrome_match = re.match(r'^(chr)?([\d]+|[XYM]|(MT))$', region)
        if chrome_match:
            if region not in self.genome.get_chroms():
                raise IllegalArgumentError(f'Unknown chromosome: {region}')
            self.chrom = region
            return region, 1, self._chrome_size()
        # match region string to format chrom:from (single locus)
        uni_region_match = re.match(r'^(chr)?([\d]+|[XYM]|(MT)):([\d]+)$',
                                    region)
        if uni_region_match:
            region_from = uni_region_match.group(4)
            region += f'-{int(region_from) + 1}'
        # match region string to format chrom:from-to
        region_match = re.match(
            r'^((chr)?([\d]+|[XYM]|(MT))):([\d]+)-([\d]+)$', region)
        if not region_match:
            raise IllegalArgumentError(f'Invalid genomic region: {region}')
        self.chrom = region_match.group(1)
        if self.chrom not in self.genome.get_chroms():
            raise IllegalArgumentError(f'Unknown chromosome: {region}')
        region_from = int(region_match.group(5))
        region_to = int(region_match.group(6))
        return region, region_from, region_to

    def parse_region(self, region):
        """ Parse input of the type -r / --region (e.g chr11:200-300) """
        self.region_str, region_from, region_to = self.find_region_format(
            region)
        # validate region range:
        if region_to <= region_from:
            raise IllegalArgumentError(
                f'Invalid genomic region: {region}. end before start')
        if region_to > self._chrome_size() or region_from < 1:
            raise IllegalArgumentError(
                f'Invalid genomic region: {region}. \nOut of range')
        # Update GR fields:
        self.bp_tuple = (region_from, region_to)
        self.sites = self._region_str2sites()

    def _region_str2sites(self):
        # find CpG indexes in range of the region:
        cmd = f'tabix {self.genome.dict_path} {self.region_str} | '
        # cmd += 'awk \'(NR==1){first=$3} {lbp=$2} END{print first"-"$3+1}\''
        # if bp_tuple[1] equals exactly a loci of a CpG site, this site is *not* included
        # e.g., in hg19, chr6:71046415-71046562 is 9718430-9718435 in sites
        cmd += f"awk -v b={self.bp_tuple[1]} "
        cmd += '\'(NR==1){first=$3} END{if ($2<b) {r+=1}; print first"-"$3+r}\''
        res = subprocess.check_output(cmd, shell=True).decode()
        # eprint(cmd)
        # identical start/end means tabix returned nothing
        if len(set(res.strip().split('-'))) == 1:
            res = '-1'
        # throw error if there are no CpGs in range
        if res.strip() == '-1':
            raise IllegalArgumentError(
                f'Invalid genomic region: {self.region_str}. No CpGs in range')
        s1, s2 = self._sites_str_to_tuple(res)
        # s2 += 1  # non-inclusive
        return s1, s2

    def _sites_str_to_tuple(self, sites_str):
        """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """
        if not sites_str:
            raise IllegalArgumentError(f'Empty sites string: {sites_str}')
        sites_str = sites_str.replace(',', '')
        # start-end syntax
        matchObj = re.match(r'([\d]+)-([\d]+)', sites_str)
        if matchObj:
            site1 = int(matchObj.group(1))
            site2 = int(matchObj.group(2))
        # single site syntax:
        elif '-' not in sites_str and sites_str.isdigit():
            site1 = int(sites_str)
            site2 = site1 + 1
        else:
            raise IllegalArgumentError(
                f'sites must be of format: "start-end" or "site" .\nGot: {sites_str}'
            )
        # validate sites are in range:
        if not self.genome.get_nr_sites() + 1 >= site2 >= site1 >= 1:
            msg = 'sites violate the constraints: '
            msg += f'{self.genome.get_nr_sites() + 1} >= {site2} > {site1} >= 1'
            raise IllegalArgumentError(msg)
        if site1 == site2:
            site2 += 1
        return site1, site2

    def index2locus(self, index):
        """
        translate CpG index to genomic locus. e.g, CpG1 -> (chr1, 10469)
        :param index: a site index in range [1, NR_SITES]
        :return: chromosome, locus
        """
        index = int(index)
        # validate input
        if not self.genome.get_nr_sites() + 1 >= index >= 1:
            eprint('Invalid site index:', index)
            raise IllegalArgumentError('Out of range site index:', index)
        # find chromosome:
        chrom = index2chrom(index, self.genome)
        # find locus: query the tabix-indexed reverse dictionary
        cmd = f'tabix {self.genome.revdict_path} {chrom}:{index}-{index} | cut -f2'
        try:
            loc = int(
                subprocess.check_output(cmd, shell=True).decode().strip())
        except ValueError as e:
            msg = f'Failed retrieving locus for site {index} with command:\n{cmd}\n{e}'
            raise IllegalArgumentError(msg)
        return chrom, loc

    def __str__(self):
        if self.sites is None:
            return 'Whole genome'
        s1, s2 = self.sites
        nr_bp = np.diff(self.bp_tuple)[0] + 1
        res = f'{self.region_str} - {nr_bp:,}bp, {s2 - s1:,}CpGs: {s1}-{s2}'
        if self.annotation:
            res += '\n' + self.annotation
        return res

    def is_whole(self):
        """ True iff no filters (-r, -s) were applied. i.e, this gr is the whole genome."""
        return self.sites is None
class SegmentByChunks:
    """Segment beta files into homogeneous blocks by processing the genome
    in fixed-size chunks and stitching the per-chunk results."""

    def __init__(self, args, betas):
        self.betas = betas
        # cap block size both by CpG count and by bp (a CpG spans >= 2bp)
        max_cpg = min(args.max_cpg, args.max_bp // 2)
        assert (max_cpg > 1)
        self.genome = GenomeRefPaths(args.genome)
        # parameters shared by every worker process
        self.param_dict = {
            'betas': betas,
            'pcount': args.pcount,
            'max_cpg': max_cpg,
            'max_bp': args.max_bp,
            'revdict': self.genome.revdict_path,
            'genome': self.genome
        }
        self.args = args

    def break_to_chunks(self):
        """ Break range of sites to chunks of size 'step', while keeping chromosomes separated """
        # print a warning in case chunk size is too small
        step = self.args.chunk_size
        if step < self.args.max_cpg:
            msg = '[wt segment] WARNING: chunk_size is small compared to max_cpg and/or max_bp.\n' \
                  ' It may cause wt segment to fail. It\'s best setting\n' \
                  ' chunk_size > min{max_cpg, max_bp/2}'
            eprint(msg)
        if self.args.bed_file:
            df = load_blocks_file(self.args.bed_file)[['startCpG',
                                                       'endCpG']].dropna()
            # make sure bed file has no overlaps or duplicated regions
            is_nice, msg = is_block_file_nice(df)
            if not is_nice:
                msg = '[wt segment] ERROR: invalid bed file.\n' \
                      f' {msg}\n' \
                      f' Try: sort -k1,1 -k2,2n {self.args.bed_file} | ' \
                      'bedtools merge -i - | wgbstools convert --drop_empty -p -L -'
                eprint(msg)
                raise IllegalArgumentError('Invalid bed file')
            if df.shape[0] > 2 * 1e4:
                msg = '[wt segment] WARNING: bed file contains many regions.\n' \
                      ' Segmentation will take a long time.\n' \
                      f' Consider running w/o -L flag and intersect the results\n'
                eprint(msg)
        else:  # No bed file provided
            gr = GenomicRegion(self.args)
            # whole genome - make a dummy "bed file" of the full chromosomes
            if gr.is_whole():
                cf = self.genome.get_chrom_cpg_size_table()
                cf['endCpG'] = np.cumsum(cf['size']) + 1
                cf['startCpG'] = cf['endCpG'] - cf['size']
                df = cf[['startCpG', 'endCpG']]
            # one region
            else:
                df = pd.DataFrame(columns=['startCpG', 'endCpG'],
                                  data=[gr.sites])
        # build a DataFrame of chunks, with a "tag"/label field,
        # so we know which chunks to merge later on.
        rf = pd.DataFrame()  # NOTE(review): unused; kept as-is
        tags = []
        starts = []
        ends = []
        for ind, row in df.iterrows():
            start, end = row
            # chunk borders within [start, end), plus the region end itself
            bords = list(range(start, end, step)) + [end]
            tags += [f'{start}-{end}'] * (len(bords) - 1)
            starts += bords[:-1]
            ends += bords[1:]
        return tags, starts, ends

    def run(self):
        """Segment all chunks in parallel, merge per tag, and dump blocks."""
        # break input region/s to small chunks
        tags, starts, ends = self.break_to_chunks()
        # segment each chunk separately in a single thread
        p = Pool(self.args.threads)
        params = [(dict(self.param_dict, **{'sites': (s, e)}), )
                  for s, e in zip(starts, ends)]
        arr = p.starmap(segment_process, params)
        p.close()
        p.join()
        # merge chunks from the same "tag" group
        # (i.e. the same chromosome, or the same region of the provided bed file)
        df = pd.DataFrame()
        for tag in set(tags):
            carr = [arr[i] for i in range(len(arr)) if tags[i] == tag]
            merged = self.merge_df_list(carr)
            df = pd.concat([
                df,
                pd.DataFrame({
                    'startCpG': merged[:-1],
                    'endCpG': merged[1:]
                })
            ])
        self.dump_result(df.reset_index(drop=True))

    def merge_df_list(self, dflist):
        # Given a set of chunks to merge, recursively pairwise stitch them.
        while len(dflist) > 1:
            p = Pool(self.args.threads)
            params = [(dflist[i - 1], dflist[i], self.param_dict)
                      for i in range(1, len(dflist), 2)]
            arr = p.starmap(stitch_2_dfs, params)
            p.close()
            p.join()
            # carry an odd trailing element to the next round unchanged
            last_df = [dflist[-1]] if len(dflist) % 2 else []
            dflist = arr + last_df
        return dflist[0]

    def dump_result(self, df):
        """Filter short blocks, add genomic loci, and write the final table."""
        if df.empty:
            eprint('Empty blocks array')
            return
        # sort by startCpG and filter by CpGs
        nr_blocks = df.shape[0]
        df.sort_values(by=['startCpG'], inplace=True)
        df = df[df.endCpG - df.startCpG > self.args.min_cpg - 1].reset_index(
            drop=True)
        # verbose
        nr_blocks_filt = df.shape[0]
        nr_dropped = nr_blocks - nr_blocks_filt
        eprint(f'[wt segment] found {nr_blocks_filt:,} blocks\n' \
               f' (dropped {nr_dropped:,} short blocks)')
        # add genomic loci and dump/print (temp file cleaned up either way)
        temp_path = next(tempfile._get_candidate_names())
        try:
            df.to_csv(temp_path, sep='\t', header=None, index=None)
            add_bed_to_cpgs(temp_path, self.genome.genome, self.args.out_path)
        finally:
            if op.isfile(temp_path):
                os.remove(temp_path)