Esempio n. 1
0
    def __init__(self, args=None, region=None, name='hg19'):
        """Initialize a genomic region from parsed CLI args or a region string.

        Exactly one of *args* / *region* must be supplied; otherwise an
        IllegalArgumentError is raised. *name* is the genome assembly name.
        """
        self.genome_name = name
        self.name = name
        self.args = args
        self.chrom = None
        self.sites = None
        self.region_str = None
        self.bp_tuple = None
        # DataFrame of chromosome sizes (in number of sites), filled lazily
        self.chrs_sz = None

        if args is not None:
            # CLI mode: genome comes from args; sites take precedence over region
            self.name = args.genome
            self.genome = GenomeRefPaths(self.name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is None:
            raise IllegalArgumentError('Invalid GR init {}'.format(region))
        else:
            self.genome = GenomeRefPaths(self.name)
            self.parse_region(region)

        # number of CpG sites covered, or None for the whole genome
        if self.sites is None:
            self.nr_sites = None
        else:
            first, last = self.sites
            self.nr_sites = last - first
        self.annotation = self.add_anno()
Esempio n. 2
0
 def __init__(self, args, betas):
     """Prepare the shared parameter dict used by segmentation workers."""
     self.args = args
     self.betas = betas
     # cap the window size: at least 2bp per CpG, so max_bp bounds max_cpg
     cpg_cap = min(args.max_cpg, args.max_bp // 2)
     assert cpg_cap > 1
     self.genome = GenomeRefPaths(args.genome)
     self.param_dict = dict(
         betas=betas,
         pcount=args.pcount,
         max_cpg=cpg_cap,
         max_bp=args.max_bp,
         revdict=self.genome.revdict_path,
         genome=self.genome,
     )
Esempio n. 3
0
 def set_lists(self):
     """Resolve the black/white list paths from CLI args.

     A literal ``True`` (flag given without a path) selects the default
     list shipped with the genome reference; a string is a user-supplied
     path and is validated. Returns a (blacklist, whitelist) tuple.
     """
     blacklist = self.args.blacklist
     whitelist = self.args.whitelist
     # use identity check: the value is either the bool sentinel True,
     # a path string, or None — `== True` was a fragile way to say this
     if blacklist is True:
         blacklist = GenomeRefPaths(self.args.genome).blacklist
     elif whitelist is True:
         whitelist = GenomeRefPaths(self.args.genome).whitelist
     # NOTE(review): only one of the two is validated (elif) — the flags
     # appear to be mutually exclusive upstream; confirm.
     if blacklist:
         validate_single_file(blacklist)
     elif whitelist:
         validate_single_file(whitelist)
     if self.verbose:
         eprint(f'[wt bam2pat] blacklist: {blacklist}')
         eprint(f'[wt bam2pat] whitelist: {whitelist}')
     return blacklist, whitelist
Esempio n. 4
0
def pat2beta(pat_path, out_dir, args, force=True):
    """Convert a .pat(.gz) file to a beta/lbeta file.

    Returns the output path, or None when the output exists and
    delete_or_skip declines to overwrite it.

    Raises IllegalArgumentError for an unrecognized pat suffix.
    """
    validate_single_file(pat_path)

    # pick a shell reader matching the compression of the input
    if pat_path.endswith('.pat.gz'):
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    suff = '.lbeta' if args.lbeta else '.beta'
    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + suff)
    if not delete_or_skip(out_beta, force):
        return

    # parallel path requires a bgzipped pat with a tabix .csi index
    if args.threads > 1 and pat_path.endswith('.pat.gz') and op.isfile(
            pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        # the tool takes the site range [1, nr_sites + 1)
        cmd += f' {pat_path} | {pat2beta_tool} 1 {nr_sites + 1}'
        x = subprocess.check_output(cmd, shell=True).decode()
        # np.fromstring(text, sep=...) is deprecated; split tokens explicitly
        arr = np.array(x.split(), dtype=int).reshape((-1, 2))

    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
Esempio n. 5
0
    def __init__(self, args):
        """Load the input bed file and chromosome CpG sizes, then process it."""
        self.args = args
        self.out_path = args.out_path
        self.debug = args.debug
        if not delete_or_skip(self.out_path, self.args.force):
            return

        # debug mode only peeks at the first 100K rows
        nrows = 100000 if self.debug else None
        self.df = load_bed(args.bed_path, nrows)

        self.genome = GenomeRefPaths(args.genome)

        # cumulative chromosome sizes (in CpGs) give genome-wide offsets
        self.cf = self.genome.get_chrom_cpg_size_table()
        self.cf['size'] = np.cumsum(self.cf['size'])
        self.proc_bed()
Esempio n. 6
0
def add_cpgs_to_bed(bed_file, genome, drop_empty, threads, add_anno):
    """Annotate a bed file with CpG-index columns (startCpG/endCpG).

    The work is split per chromosome across *threads* processes, then the
    results are merged back onto the original table so row order and
    CpG-less regions are preserved (unless *drop_empty* is set).
    """
    df = load_bed(bed_file)

    # cumulative chromosome sizes (in CpGs) -> genome-wide CpG offsets
    cf = GenomeRefPaths(genome).get_chrom_cpg_size_table()
    cf['size'] = np.cumsum(cf['size'])

    # only chromosomes present in both the reference and the bed file
    chroms = sorted(set(cf.chr) & set(df.chr))
    pool = Pool(threads)
    per_chrom = pool.starmap(
        chr_thread,
        [(df[df.chr == c].copy(), cf, genome) for c in chroms])
    pool.close()
    pool.join()

    res = pd.concat(per_chrom)
    # merge with the original table to keep order and empty regions
    res = df.merge(res[COORDS_COLS5], how='left', on=COORDS_COLS3)
    res = res[COORDS_COLS5 + list(res.columns)[3:-2]]

    # add annotations:
    if add_anno:
        res = get_anno(res, genome, bed_file)

    # drop regions w/o CpGs
    if drop_empty:
        res.dropna(inplace=True, subset=['startCpG', 'endCpG'])

    return res
Esempio n. 7
0
 def load_blocks_file(self):
     """Load the blocks file into a DataFrame and filter short blocks.

     Falls back to the genome's default blocks file when args.blocks_path
     is None. Blocks with fewer than args.min_cpg CpGs are dropped; the
     original/kept counts are recorded on self.
     """
     if self.verbose:
         eprint('loading blocks...')
     names = ['chr', 'start', 'end', 'startCpG', 'endCpG']
     cols = range(len(names))
     nrows = DEBUG_NR if self.args.debug else None
     blocks_path = self.args.blocks_path
     if blocks_path is None:
         blocks_path = GenomeRefPaths().blocks
     # bug fix: read the resolved path, not self.args.blocks_path, so the
     # default-blocks fallback above actually takes effect
     df = pd.read_csv(blocks_path,
                      sep='\t',
                      header=None,
                      names=names,
                      nrows=nrows,
                      usecols=cols)
     df['lenCpG'] = df['endCpG'] - df['startCpG']
     self.keepinds = df['lenCpG'] >= self.args.min_cpg
     self.orig_nr_blocks = df.shape[0]
     df = df[self.keepinds].reset_index(drop=True)
     self.nr_blocks = df.shape[0]
     if self.verbose:
         eprint(f'loaded {self.orig_nr_blocks:,}')
         if self.nr_blocks != self.orig_nr_blocks:
             eprint(
                 f'dropped to {self.nr_blocks:,} with >={self.args.min_cpg} CpGs'
             )
     return df
Esempio n. 8
0
def add_bed_to_cpgs(site_file, genome, out_path=None):
    """Pipe *site_file* through the add_loci tool to attach genomic loci.

    Output is redirected to *out_path* when it is a real path; otherwise
    it goes to the process's stdout.
    """
    validate_local_exe(add_loci_tool)
    g = GenomeRefPaths(genome)
    cmd = f'cat {site_file} | {add_loci_tool} {g.dict_path} {g.chrom_cpg_sizes}'
    redirect = (out_path is not None) and out_path != sys.stdout
    if redirect:
        cmd += f' > {out_path}'
    subprocess.check_call(cmd, shell=True)
Esempio n. 9
0
 def __init__(self, args):
     """Validate the output directory and set up genome reference paths."""
     self.args = args
     self.gr = GenomicRegion(args)
     self.outdir = args.outdir
     self.name = ''
     if not op.isdir(self.outdir):
         raise IllegalArgumentError(f'Invalid output directory: {self.outdir}')
     self.chrom_sizes = GenomeRefPaths(args.genome).chrom_sizes
Esempio n. 10
0
    def __init__(self, args):
        """Validate the output directory and load the reference dictionary."""
        self.args = args
        self.gr = GenomicRegion(args)
        self.debug = args.debug
        self.outdir = args.outdir
        if not op.isdir(self.outdir):
            raise IllegalArgumentError(f'Invalid output directory: {self.outdir}')

        self.chrom_sizes = GenomeRefPaths(args.genome).chrom_sizes
        self.ref_dict = self.load_dict()
Esempio n. 11
0
    def __init__(self, args=None, region=None, sites=None, genome_name=None):
        """Initialize from CLI args, a region string, or a sites tuple.

        Exactly one of *args* / *region* / *sites* must be supplied,
        otherwise an IllegalArgumentError is raised.
        """
        self.genome_name = get_genome_name(genome_name)
        self.chrom = None
        self.sites = sites
        self.region_str = region
        self.bp_tuple = None
        self.args = args

        if args is not None:
            # CLI mode: genome comes from args; sites take precedence
            self.genome_name = get_genome_name(args.genome)
            self.genome = GenomeRefPaths(self.genome_name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_region(region)
        elif sites is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_sites(sites)
        else:
            raise IllegalArgumentError(f'Invalid GR init {region}')

        # number of CpG sites covered, or None for the whole genome
        if self.sites is None:
            self.nr_sites = None
        else:
            start, end = self.sites
            self.nr_sites = end - start
        self.annotation = self.add_anno()
Esempio n. 12
0
def get_anno(df, genome, bed_file):
    """Best-effort: attach 'type'/'gene' annotation columns to *df*.

    On any failure a warning is printed and *df* is returned unchanged.
    """
    try:
        anno_path = GenomeRefPaths(genome).annotations
        if anno_path is None:
            return df
        cmd = (f'cut -f1-3 {bed_file} | sort -k1,1 -k2,2n -u | '
               f'bedtools intersect -a - -b {anno_path} -wao | '
               'bedtools merge -i - -c 7,8 -o distinct,distinct')
        names = COORDS_COLS3 + ['type', 'gene']
        rf = read_shell(cmd, names=names)
        return df.merge(rf, how='left', on=COORDS_COLS3)
    except Exception as e:
        # deliberate best-effort: annotation failure must not abort conversion
        eprint('[wt convert] WARNING: No annotations added.')
        eprint(e)
        return df
Esempio n. 13
0
def mult_pat2beta(pat_path, args):
    """Convert a pat file to beta values per chromosome in parallel.

    Each worker handles one chromosome's genome-wide CpG index range
    [start, end); the per-chromosome arrays are concatenated in
    chromosome order into a single genome-wide array.
    """
    processes = []
    with Pool(args.threads) as p:
        ct = GenomeRefPaths(args.genome).get_chrom_cpg_size_table()
        # cumulative sizes -> 1-based genome-wide CpG start index per chromosome
        x = np.cumsum([0] + list(ct['size'])) + 1
        chroms = list(ct['chr'])
        for i, chrom in enumerate(chroms):
            start = x[i]
            end = x[i + 1]
            params = (pat_path, chrom, start, end)
            processes.append(p.apply_async(chr_thread, params))
        p.close()
        p.join()

    # results come back in submission (chromosome) order
    beta_files = [pr.get() for pr in processes]
    res = np.concatenate(beta_files, axis=0)
    return res
Esempio n. 14
0
def mult_pat2beta(pat_path, out_beta, nr_sites, args):
    """Run pat->beta per chromosome in parallel and merge the temporary
    per-chromosome beta files into a single output file at *out_beta*."""
    processes = []

    with Pool(args.threads) as p:
        chroms = list(
            GenomeRefPaths(args.genome).get_chrom_cpg_size_table()['chr'])
        for chrom in sorted(chroms):
            # one temporary beta file per chromosome
            beta = '{}.{}.beta'.format(op.splitext(out_beta)[0], chrom)
            params = (chrom, pat_path, beta, nr_sites)
            processes.append(p.apply_async(chr_thread, params))
        p.close()
        p.join()

    # NOTE(review): summing implies each per-chromosome file is genome-sized
    # (zeros outside its chromosome) — confirm against chr_thread.
    # NOTE(review): uint8 accumulation can overflow at high coverage; verify
    # upstream clamping.
    res = np.zeros((nr_sites, 2), dtype=np.uint8)
    for bpath in [pr.get() for pr in processes]:
        res += load_beta_data(bpath)
        os.remove(bpath)  # temp files are deleted as they are consumed
    res.tofile(out_beta)
    return out_beta
Esempio n. 15
0
class AddCpGsToBed:
    """Add CpG-index columns to a bed file, processing chromosomes in
    parallel, and append the result to out_path (or stdout)."""

    def __init__(self, args):
        """Load the bed file and chromosome CpG sizes, then process."""
        self.args = args
        self.out_path = args.out_path
        self.debug = args.debug
        if not delete_or_skip(self.out_path, self.args.force):
            return

        # load bed file (debug mode peeks at the first 100K rows only):
        self.df = load_bed(args.bed_path, 100000 if self.debug else None)

        self.genome = GenomeRefPaths(args.genome)

        # load chromosomes sizes (in CpGs); cumsum turns per-chromosome
        # sizes into genome-wide offsets
        self.cf = self.genome.get_chrom_cpg_size_table()
        self.cf['size'] = np.cumsum(self.cf['size'])
        self.proc_bed()

    def proc_bed(self):
        """Process each chromosome in a worker process and append the
        concatenated result to out_path (stdout when out_path is None)."""

        processes = []
        with Pool(self.args.threads) as p:
            # keep reference chromosome order, restricted to those in the bed
            # NOTE(review): self.df.chr.unique() is recomputed per iteration;
            # hoisting it would be cheaper for many chromosomes.
            chroms = [x for x in self.cf.chr if x in self.df.chr.unique()]
            for chrom in sorted(chroms):
                params = (self.df[self.df.chr == chrom], chrom, self.cf)
                processes.append(p.apply_async(chr_thread, params))
            p.close()
            p.join()

        r = pd.concat([pr.get() for pr in processes])
        if self.out_path is None:
            self.out_path = sys.stdout
        # r = self.reorder_to_original(r)
        r.to_csv(self.out_path,
                 sep='\t',
                 header=None,
                 index=None,
                 na_rep='NaN',
                 mode='a')

    def reorder_to_original(self, res):
        """Restore the input bed's row order (currently unused above)."""
        return self.df[COORDS_COLS].merge(res, how='left', on=COORDS_COLS)
Esempio n. 16
0
def pat2beta(pat_path, out_dir, args, force=True):
    """Convert a .pat(.gz) file into a .beta file.

    Uses a parallel per-chromosome conversion when multiple threads are
    requested and a tabix index (.csi) is present; otherwise streams the
    pat file through the pat2beta tool. Returns the output path, or None
    when delete_or_skip declines to overwrite.
    """
    validate_single_file(pat_path)

    # pick a shell reader matching the input's compression
    if pat_path.endswith('.pat.gz'):
        reader = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        reader = 'cat'
    else:
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))

    out_beta = op.join(out_dir, splitextgz(op.basename(pat_path))[0] + '.beta')
    if not delete_or_skip(out_beta, force):
        return
    nr_sites = GenomeRefPaths(args.genome).nr_sites

    parallel = (args.threads > 1 and pat_path.endswith('.pat.gz')
                and op.isfile(pat_path + '.csi'))
    if parallel:
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)

    cmd = '{} {} | {} {} {}'.format(reader, pat_path, PAT2BETA_TOOL,
                                    out_beta, nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
Esempio n. 17
0
def bedtools_conversion(bed_file, genome, drop_empty, add_anno, debug):
    """Add startCpG/endCpG columns to a bed file via a tabix+bedtools pipeline.

    A sorted, deduplicated 3-column copy of the bed file is intersected
    with the genome's CpG dictionary; the resulting per-region first/last
    CpG indexes are merged back onto the original table. In debug mode the
    pipeline command is printed and the temp file is kept.
    """
    df = load_bed(bed_file)
    # write a sorted, deduplicated chrom/start/end copy for the shell pipeline
    tmp_name = tempfile.NamedTemporaryFile().name
    df.sort_values(by=['chr', 'start', 'end']).iloc[:, :3].\
        drop_duplicates().to_csv(tmp_name, sep='\t', header=None, index=None)
    ref = GenomeRefPaths(genome).dict_path
    cmd = f"tabix -R {tmp_name} {ref} | "
    cmd += "awk -v OFS='\t' '{print $1,$2,$2+1,$3}' | "
    cmd += " sort -k1,1 -k2,2n -u | "  # sort is required for cases of overlapping blocks
    cmd += f"bedtools intersect -sorted -b - -a {tmp_name} -loj | "
    cmd += f"bedtools groupby -g 1,2,3 -c 7,7 -o first,last | "
    cmd += " awk -v OFS='\t' '{print $1,$2,$3,$4,$5+1;}' "
    cmd += "| sed 's/\.\t1/NA\tNA/g'"  # replace missing values with (NA)s
    if debug:
        eprint(cmd.replace('\t', '\\t'))
    rf = read_shell(cmd, names=COORDS_COLS5)
    if not debug:
        os.unlink(tmp_name)

    # if there are missing values, the CpG columns' type
    # will be float or object. Change it to Int64
    if rf.empty:
        raise IllegalArgumentError(
            '[wt convert] Error: failed with bedtools wrapping')

    if rf['startCpG'].dtype != int:
        rf = rf.astype({'startCpG': 'Int64', 'endCpG': 'Int64'})

    # merge back onto the original table (keeps order and extra columns)
    df = df.merge(rf, how='left', on=COORDS_COLS3)
    df = df[COORDS_COLS5 + list(df.columns)[3:-2]]

    # drop regions without CpGs
    if drop_empty:
        df.dropna(inplace=True, subset=['startCpG', 'endCpG'])

    # add annotations:
    if add_anno:
        df = get_anno(df, genome, bed_file)

    return df
Esempio n. 18
0
def test_multiple_regions(tabixed_bed_file, pat_file, num_threads, out_file,
                          is_strict, min_len, verbose):
    """Run the bimodality test on all blocks, in parallel per chromosome,
    and write FDR-accepted blocks with corrected p-values to *out_file*
    ('-' means stdout)."""

    # peek at one row to validate the blocks file has the CpG columns
    peek_df = pd.read_csv(tabixed_bed_file,
                          sep='\t',
                          nrows=1,
                          header=None,
                          comment='#')
    names = COORDS_COLS5
    if len(peek_df.columns) < len(names):
        # bug fix: the original referenced an undefined name (blocks_path);
        # the file being validated is tabixed_bed_file
        msg = f'Invalid blocks file: {tabixed_bed_file}. less than {len(names)} columns.\n'
        msg += f'Run wgbstools convert -L {tabixed_bed_file} -o OUTPUT_REGION_FILE to add the CpG columns'
        raise IllegalArgumentError(msg)

    chroms = GenomeRefPaths().get_chroms()
    params_list = ((tabixed_bed_file, c, pat_file, is_strict, min_len, verbose)
                   for c in chroms)

    p = Pool(num_threads)
    arr = p.starmap(read_blocks_and_test, params_list)
    p.close()
    p.join()

    # flatten the per-chromosome lists of (block_line, p_value)
    region_p_val_list = [p_reg for x in arr for p_reg in x]
    if not region_p_val_list:
        if verbose:
            eprint('[wt bimodal] empty list')
        return
    region_p_val_list = sorted(region_p_val_list, key=lambda elem: elem[1])
    [block_lines, p_vals] = zip(*region_p_val_list)
    accepted_blocks, corrected_p_vals = choose_blocks_by_fdr_bh(
        p_vals, block_lines)
    # NOTE(review): `with sys.stdout` closes stdout on exit — confirm callers
    # do not print afterwards.
    with open(out_file, "w") if out_file != "-" else sys.stdout as f_out:
        for accepted_block, corrected_p_val in zip(accepted_blocks,
                                                   corrected_p_vals):
            f_out.write(f"{accepted_block}\t{corrected_p_val:,.1e}\n")
Esempio n. 19
0
class GenomicRegion:
    """A genomic region and its mapping to CpG-site indexes.

    Can be built from parsed CLI args (-r/--region or -s/--sites) or from
    a region string; with neither, construction fails. Holds both the
    base-pair coordinates (bp_tuple, region_str) and the corresponding
    CpG-site index range (sites).
    """

    def __init__(self, args=None, region=None, name='hg19'):
        """Build from CLI args or a region string.

        Raises IllegalArgumentError when neither args nor region is given.
        """
        self.genome_name = name
        self.chrom = None
        self.sites = None
        self.region_str = None
        self.bp_tuple = None
        self.chrs_sz = None  # DataFrame of chromosomes sizes (in number of sites)
        self.name = name
        self.args = args

        # todo: this could be prettier
        if args is not None:
            self.name = args.genome
            self.genome = GenomeRefPaths(self.name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is not None:
            self.genome = GenomeRefPaths(self.name)
            self.parse_region(region)
        else:
            raise IllegalArgumentError('Invalid GR init {}'.format(region))

        # number of CpG sites covered, or None for the whole genome
        self.nr_sites = None if self.sites is None else self.sites[
            1] - self.sites[0]
        self.annotation = self.add_anno()

    def add_anno(self):
        """Fetch annotation text for this region via tabix.

        Returns None when annotation is disabled, unavailable, or for the
        whole genome; also returns None (with a warning) on tabix failure.
        """
        if self.args is None or self.is_whole():
            return
        if self.args.no_anno:
            return
        anno_path = self.genome.annotations
        if anno_path is None:
            return
        try:

            cmd = 'tabix {} {} | cut -f4- | uniq'.format(
                anno_path, self.region_str)
            res = subprocess.check_output(cmd, shell=True).decode().strip()
            return res
        except subprocess.CalledProcessError:
            eprint('Failed to retrieve annotation for reagion ',
                   self.region_str)
            return

    def parse_sites(self, sites_str):
        """ Parse input of the type -s / --sites (e.g 15-25) """

        # Parse sites string:
        s1, s2 = self._sites_str_to_tuple(sites_str)

        # Translate sites indexes to genomic loci:
        self.chrom, region_from = self.index2locus(s1)
        chrom2, region_to = self.index2locus(s2 - 1)  # non-inclusive
        region_to += 1  # include the whole last site (C and G)
        if self.chrom != chrom2:
            eprint('ERROR: sites range cross chromosomes! ({}, {})'.format(
                s1, s2))
            raise IllegalArgumentError('Invalid sites input')

        # Update GR fields:
        self.sites = (s1, s2)
        self.region_str = "{}:{}-{}".format(self.chrom, region_from, region_to)
        self.bp_tuple = (region_from, region_to)

    def _chrome_size(self):
        """Return the current chromosome's size in base pairs."""
        df = pd.read_csv(self.genome.chrom_sizes,
                         sep='\t',
                         header=None,
                         names=['chr', 'size'])
        return int(df[df['chr'] == self.chrom]['size'])

    def parse_region(self, region):
        """ Parse input of the type -r / --region (e.g chr11:200-300) """
        region = region.replace(',', '')  # remove commas
        chrome_match = re.match(r'^chr([\d]+|[XYM])$', region)
        region_match = re.match(r'chr([\d]+|[XYM]):([\d]+)-([\d]+)', region)

        # In case region is a whole chromosome
        if chrome_match:
            self.chrom = 'chr' + chrome_match.group(1)
            region_from = 1
            region_to = self._chrome_size()

        # match region string to format chrom:from-to
        elif region_match:
            self.chrom = 'chr' + region_match.group(1)
            region_from = int(region_match.group(2))
            region_to = int(region_match.group(3))
            if region_to <= region_from:
                raise IllegalArgumentError(
                    'Invalid genomic region: {}. end before start'.format(
                        region))
            if region_to > self._chrome_size() or region_from < 1:
                raise IllegalArgumentError(
                    'Invalid genomic region: {}. Out of range'.format(region))

        else:
            raise IllegalArgumentError(
                'Invalid genomic region: {}'.format(region))

        # Update GR fields:
        self.region_str = region
        self.sites = self._region_str2sites()
        self.bp_tuple = (region_from, region_to)

    def _region_str2sites(self):
        """Translate region_str to a (start, end) CpG-site index tuple
        using the genome's tabix-indexed CpG dictionary."""
        # find CpG indexes in range of the region:
        # todo: find start and end separately (in case they are far apart)
        cmd = 'tabix {} {} | '.format(self.genome.dict_path, self.region_str)
        # cmd += 'awk \'{if (NR == 1) {first=substr($4,4)}}END{print first"-"substr($4,4)}\''
        cmd += 'awk \'{if (NR == 1) {first=$3}}END{print first"-"$3+1}\''
        # eprint(cmd)
        res = subprocess.check_output(cmd, shell=True).decode()

        # throw error if there are no CpGs in range
        if res.strip() == '-1':
            raise IllegalArgumentError(
                'Invalid genomic region: {}. No CpGs in range'.format(
                    self.region_str))

        s1, s2 = self._sites_str_to_tuple(res)
        # s2 += 1     # non-inclusive
        return s1, s2

    def _sites_str_to_tuple(self, sites_str):
        """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """
        if sites_str:
            sites_str = sites_str.replace(',', '')
            matchObj = re.match(r'([\d]+)-([\d]+)', sites_str)
            if matchObj:
                site1 = int(matchObj.group(1))
                site2 = int(matchObj.group(2))
                # enforce 1 <= site1 < site2 <= nr_sites + 1
                if not self.genome.nr_sites + 1 >= site2 > site1 >= 1:
                    msg = 'sites violate the constraints: '
                    msg += '{} >= {} > {} >= 1'.format(
                        self.genome.nr_sites + 1, site2, site1)
                    raise IllegalArgumentError(msg)
                return site1, site2
        raise IllegalArgumentError(
            'sites must be of format: ([\d])-([\d]).\nGot: {}'.format(
                sites_str))

    def index2chrom(self, site):
        """Return the chromosome containing the given CpG-site index,
        using a lazily built cumulative-size table."""
        if self.chrs_sz is None:
            self.chrs_sz = self.genome.get_chrom_cpg_size_table()
            self.chrs_sz['borders'] = np.cumsum(self.chrs_sz['size'])
        cind = np.searchsorted(
            np.array(self.chrs_sz['borders']).flatten(), site)
        return self.chrs_sz['chr'].loc[cind]

    def index2locus(self, index):
        """
        translate CpG index to genomic locus. e.g, CpG1 -> (chr1, 10469)
        :param index: a site index in range [1, NR_SITES]
        :return: chromosome, locus
        """
        index = int(index)

        # validate input
        if not self.genome.nr_sites + 1 >= index >= 1:
            print('Invalid site index:', index)
            raise IllegalArgumentError('Out of range site index:', index)

        # find locus: the reverse dictionary is a flat int32 array,
        # one 4-byte locus per site, hence the (index - 1) * 4 seek
        with open(self.genome.revdict_path, 'rb') as f:
            f.seek((index - 1) * 4)
            loc = np.fromfile(f, dtype=np.int32, count=1)[0] - 1
            return self.index2chrom(index), loc

    def __str__(self):
        """Human-readable summary, e.g. 'chr1:100-200 - 101bp, 3CpGs: 5-8'."""
        if self.sites is None:
            return 'Whole genome'
        s1, s2 = self.sites
        f, t = self.bp_tuple
        # res = '{} ({:,} sites, {:,} bp, sites {}-{})'.format(self.region_str, s2 - s1, t - f + 1, s1, s2)
        res = '{} - {:,}bp, {:,}CpGs: {}-{}'.format(self.region_str, t - f + 1,
                                                    s2 - s1, s1, s2)
        if self.annotation:
            res += '\n' + self.annotation
        return res

    def is_whole(self):
        """
        :return: True iff no filters (-r, -s) were applied. i.e, this gr is the whole genome.
        """
        return self.sites is None
Esempio n. 20
0
class GenomicRegion:
    """A genomic region and its mapping to CpG-site indexes.

    Can be built from parsed CLI args, a region string, or a sites range;
    with none of the three, construction fails. Holds both the base-pair
    coordinates (bp_tuple, region_str) and the CpG-site range (sites).
    """

    def __init__(self, args=None, region=None, sites=None, genome_name=None):
        """Build from CLI args, a region string, or a sites range.

        Raises IllegalArgumentError when none of the three is given.
        """
        self.genome_name = get_genome_name(genome_name)
        self.chrom = None
        self.sites = sites
        self.region_str = region
        self.bp_tuple = None
        self.args = args

        # todo: this could be prettier
        if args is not None:
            self.genome_name = get_genome_name(args.genome)
            self.genome = GenomeRefPaths(self.genome_name)
            if args.sites:
                self.parse_sites(args.sites)
            elif args.region:
                self.parse_region(args.region)
        elif region is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_region(region)
        elif sites is not None:
            self.genome = GenomeRefPaths(self.genome_name)
            self.parse_sites(sites)
        else:
            raise IllegalArgumentError(f'Invalid GR init {region}')

        # number of CpG sites covered, or None for the whole genome
        self.nr_sites = None if self.sites is None else self.sites[
            1] - self.sites[0]
        self.annotation = self.add_anno()

    def add_anno(self):
        """Fetch annotation text for this region via tabix.

        Returns None when annotation is disabled, missing, or for the whole
        genome; also None (with a warning) on tabix failure.
        """
        # 'no_anno' membership test relies on argparse.Namespace.__contains__
        if self.args is None or self.is_whole() or 'no_anno' not in self.args:
            return
        elif self.args.no_anno:
            return
        anno_path = self.genome.annotations
        if anno_path is None:
            return
        try:
            cmd = f'tabix {anno_path} {self.region_str} | cut -f4- | uniq'
            return subprocess.check_output(cmd, shell=True).decode().strip()
        except subprocess.CalledProcessError:
            eprint(
                f'Failed to retrieve annotation for reagion {self.region_str}')

    def parse_sites(self, sites_str):
        """ Parse input of the type -s / --sites (e.g 15-25) """

        # Parse sites string:
        s1, s2 = self._sites_str_to_tuple(sites_str)

        # Translate sites indexes to genomic loci:
        self.chrom, region_from = self.index2locus(s1)
        chrom2, region_to = self.index2locus(s2 - 1)  # non-inclusive
        region_to += 1  # include the whole last site (C and G)
        if self.chrom != chrom2:
            eprint(f'ERROR: sites range cross chromosomes! ({s1}, {s2})')
            raise IllegalArgumentError('Invalid sites input')

        # Update GR fields:
        self.sites = (s1, s2)
        self.region_str = f'{self.chrom}:{region_from}-{region_to}'
        self.bp_tuple = (region_from, region_to)

    def _chrome_size(self):
        """Return the current chromosome's size in base pairs."""
        df = self.genome.get_chrom_size_table()
        return int(df[df['chr'] == self.chrom]['size'])

    def find_region_format(self, region):
        """Normalize a region string and return (region_str, from, to).

        Accepts a whole chromosome ('chr1', 'X', 'MT'), a single position
        ('chr1:100', expanded to a 1bp range), or 'chr1:100-200'.
        """
        region = region.replace(',', '')  # remove commas

        # In case region is a whole chromosome
        chrome_match = re.match(r'^(chr)?([\d]+|[XYM]|(MT))$', region)
        if chrome_match:
            if region not in self.genome.get_chroms():
                raise IllegalArgumentError(f'Unknown chromosome: {region}')
            self.chrom = region
            return region, 1, self._chrome_size()

        # match region string to format chrom:from
        uni_region_match = re.match(r'^(chr)?([\d]+|[XYM]|(MT)):([\d]+)$',
                                    region)
        if uni_region_match:
            # single position: expand to a one-bp range
            region_from = uni_region_match.group(4)
            region += f'-{int(region_from) + 1}'

        # match region string to format chrom:from-to
        region_match = re.match(
            r'^((chr)?([\d]+|[XYM]|(MT))):([\d]+)-([\d]+)$', region)
        if not region_match:
            raise IllegalArgumentError(f'Invalid genomic region: {region}')

        self.chrom = region_match.group(1)
        if self.chrom not in self.genome.get_chroms():
            raise IllegalArgumentError(f'Unknown chromosome: {region}')
        region_from = int(region_match.group(5))
        region_to = int(region_match.group(6))

        return region, region_from, region_to

    def parse_region(self, region):
        """ Parse input of the type -r / --region (e.g chr11:200-300) """

        self.region_str, region_from, region_to = self.find_region_format(
            region)

        # validate region range:
        if region_to <= region_from:
            raise IllegalArgumentError(
                f'Invalid genomic region: {region}. end before start')
        if region_to > self._chrome_size() or region_from < 1:
            raise IllegalArgumentError(
                f'Invalid genomic region: {region}. Out of range')

        # Update GR fields:
        self.bp_tuple = (region_from, region_to)
        self.sites = self._region_str2sites()

    def _region_str2sites(self):
        """Translate region_str to a (start, end) CpG-site index tuple
        using the genome's tabix-indexed CpG dictionary."""
        # find CpG indexes in range of the region:
        cmd = f'tabix {self.genome.dict_path} {self.region_str} | '
        # cmd += 'awk \'(NR==1){first=$3} {lbp=$2} END{print first"-"$3+1}\''

        # if bp_tuple[1] equals exactly a loci of a CpG site, this site is *not* included
        # e.g., in hg19, chr6:71046415-71046562 is 9718430-9718435 in sites
        cmd += f"awk -v b={self.bp_tuple[1]} "
        cmd += '\'(NR==1){first=$3} END{if ($2<b) {r+=1}; print first"-"$3+r}\''
        res = subprocess.check_output(cmd, shell=True).decode()
        # eprint(cmd)

        # identical endpoints means no CpGs were found
        if len(set(res.strip().split('-'))) == 1:
            res = '-1'

        # throw error if there are no CpGs in range
        if res.strip() == '-1':
            raise IllegalArgumentError(
                f'Invalid genomic region: {self.region_str}. No CpGs in range')

        s1, s2 = self._sites_str_to_tuple(res)
        # s2 += 1     # non-inclusive
        return s1, s2

    def _sites_str_to_tuple(self, sites_str):
        """ extract integers tuple (e.g (120, 130)) from a sites string (e.g '120-130') """
        if not sites_str:
            raise IllegalArgumentError(f'Empty sites string: {sites_str}')

        sites_str = sites_str.replace(',', '')
        # start-end syntax
        matchObj = re.match(r'([\d]+)-([\d]+)', sites_str)
        if matchObj:
            site1 = int(matchObj.group(1))
            site2 = int(matchObj.group(2))
        # single site syntax:
        elif '-' not in sites_str and sites_str.isdigit():
            site1 = int(sites_str)
            site2 = site1 + 1
        else:
            raise IllegalArgumentError(
                f'sites must be of format: "start-end" or "site" .\nGot: {sites_str}'
            )
        # validate sites are in range:
        if not self.genome.get_nr_sites() + 1 >= site2 >= site1 >= 1:
            msg = 'sites violate the constraints: '
            msg += f'{self.genome.get_nr_sites() + 1} >= {site2} > {site1} >= 1'
            raise IllegalArgumentError(msg)
        # normalize a degenerate range to a single site
        if site1 == site2:
            site2 += 1
        return site1, site2

    def index2locus(self, index):
        """
        translate CpG index to genomic locus. e.g, CpG1 -> (chr1, 10469)
        :param index: a site index in range [1, NR_SITES]
        :return: chromosome, locus
        """
        index = int(index)
        # validate input
        if not self.genome.get_nr_sites() + 1 >= index >= 1:
            eprint('Invalid site index:', index)
            raise IllegalArgumentError('Out of range site index:', index)

        # find chromosome:
        chrom = index2chrom(index, self.genome)
        # find locus: query the tabix-indexed reverse dictionary
        cmd = f'tabix {self.genome.revdict_path} {chrom}:{index}-{index} | cut -f2'
        try:
            loc = int(
                subprocess.check_output(cmd, shell=True).decode().strip())
        except ValueError as e:
            msg = f'Failed retrieving locus for site {index} with command:\n{cmd}\n{e}'
            raise IllegalArgumentError(msg)
        return chrom, loc

    def __str__(self):
        """Human-readable summary, e.g. 'chr1:100-200 - 101bp, 3CpGs: 5-8'."""
        if self.sites is None:
            return 'Whole genome'
        s1, s2 = self.sites
        nr_bp = np.diff(self.bp_tuple)[0] + 1
        res = f'{self.region_str} - {nr_bp:,}bp, {s2 - s1:,}CpGs: {s1}-{s2}'
        if self.annotation:
            res += '\n' + self.annotation
        return res

    def is_whole(self):
        """ True iff no filters (-r, -s) were applied.
            i.e, this gr is the whole genome."""
        return self.sites is None
Esempio n. 21
0
class SegmentByChunks:
    def __init__(self, args, betas):
        """
        :param args: parsed command-line arguments; must provide max_cpg,
                     max_bp, genome and pcount attributes
        :param betas: beta file path(s) to segment
        :raises IllegalArgumentError: if the effective max_cpg is not > 1
        """
        self.betas = betas
        # a CpG site spans at least 2bp, so max_bp bounds max_cpg as well
        max_cpg = min(args.max_cpg, args.max_bp // 2)
        # validate explicitly rather than with `assert`, which is a no-op
        # when python runs with -O
        if max_cpg <= 1:
            raise IllegalArgumentError(
                f'Invalid max_cpg/max_bp combination (effective max_cpg={max_cpg})')
        self.genome = GenomeRefPaths(args.genome)
        # parameters shared by every chunk-segmentation worker
        self.param_dict = {
            'betas': betas,
            'pcount': args.pcount,
            'max_cpg': max_cpg,
            'max_bp': args.max_bp,
            'revdict': self.genome.revdict_path,
            'genome': self.genome
        }
        self.args = args

    def break_to_chunks(self):
        """ Break range of sites to chunks of size 'step',
            while keeping chromosomes separated.
            :return: (tags, starts, ends) — parallel lists; `tags` labels each
                     chunk with its source region so chunks can be merged
                     back together later on. """
        # print a warning in case chunk size is too small
        step = self.args.chunk_size
        if step < self.args.max_cpg:
            msg = '[wt segment] WARNING: chunk_size is small compared to max_cpg and/or max_bp.\n' \
                  '                      It may cause wt segment to fail. It\'s best setting\n' \
                  '                      chunk_size > min{max_cpg, max_bp/2}'
            eprint(msg)

        if self.args.bed_file:
            df = load_blocks_file(self.args.bed_file)[['startCpG',
                                                       'endCpG']].dropna()
            # make sure bed file has no overlaps or duplicated regions
            is_nice, msg = is_block_file_nice(df)
            if not is_nice:
                msg = '[wt segment] ERROR: invalid bed file.\n' \
                      f'                    {msg}\n' \
                      f'                    Try: sort -k1,1 -k2,2n {self.args.bed_file} | ' \
                      'bedtools merge -i - | wgbstools convert --drop_empty -p -L -'
                eprint(msg)
                raise IllegalArgumentError('Invalid bed file')
            if df.shape[0] > 2 * 1e4:
                msg = '[wt segment] WARNING: bed file contains many regions.\n' \
                      '                      Segmentation will take a long time.\n' \
                      f'                      Consider running w/o -L flag and intersect the results\n'
                eprint(msg)

        else:  # No bed file provided
            gr = GenomicRegion(self.args)
            # whole genome - make a dummy "bed file" of the full chromosomes
            if gr.is_whole():
                cf = self.genome.get_chrom_cpg_size_table()
                cf['endCpG'] = np.cumsum(cf['size']) + 1
                cf['startCpG'] = cf['endCpG'] - cf['size']
                df = cf[['startCpG', 'endCpG']]
            # one region
            else:
                df = pd.DataFrame(columns=['startCpG', 'endCpG'],
                                  data=[gr.sites])

        # build parallel chunk lists, with a "tag"/label per source region,
        # so we know which chunks to merge later on.
        tags = []
        starts = []
        ends = []
        for _, row in df.iterrows():
            start, end = row
            # chunk borders: start, start+step, ..., end
            bords = list(range(start, end, step)) + [end]
            tags += [f'{start}-{end}'] * (len(bords) - 1)
            starts += bords[:-1]
            ends += bords[1:]
        return tags, starts, ends

    def run(self):
        """Segment all chunks in parallel, merge per-tag results, and dump."""
        # break input region/s to small chunks
        tags, starts, ends = self.break_to_chunks()
        # segment each chunk separately in a single thread
        p = Pool(self.args.threads)
        params = [(dict(self.param_dict, sites=(s, e)),)
                  for s, e in zip(starts, ends)]
        arr = p.starmap(segment_process, params)
        p.close()
        p.join()

        # merge chunks from the same "tag" group
        # (i.e. the same chromosome, or the same region of the provided bed file)
        df = pd.DataFrame()
        for tag in set(tags):
            # collect this tag's chunk results, preserving chunk order
            carr = [res for t, res in zip(tags, arr) if t == tag]
            merged = self.merge_df_list(carr)
            # merged is a sorted border list; consecutive borders form blocks
            df = pd.concat([
                df,
                pd.DataFrame({
                    'startCpG': merged[:-1],
                    'endCpG': merged[1:]
                })
            ])
        self.dump_result(df.reset_index(drop=True))

    def merge_df_list(self, dflist):
        # Given a set of chunks to merge, recursively pairwise stich them.

        while len(dflist) > 1:
            p = Pool(self.args.threads)
            params = [(dflist[i - 1], dflist[i], self.param_dict)
                      for i in range(1, len(dflist), 2)]
            arr = p.starmap(stitch_2_dfs, params)
            p.close()
            p.join()

            last_df = [dflist[-1]] if len(dflist) % 2 else []
            dflist = arr + last_df
        return dflist[0]

    def dump_result(self, df):
        """Sort the blocks, drop short ones, annotate with genomic loci
        and dump to args.out_path (or stdout)."""
        if df.empty:
            eprint('Empty blocks array')
            return

        # sort by startCpG and filter out blocks with fewer than min_cpg CpGs
        nr_blocks = df.shape[0]
        df.sort_values(by=['startCpG'], inplace=True)
        df = df[df.endCpG - df.startCpG > self.args.min_cpg - 1].reset_index(
            drop=True)

        # verbose
        nr_blocks_filt = df.shape[0]
        nr_dropped = nr_blocks - nr_blocks_filt
        eprint(f'[wt segment] found {nr_blocks_filt:,} blocks\n' \
               f'             (dropped {nr_dropped:,} short blocks)')

        # add genomic loci and dump/print.
        # mkstemp actually creates the file atomically, unlike the private
        # tempfile._get_candidate_names(), which only generated a name and
        # was both undocumented API and race-prone.
        fd, temp_path = tempfile.mkstemp()
        os.close(fd)
        try:
            df.to_csv(temp_path, sep='\t', header=None, index=None)
            add_bed_to_cpgs(temp_path, self.genome.genome, self.args.out_path)
        finally:
            if op.isfile(temp_path):
                os.remove(temp_path)