Example 1
def main():
    """
    Visualize wgbs files
    Possible inputs:
        - pat.gz file[s]
        - beta file[s]
    """

    parser = parse_args()
    args = parser.parse_args()
    if args.uxm and not (0.5 <= args.uxm <= 1):
        parser.error("uxm value must be between 0.5 and 1")
    if args.sub_sample is not None and not (0 <= args.sub_sample <= 1):
        parser.error('[wt vis] sub-sampling rate must be within [0.0, 1.0]')

    # print title
    if args.title:
        print(args.title)

    first_file = args.input_files[0]
    if first_file.endswith(('.beta', '.bin')):
        beta_vis_main(args)
    elif first_file.endswith('.pat.gz'):
        pat_vis_main(args)
    else:
        eprint('[wt vis] Unsupported file type:', first_file)
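Note: every example below calls an `eprint` helper that is defined elsewhere in the package. A minimal sketch, assuming it simply mirrors `print` onto stderr:

import sys

def eprint(*args, **kwargs):
    # keep status/error messages out of stdout, which may be piped
    print(*args, file=sys.stderr, **kwargs)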
Example 2
    def load_blocks(self):
        # load blocks file and filter it by CpG and bg length

        df = load_blocks_file(self.args.blocks_path)
        orig_nr_blocks = df.shape[0]

        # filter by lenCpG
        df['lenCpG'] = df['endCpG'] - df['startCpG']
        df = df[df['lenCpG'] >= self.args.min_cpg]
        df = df[df['lenCpG'] <= self.args.max_cpg]

        # filter by len in bp
        df['len'] = df['end'] - df['start']
        df = df[df['len'] >= self.args.min_bp]
        df = df[df['len'] <= self.args.max_bp]

        df.reset_index(drop=True, inplace=True)

        # print stats
        if self.verbose:
            eprint(f'loaded {orig_nr_blocks:,} blocks')
            if df.shape[0] != orig_nr_blocks:
                eprint(f'dropped to {df.shape[0]:,} blocks')

        return df
Example 3
 def mbias_merge(self, name, pat_parts):
     if not self.args.mbias:
         return
     try:
         mdir = op.join(self.out_dir, name) + '.mbias'
         if not op.isdir(mdir):
             os.mkdir(mdir)
         tpaths = []
         for x in ['OB', 'OT']:
             mbias_parts = [
                 p.replace('.pat.gz', f'.mb.{x}.txt') for p in pat_parts
                 if p
             ]
             mbias_parts = [pd.read_csv(m, sep='\t') for m in mbias_parts]
             df = mbias_parts[0]
             for m in mbias_parts[1:]:
                 df += m
             cpath = op.join(mdir, name) + f'.mbias.{x}.txt'
             df.to_csv(cpath, sep='\t', index=None)
             tpaths.append(cpath)
         from mbias_plot import plot_mbias
         plot_mbias(tpaths, mdir)
     except Exception as e:
         eprint('[wt bam2pat] failed in mbias')
         eprint(e)
Example 4
    def dump_result(self, df):
        if df.empty:
            eprint('Empty blocks array')
            return

        # sort by startCpG and filter by CpGs
        nr_blocks = df.shape[0]
        df.sort_values(by=['startCpG'], inplace=True)
        df = df[df.endCpG - df.startCpG > self.args.min_cpg - 1].reset_index(
            drop=True)

        # verbose
        nr_blocks_filt = df.shape[0]
        nr_dropped = nr_blocks - nr_blocks_filt
        eprint(f'[wt segment] found {nr_blocks_filt:,} blocks\n' \
               f'             (dropped {nr_dropped:,} short blocks)')

        # add genomic loci and dump/print
        temp_path = next(tempfile._get_candidate_names())
        try:
            df.to_csv(temp_path, sep='\t', header=None, index=None)
            add_bed_to_cpgs(temp_path, self.genome.genome, self.args.out_path)
        finally:
            if op.isfile(temp_path):
                os.remove(temp_path)
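`tempfile._get_candidate_names()` above is a private CPython API that yields candidate names without creating any file. A safer sketch of the same idea using the public API (the suffix is arbitrary):

import os
import tempfile

# mkstemp creates the file atomically and returns an open descriptor,
# closing the race window left open by _get_candidate_names()
fd, temp_path = tempfile.mkstemp(suffix='.bed')
os.close(fd)  # only the path is needed; pandas reopens it for writing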
Example 5
def compare_all_paires(args):
    betas = args.betas
    sites = GenomicRegion(args).sites
    tables = [load_beta_data(b, sites) for b in betas]
    names = [op.splitext(op.basename(b))[0] for b in betas]
    # break names to lines
    nnames = []
    k = 20
    for n in names:
        lst = [n[i:i + k] for i in range(0, len(n), k)]
        nn = '\n'.join(lst)
        nnames.append(nn)

    N = len(tables)
    fig, axs = plt.subplots(N, N)
    for i in range(N):
        for j in range(i + 1):
            comp2(tables[i], tables[j], args.min_cov, axs[i, j])
        axs[i, 0].set_ylabel(nnames[i], fontsize=8)
    for j in range(N):
        axs[0, j].set_title(nnames[j], fontsize=8)

    for ax in axs.flat:
        ax.label_outer()

    fig.tight_layout()

    if args.outpath is not None:
        plt.savefig(args.outpath)
        eprint(f'[wt cmp] dumped figure to {args.outpath}')

    if args.show or args.outpath is None:
        plt.show()
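The name-wrapping loop above splits each sample name into fixed-width chunks so long labels fit in the subplot grid. A worked toy run of the same slicing idiom (the sample name is made up):

n = 'Adipocytes-Z000000T7'  # hypothetical sample name
k = 8
print([n[i:i + k] for i in range(0, len(n), k)])
# ['Adipocyt', 'es-Z0000', '00T7']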
Example 6
def read_blocks_and_test(tabixed_bed_file,
                         cur_region,
                         pat_file,
                         is_strict,
                         min_len,
                         verbose=False):
    tabix_cmd = f"tabix {tabixed_bed_file} {cur_region}"
    cur_blocks_lines = subprocess.check_output(tabix_cmd,
                                               shell=True).decode().split("\n")
    p_val_list = []
    for line in cur_blocks_lines:
        if not line.strip():
            continue
        tokens = line.split("\t")
        sites = (int(tokens[3]), int(tokens[4]))
        p_val = test_single_region(pat_file,
                                   tokens[0],
                                   sites,
                                   is_strict,
                                   min_len,
                                   should_print=False)
        p_val = p_val.astype(np.float32)
        p_val_list.append((line, p_val))
    if verbose:
        eprint(f"[wt bimodal] finished processesing {cur_region}")
    return p_val_list
Example 7
def apply_filter_wrapper(args, blocks_bins, finds, beta_path, df):
    try:
        # load beta file:
        data = load_beta_data(beta_path)

        # reduce to blocks:
        blocks_bins[-1] -= 1
        reduced_data = np.add.reduceat(data, blocks_bins)[finds][:-1]

        # dump to file
        out_name = splitext(splitext(basename(args.blocks_file))[0])[0]
        out_name = splitext(basename(beta_path))[0] + '_' + out_name + '.bin'
        out_name = out_name.replace('_genome', '')
        out_name = op.join(args.out_dir, out_name)

        trim_to_uint8(reduced_data).tofile(out_name)
        print(out_name)

        if args.bedGraph:
            with np.errstate(divide='ignore', invalid='ignore'):
                beta_vals = reduced_data[:, 0] / reduced_data[:, 1]
                eprint(beta_vals.shape, df.shape)
            # beta_vals[reduced_data[:, 1] == 0] = np.nan
            df['beta'] = beta_vals
            df.to_csv(out_name.replace('.bin', '.bedGraph'), sep='\t',
                      index=None, header=None, na_rep=-1,
                      float_format='%.2f')

    except Exception as e:
        print('Failed with beta', beta_path)
        print('Exception:', e)
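The core of `apply_filter_wrapper` is `np.add.reduceat`, which sums the per-CpG (meth, total) rows of the beta array over each block. A small worked example of that collapse, with made-up counts:

import numpy as np

# six CpG sites; columns are (methylated, total) counts
data = np.array([[1, 2], [0, 1], [3, 3], [2, 4], [0, 0], [1, 1]])
starts = np.array([0, 2, 5])  # blocks cover sites [0, 2), [2, 5), [5, 6)
print(np.add.reduceat(data, starts, axis=0))
# [[1 3]
#  [5 7]
#  [1 1]]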
Example 8
def bed2betas(args):

    # merge with the reference CpG bed file,
    # so the #lines in file will include all 28217448 sites (with NaN as 0)
    nrows = 100000 if args.debug else None
    try:
        rf = None  # Reference dictionary
        for bed in args.bed_paths:
            eprint('Converting {}...'.format(op.basename(bed)))
            # Check if bed should be skipped:
            outpath = op.join(args.outdir,
                              splitextgz(op.basename(bed))[0]) + '.beta'
            if not delete_or_skip(outpath, args.force):
                continue

            # Load dict (at most once) and bed
            if rf is None:
                rf = load_dict(nrows=nrows, genome_name=args.genome)
            df = load_bed(bed, nrows, args.genome == 'mm9')

            # merge dict with bed, then dump
            res = rf.merge(df, how='left', on=['chr', 'start']).fillna(0)
            trim_to_uint8(np.array(res[['meth', 'total']])).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint('Invalid input file.\n{}'.format(e))
        return
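`splitextgz` is another package helper not shown here. A hedged sketch of what it presumably does, treating a double extension such as '.pat.gz' as one unit:

import os.path as op

def splitextgz(path):
    # like op.splitext, but keeps a trailing '.gz' attached to the real
    # extension: 'sample.pat.gz' -> ('sample', '.pat.gz')
    base, ext = op.splitext(path)
    if ext == '.gz':
        base, pre = op.splitext(base)
        ext = pre + ext
    return base, ext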
Example 9
    def load_bins(self):
        if self.verbose:
            eprint('loading bins...')
        # breakpoint()
        nr_cols = (3 if self.args.uxm else 2)
        binsize = self.gf['binsize'][0] / self.orig_nr_blocks
        binsize /= nr_cols
        if binsize != int(binsize):
            raise IllegalArgumentError(
                'Error: bin file size does not match blocks number')

        dtype = np.uint8 if binsize == 1 else np.uint16

        dfU = pd.DataFrame()
        dfM = pd.DataFrame()
        if self.hypo:
            dfU = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]),
                           dtype=float)
        if self.hyper:
            dfM = np.zeros((self.nr_blocks, self.gf_nodup.shape[0]),
                           dtype=float)

        try:
            from tqdm import tqdm
        except ImportError:  # tqdm is optional; fall back to a no-op wrapper
            def tqdm(iterable, **kwargs):
                return iterable
        for ind, row in tqdm(self.gf_nodup.iterrows(),
                             total=self.gf_nodup.shape[0]):
            data = np.fromfile(row['full_path'], dtype).reshape(
                (-1, nr_cols))[self.keepinds, :]
            if self.hypo:
                dfU[:, ind] = table2vec(data, 'U', self.args.min_cov)
            if self.hyper:
                dfM[:, ind] = table2vec(data, 'M', self.args.min_cov)

        return self.array2df(dfU), self.array2df(dfM)
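The dtype inference in `load_bins` is plain arithmetic: the bin file stores `nr_cols` values per block, so the per-value width must come out to a whole number of bytes. A worked instance with made-up sizes:

file_size = 3000        # bytes on disk (hypothetical)
orig_nr_blocks = 1000   # blocks the file was built from
nr_cols = 3             # three columns in --uxm mode
binsize = file_size / orig_nr_blocks / nr_cols  # 1.0 byte per value
dtype = 'uint8' if binsize == 1 else 'uint16'   # -> 'uint8'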
Example 10
 def __init__(self, args):
     self.args = args
     self.dfU = pd.DataFrame()
     self.dfM = pd.DataFrame()
     self.blocks = pd.DataFrame()
     self.nr_blocks = 0
     self.orig_nr_blocks = 0
     self.keepinds = None
     self.groups = None
     self.verbose = args.verbose
     self.hyper, self.hypo = self.set_hypo_hyper(args.hyper, args.hypo)
     self.validate_args()
     # validate output dir:
     if not op.isdir(args.out_dir):
         os.mkdir(args.out_dir)
     # load groups
     self.gf = load_groups_file(args.groups_file, args.input_dir,
                                args.verbose)
     self.gf_nodup = self.gf.drop_duplicates(subset='fname').reset_index(
         drop=True)
     # validate target is in groups file
     target = self.args.target
     if target and target not in self.gf['group'].values:
         eprint(
             f'target {target} not in groups file {self.args.groups_file}')
         eprint('Possible targets:', sorted(self.gf['group'].unique()))
         raise IllegalArgumentError()
Example 11
def bed2betas(args):

    # merge with the reference CpG bed file,
    # so the #lines in file will include all 28217448 sites (with NaN as 0)
    region = 'chr1:10469-876225' if args.debug else None
    nrows = 10000 if args.debug else None
    try:
        rf = None       # Reference dictionary
        for bed in args.bed_paths:
            eprint(f'[wt bed] Converting {op.basename(bed)}...')
            # Check if bed should be skipped
            outpath = op.join(args.outdir, splitextgz(op.basename(bed))[0] + '.beta')
            if not delete_or_skip(outpath, args.force):
                continue

            # Load dict (at most once) and bed
            if rf is None:
                rf = load_dict_section(region, args.genome)
            df = load_bed(bed, nrows, args.add_one)

            # todo: implement in C++.
            # merge dict with bed, then dump
            res = rf.merge(df, how='left', on=['chr', 'start']).fillna(0)
            trim_to_uint8(np.array(res[['meth', 'total']])).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint(f'[wt bed] Invalid input file.\n{e}')
        return
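In both versions of `bed2betas`, the left-merge is what pads the output to one row per reference CpG: sites missing from the bed come out of the merge as NaN and are then zero-filled. A toy run:

import pandas as pd

rf = pd.DataFrame({'chr': ['chr1'] * 3, 'start': [100, 150, 200]})
df = pd.DataFrame({'chr': ['chr1'], 'start': [150],
                   'meth': [3], 'total': [5]})
print(rf.merge(df, how='left', on=['chr', 'start']).fillna(0))
#     chr  start  meth  total
# 0  chr1    100   0.0    0.0
# 1  chr1    150   3.0    5.0
# 2  chr1    200   0.0    0.0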
Example 12
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta
    """
    args = parse_args()

    # validate input files
    input_files = args.input_files

    # construct output path
    out_path = args.prefix + splitextgz(args.input_files[0])[1]

    if op.realpath(out_path) in [op.realpath(p) for p in args.input_files]:
        eprint(f'[wt merge] Error: output path is identical '
               f'to one of the input files: {out_path}')
        return

    if not delete_or_skip(out_path, args.force):
        return

    files_type = splitextgz(input_files[0])[1][1:]

    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
    elif files_type == 'pat.gz':
        MergePats(input_files, args.prefix + '.pat.gz', args.labels,
                  args).merge_pats()
    else:
        print('Unknown input format:', input_files[0])
        return
Example 13
 def bgzip_tabix_dict(self, dict_path):
     eprint('bgzip and index...')
     subprocess.check_call('bgzip -@ {} -f '.format(self.args.threads) +
                           dict_path,
                           shell=True)
     subprocess.check_call('tabix -Cf -b 2 -e 2 {}.gz'.format(dict_path),
                           shell=True)
Example 14
 def bgzip_tabix_dict(self, dict_path):
     eprint('[wt init] bgzip and index...')
     subprocess.check_call(f'bgzip -@ {self.args.threads} -f {dict_path}',
                           shell=True)
     subprocess.check_call(f'tabix -Cf -b 2 -e 2 {dict_path}.gz',
                           shell=True)
     return dict_path + '.gz'
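For reference, the flags used in both versions: `bgzip -@ N -f` compresses with N threads and overwrites any existing `.gz`; `tabix -C` builds a CSI index (required for very long chromosomes), `-f` overwrites an existing index, and `-b 2 -e 2` declares column 2 as both the begin and end coordinate, which suits a dictionary keyed by single CpG positions.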
Example 15
def load_seq_by_chrom(chrom, ref_path, fai_df, debug):
    eprint(chrom)

    # get chromosome's location in the fasta
    chrom, size, offset, width = fai_df[fai_df['chr'] == chrom].values[0]

    # load the chromosome's subsequence from fasta
    with open(ref_path, 'r') as f:
        f.seek(offset)
        # number of lines to read for the current chromosome
        nr_lines = size // (width - 1) + 1
        to_read = nr_lines * width
        if debug:
            to_read = min(to_read, 100 * width)
        txt = f.read(to_read)
    seq = ''.join(s.strip() for s in txt.split('\n')).upper()

    # remove possible trailing characters (belonging to the next chromosome)
    end_pos = seq.rfind('>')
    if end_pos != -1:
        seq = seq[:end_pos]

    # validate sequence length
    if len(seq) != size and not debug:
        raise IllegalArgumentError('Error while loading {} from fasta: '
                                   'read {} bases instead of {}'.format(
                                       chrom, len(seq), size))

    # Find CpG sites loci
    tf = pd.DataFrame([m.start() + 1 for m in re.finditer('CG', seq)],
                      columns=['loc'])
    tf['chr'] = chrom
    return tf[['chr', 'loc']]
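`fai_df` is built elsewhere from a `samtools faidx` index. A hedged sketch of a loader producing the four columns unpacked above, assuming `width` is the bytes per FASTA line including the newline (an assumption about the project's actual helper):

import pandas as pd

def load_fai(fai_path):
    # .fai columns: NAME, LENGTH, OFFSET, LINEBASES, LINEWIDTH
    names = ['chr', 'size', 'offset', 'linebases', 'width']
    fai = pd.read_csv(fai_path, sep='\t', header=None, names=names)
    return fai[['chr', 'size', 'offset', 'width']]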
Example 16
 def validate_nr_sites(self, nr_sites):
     if self.args.debug:
         return
     d = {'mm9': 13120864, 'hg19': 28217448}
     if self.name in d.keys():
         if nr_sites != d[self.name]:
             msg = '[wt init] WARNING: number of sites of the reference genome '
             msg += f'{self.name} is usually {d[self.name]}, but you got {nr_sites}'
             eprint(msg)
Example 17
 def validate_nr_sites(self, nr_sites):
     if self.args.debug:
         return
     d = {'mm9': 13120864, 'hg19': 28217448}
     if self.name in d.keys():
         if nr_sites != d[self.name]:
             eprint('Warning: number of sites of the reference '
                    'genome {} is usually {}, but you got {}'.format(
                        self.name, d[self.name], nr_sites))
Example 18
def is_region_empty(view_cmd, region, verbose):
    # check if there are reads in the bam file for the requested region
    view_cmd += ' | head -1'
    if not subprocess.check_output(
            view_cmd, shell=True, stderr=subprocess.PIPE).decode().strip():
        eprint(f'[wt bam2pat] Skipping region {region}, no reads found')
        if verbose:
            eprint('[wt bam2pat] ' + view_cmd)
        return True
    return False
Example 19
def main():
    """
    Find markers (blocks) to differentiate between two or more groups of samples
    (collapsed beta files or homog binary files).
    """
    args = parse_args()
    if not args.uxm:
        eprint('Only --uxm mode is currently supported')
        raise NotImplementedError  #todo: implement
    MarkersFinder(args).run()
Example 20
def print_help(short=False):
    msg = 'Usage: wgbs_tools.py COMMAND [OPTIONS]\n\nOptional commands:\n'
    for key in sorted(callbacks.keys()):
        docs = callbacks[key].__doc__
        msg += '\n- ' + key
        if docs and not short:
            msg += docs
    if short:
        msg += '\nUse [-h] or COMMAND -h flag for additional information'
    eprint(msg)
Example 21
def main():
    """
    Change the default genome reference.
    """
    args = parse_args()
    if args.name:
        set_def_ref(args.name)
        eprint(f'[wt def] changed default genome to {args.name}')
    if args.ls:
        print_genomes()
    elif not args.name:
        eprint('[wt def] you must specify either --name or -ls')
Example 22
def load_bed(bed_path, nrows=None):
    try:
        # TODO: handle a bed with a header line? But support stdin as input...
        df = pd.read_csv(bed_path,
                         sep='\t',
                         header=None,
                         nrows=nrows,
                         comment='#')
        df.columns = COORDS_COLS3 + list(df.columns)[3:]
        return df
    except pd.errors.EmptyDataError as e:
        eprint('[wt convert] ERROR: empty bed file')
        raise IllegalArgumentError('Invalid bed file') from e
Example 23
 def dump_params(self):
     """ Dump a parameter file """
     outpath = op.join(self.args.out_dir, 'params.txt')
     with open(outpath, 'w') as f:
         for key in vars(self.args):
             val = getattr(self.args, key)
             if key == 'beta_list_file':
                 val = None
             if key == 'betas':
                 val = ' '.join(val)
             f.write(f'{key}:{val}\n')
             # f.write(f'#> {sample}\n' )
     eprint(f'dumped parameter file to {outpath}')
Example 24
 def load_data_chunk(self, blocks_df):
     # load methylation data from beta files collapsed to the blocks in blocks_df
     if self.verbose:
         self.chunk_count += 1
         nr_samples = len(self.gf['fname'].unique())
         eprint(f'{self.chunk_count}/{self.nr_chunks} ) ' \
                f'loading data for {blocks_df.shape[0]:,} blocks over' \
                f' {nr_samples} samples...')
     return get_table(blocks_df=blocks_df.copy(),
                      gf=self.gf,
                      min_cov=self.args.min_cov,
                      threads=self.args.threads,
                      verbose=False,
                      group=False)
Example 25
    def get_fasta(self):
        # download fasta from UCSC, unless the fasta file is provided
        if self.ref_path is not None:
            validate_single_file(self.ref_path)
            return

        # no FASTA path provided. Attempt to download one
        ref_path = op.join(self.out_dir, f'{self.name}.fa.gz')
        url = f'https://hgdownload.soe.ucsc.edu/goldenPath/{self.name}/bigZips/{self.name}.fa.gz'
        cmd = f'curl {url} -o {ref_path}'
        eprint(
            f'[wt init] No reference FASTA provided. Attempting to download from\n\t{url}'
        )
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode:
            eprint(f'[wt init] Failed downloading reference for genome '
                   f'{self.name}: {p.returncode}\n{output.decode()}\n'
                   f'{error.decode()}')
            eprint('[wt init] Try downloading yourself and use the '
                   '--fasta_name flag, or check the "name" parameter')
            raise IllegalArgumentError('[wt init] No reference FASTA found')
        eprint('[wt init] successfully downloaded FASTA. Now gunzip and bgzip it...')
        cmd = f'gunzip {ref_path} && bgzip -@ {self.args.threads} {ref_path[:-3]}'
        subprocess.check_call(cmd, shell=True)
        self.ref_path = ref_path
Example 26
def run_single_pat(pat, args):
    eprint(pat)
    fl = FragLen(pat, args)
    if args.region or args.sites:
        x = fl.run_small_region()
    elif args.bed_file:
        x = fl.run_bed()
    else:
        x = fl.run_whole_genome()

    if not x.sum():
        eprint(f'[wt frag] Empty list of lengths for {pat}')
        return

    # print values to stdout:
    if args.verbose:
        np.savetxt(sys.stdout, x.reshape((1, -1)), fmt='%s', delimiter=' ')

    # plot:
    if args.outdir or args.display:
        if args.verbose:
            eprint('[wt frag] plotting...')
        plot_hist(x.flatten(), args.max_frag_size, pat)

    # dump figure:
    if args.outdir:
        fpath = compose_fig_path(pat, args.outdir)
        if args.verbose:
            eprint(f'[wt frag] dumping {fpath}...')
        plt.savefig(fpath)
Example 27
    def run(self):

        # load all data
        self.blocks = self.load_blocks_file()
        self.dfU, self.dfM = self.load_bins()
        for group in sorted(self.gf['group'].unique()):
            if self.args.target and group != self.args.target:
                continue
            eprint(group)
            self.group = group
            tfU = self.find_markers_group(self.dfU, 'U')
            tfM = self.find_markers_group(self.dfM, 'M')
            tf = pd.concat([tfU, tfM])
            self.dump_results(tf)
Example 28
    def set_regions(self):
        if self.gr.region_str:
            return [self.gr.region_str]

        cmd = f'samtools idxstats {self.bam_path} | cut -f1 '
        p = subprocess.Popen(cmd,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        output, error = p.communicate()
        if p.returncode or not output:
            eprint("[wt bam2pat] Failed with samtools idxstats %d\n%s\n%s" %
                   (p.returncode, output.decode(), error.decode()))
            eprint(cmd)
            eprint('[wt bam2pat] failed to find chromosomes')
            return []
        nofilt_chroms = output.decode()[:-1].split('\n')
        filt_chroms = [c for c in nofilt_chroms if 'chr' in c]
        if filt_chroms:
            filt_chroms = [
                c for c in filt_chroms if re.match(r'^chr([\d]+|[XYM])$', c)
            ]
        else:
            filt_chroms = [c for c in nofilt_chroms if c in CHROMS]
        chroms = list(sorted(filt_chroms, key=chromosome_order))
        if not chroms:
            eprint('[wt bam2pat] Failed retrieving valid chromosome names')
            raise IllegalArgumentError('Failed')

        return chroms
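The regex in `set_regions` keeps only primary chromosomes and drops alts, patches, and scaffolds. A quick check of what passes (the names are illustrative):

import re

names = ['chr1', 'chr22', 'chrX', 'chrM',
         'chr1_KI270706v1_random', 'chrUn_GL000218v1']
print([c for c in names if re.match(r'^chr([\d]+|[XYM])$', c)])
# ['chr1', 'chr22', 'chrX', 'chrM']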
Example 29
 def add_anno(self):
     if self.args is None or self.is_whole() or 'no_anno' not in self.args:
         return
     elif self.args.no_anno:
         return
     anno_path = self.genome.annotations
     if anno_path is None:
         return
     try:
         cmd = f'tabix {anno_path} {self.region_str} | cut -f4- | uniq'
         return subprocess.check_output(cmd, shell=True).decode().strip()
     except subprocess.CalledProcessError:
         eprint(
             f'Failed to retrieve annotation for region {self.region_str}')
Example 30
def run_command():
    try:
        if len(sys.argv) < 2:
            eprint('Missing command')
            print_help(short=True)
            return 1
        command = sys.argv[1]
        if command not in callbacks.keys():
            eprint('Invalid command:', command)
            print_help(short=True)
            return 1

        with patch.object(sys, 'argv', sys.argv[1:]):
            callbacks[command]()

    except IllegalArgumentError as e:
        eprint('Invalid input argument\n{}'.format(e))
        return 1
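The `patch.object(sys, 'argv', sys.argv[1:])` trick shifts the argument vector so each subcommand's own argparse sees the subcommand name as `argv[0]`; the dispatcher needs no knowledge of the subcommands' flags. A toy illustration with a made-up subcommand:

import sys
from unittest.mock import patch

def vis_main():
    # stand-in subcommand; a real one would run its own argparse here
    print(sys.argv)

sys.argv = ['wgbs_tools.py', 'vis', '-r', 'chr1:100-200']
with patch.object(sys, 'argv', sys.argv[1:]):
    vis_main()  # prints ['vis', '-r', 'chr1:100-200']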