Esempio n. 1
0
def view_gr(pat, args, get_cmd=False):
    """Compose the shell pipeline that views a pat file, and optionally run it.

    The pipeline starts with ``gunzip -c`` when the region is the whole
    genome, or with a ``tabix`` query otherwise. It is then piped through
    ``cview_tool``, optionally through ``pat_sampler``, through ``sort``
    (region queries only) and finally through ``collapse_pat_script``.

    :param pat: path of the input file; validated against the '.pat.gz' suffix
    :param args: parsed arguments. ``sub_sample`` is optional; ``out_path``
                 is only read when the command is executed.
    :param get_cmd: if True, return the composed command instead of running
                    it (no output redirection is appended in that case)
    :return: the command string when ``get_cmd`` is set, otherwise None
    """
    validate_single_file(pat, '.pat.gz')
    region = GenomicRegion(args)
    if not region.is_whole():
        s, e = region.sites
        # the tabix query starts up to MAX_PAT_LEN sites before the region
        first = max(1, s - MAX_PAT_LEN)
        stages = [f'tabix {pat} {region.chrom}:{first}-{e - 1} ']
    else:
        s = 1
        e = region.genome.get_nr_sites() + 1
        stages = [f'gunzip -c {pat} ']

    view_flags = set_view_flags(args)
    stages.append(f' | {cview_tool} --sites "{s}\t{e}" ' + view_flags)
    if getattr(args, 'sub_sample', None) is not None:  # sub-sample reads
        validate_local_exe(pat_sampler)
        stages.append(f' | {pat_sampler} {args.sub_sample} ')
    if not region.is_whole():
        stages.append(' | sort -k2,2n -k3,3 ')
    stages.append(f' | {collapse_pat_script} - ')

    if get_cmd:
        return ''.join(stages)
    if args.out_path is not None:
        stages.append(f' > {args.out_path}')
    subprocess_wrap_sigpipe(''.join(stages))
Esempio n. 2
0
    def break_to_chunks(self):
        """ Break range of sites to chunks of size 'step',
            while keeping chromosomes separated.

        The regions to break come from the bed file (``args.bed_file``) when
        one is given. Otherwise they come from the genomic region described
        by the arguments: one region per chromosome for the whole genome,
        or the single requested region.

        :return: three parallel lists with one entry per chunk:
                 tags   - 'start-end' of the region the chunk was cut from,
                          so chunks of the same region can be merged later
                 starts - start site of each chunk
                 ends   - end site of each chunk
        :raises IllegalArgumentError: if is_block_file_nice rejects the bed file
        """
        # print a warning in case chunk size is too small
        step = self.args.chunk_size
        if step < self.args.max_cpg:
            msg = '[wt segment] WARNING: chunk_size is small compared to max_cpg and/or max_bp.\n' \
                  '                      It may cause wt segment to fail. It\'s best setting\n' \
                  '                      chunk_size > min{max_cpg, max_bp/2}'
            eprint(msg)

        if self.args.bed_file:
            df = load_blocks_file(self.args.bed_file)[['startCpG',
                                                       'endCpG']].dropna()
            # make sure bed file has no overlaps or duplicated regions
            is_nice, msg = is_block_file_nice(df)
            if not is_nice:
                msg = '[wt segment] ERROR: invalid bed file.\n' \
                      f'                    {msg}\n' \
                      f'                    Try: sort -k1,1 -k2,2n {self.args.bed_file} | ' \
                      'bedtools merge -i - | wgbstools convert --drop_empty -p -L -'
                eprint(msg)
                raise IllegalArgumentError('Invalid bed file')
            if df.shape[0] > 2 * 1e4:
                msg = '[wt segment] WARNING: bed file contains many regions.\n' \
                      '                      Segmentation will take a long time.\n' \
                      '                      Consider running w/o -L flag and intersect the results\n'
                eprint(msg)

        else:  # No bed file provided
            gr = GenomicRegion(self.args)
            # whole genome - make a dummy "bed file" of the full chromosomes
            if gr.is_whole():
                cf = self.genome.get_chrom_cpg_size_table()
                cf['endCpG'] = np.cumsum(cf['size']) + 1
                cf['startCpG'] = cf['endCpG'] - cf['size']
                df = cf[['startCpG', 'endCpG']]
            # one region
            else:
                df = pd.DataFrame(columns=['startCpG', 'endCpG'],
                                  data=[gr.sites])

        # build the chunks, with a "tag"/label per chunk,
        # so we know which chunks to merge later on.
        tags = []
        starts = []
        ends = []
        for _, (start, end) in df.iterrows():
            # chunk borders: start, start+step, ..., closed by the region end
            bords = list(range(start, end, step)) + [end]
            tags += [f'{start}-{end}'] * (len(bords) - 1)
            starts += bords[:-1]
            ends += bords[1:]
        return tags, starts, ends
Esempio n. 3
0
    def __init__(self, args):
        """Keep the run arguments, check the output directory and load references.

        :param args: parsed arguments; ``debug``, ``outdir`` and ``genome``
                     are read here, and the whole object is handed to
                     GenomicRegion.
        :raises IllegalArgumentError: if ``args.outdir`` is not an existing directory
        """
        self.args = args
        self.gr = GenomicRegion(args)
        self.debug = args.debug

        outdir = args.outdir
        self.outdir = outdir
        if not op.isdir(outdir):
            raise IllegalArgumentError('Invalid output directory: ' + outdir)

        genome_paths = GenomeRefPaths(args.genome)
        self.chrom_sizes = genome_paths.chrom_sizes
        self.ref_dict = self.load_dict()
Esempio n. 4
0
    def insert_borders(self, markers):
        """Insert block-border columns into the table text and the markers line.

        Reads the character table and its start site from ``self.fullres``,
        loads the borders that fall in the table's range and inserts a
        BORDER column at each of them.

        :param markers: the markers line, a string (it is padded with ljust)
        :return: tuple (text, markers line). When no borders are found the
                 stored ``self.fullres['text']`` and the unchanged
                 ``markers`` are returned.
        """
        ctable = self.fullres['table']
        start = self.fullres['start']

        # load borders from file
        # build gr to span the whole table
        bsites = '{}-{}'.format(start, start + ctable.shape[1])
        table_gr = GenomicRegion(sites=bsites, genome_name=self.gr.genome_name)
        # NOTE(review): borders are used below as column indices of ctable,
        # so load_borders presumably returns offsets relative to table_gr - confirm
        borders = load_borders(self.blocks_path, table_gr, self.args.genome)
        if not borders.size:
            # nothing to insert
            return self.fullres['text'], markers
        # pad right columns with space, if there are missing sites before the last border/s
        missing_width = borders[-1] - ctable.shape[1]
        if missing_width > 0:
            # widen the table with blank columns so the last border index is valid
            charar = np.chararray((ctable.shape[0], missing_width))
            charar[:] = ' '
            ctable = np.concatenate([ctable, charar], axis=1)

        # insert the borders:
        txt = table2text(np.insert(ctable, borders, BORDER, axis=1))

        # insert the borders to the markers line:
        # the markers string is padded to the (possibly widened) table width first
        markers_arr = np.array(list(markers.ljust(ctable.shape[1])))[:, None]
        rmark = ''.join(np.insert(markers_arr, borders, BORDER))
        return txt, rmark
Esempio n. 5
0
 def __init__(self, args):
     """Keep the parsed arguments, build the genomic region and validate the input.

     :param args: parsed arguments; ``out_dir``, ``bam_path`` and ``debug``
                  are read here, and the whole object is handed to
                  GenomicRegion.
     """
     self.args = args
     self.out_dir = args.out_dir
     self.bam_path = args.bam_path
     self.debug = args.debug
     self.gr = GenomicRegion(args)
     # runs last, once every attribute above is set
     self.validate_input()
Esempio n. 6
0
def convert_single_region(args):
    """Print the genomic region described by ``args``.

    Without ``args.parsable`` the GenomicRegion object itself is printed.
    With it, a bare string is printed instead: the region string when the
    input was given as sites, otherwise the 'start-end' sites range.
    """
    region = GenomicRegion(args)
    if not args.parsable:
        print(region)
        return
    if args.sites:
        print(region.region_str)
    else:
        print('{}-{}'.format(*region.sites))
Esempio n. 7
0
 def __repr__(self):
     """Return the textual form of this object.

     One repeat unit is ``margin`` copies of repr(GenomicRegion()) +
     repr(SVLink()), followed by repr(repeat_contig). The unit is emitted
     ``repeat_time`` times, one per line. For the 'backward' generate type
     the repr of ``base_contig`` is put on a line of its own first.
     """
     prefix = ''
     if self.generate_type == 'backward':
         prefix = repr(self.base_contig) + '\n'
     spacer = repr(GenomicRegion()) + repr(SVLink())
     unit = spacer * self.margin + repr(self.repeat_contig)
     return prefix + '\n'.join([unit] * self.repeat_time)
Esempio n. 8
0
def compare_all_paires(args):
    """Draw a grid of pairwise comparisons between the input beta files.

    Cell (i, j) with j <= i is drawn by ``comp2`` from the tables of files
    i and j; the cells above the diagonal are left untouched. Rows and
    columns are labelled with the file base names, wrapped every 20
    characters.

    :param args: parsed arguments; ``betas``, ``min_cov``, ``outpath`` and
                 ``show`` are read here, and the whole object is handed to
                 GenomicRegion to get the sites range.
    """
    betas = args.betas
    sites = GenomicRegion(args).sites
    tables = [load_beta_data(b, sites) for b in betas]
    names = [op.splitext(op.basename(b))[0] for b in betas]
    # break names to lines
    k = 20
    nnames = ['\n'.join(n[i:i + k] for i in range(0, len(n), k))
              for n in names]

    N = len(tables)
    # squeeze=False keeps axs a 2D array even when N == 1; otherwise a single
    # Axes object is returned and axs[i, j] / axs.flat below fail
    fig, axs = plt.subplots(N, N, squeeze=False)
    for i in range(N):
        for j in range(i + 1):
            comp2(tables[i], tables[j], args.min_cov, axs[i, j])
        axs[i, 0].set_ylabel(nnames[i], fontsize=8)
    for j in range(N):
        axs[0, j].set_title(nnames[j], fontsize=8)

    for ax in axs.flat:
        ax.label_outer()

    fig.tight_layout()

    if args.outpath is not None:
        plt.savefig(args.outpath)
        eprint(f'[wt cmp] dumped figure to {args.outpath}')

    if args.show or args.outpath is None:
        plt.show()
Esempio n. 9
0
def main():
    """
    Print the content of the input file (pat/beta) as plain text.
    The output may be filtered by genomic region or sites range.
    Output goes to stdout by default.
    """
    parser = parse_args()
    args = parser.parse_args()

    rate = args.sub_sample
    if rate is not None and not (0 <= rate <= 1):
        parser.error('[wt view] sub-sampling rate must be within [0.0, 1.0]')

    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    # dispatch on the file suffix
    try:
        if input_file.endswith('.beta'):
            region = GenomicRegion(args)
            view_beta(input_file, region, args.out_path, args.bed_file)
            return
        if op.splitext(input_file)[1] in ('.lbeta', '.bin'):
            view_other_bin(input_file, args)
            return
        if input_file.endswith('.pat.gz'):
            cview(input_file, args)
            return
        raise IllegalArgumentError('Unknown input format:', input_file)

    except BrokenPipeError:
        catch_BrokenPipeError()
Esempio n. 10
0
    def __init__(self, args):
        """Load all inputs for the requested region, then print (and plot) them.

        :param args: parsed arguments; ``dists``, ``input_files``,
                     ``blocks_path``, ``color_scheme`` and ``plot`` are read
                     here, and the whole object is handed to GenomicRegion.
        """
        self.gr = GenomicRegion(args)
        self.start, self.end = self.gr.sites
        self.nr_sites = self.end - self.start
        self.args = args

        # load distances
        if args.dists:
            self.distances = self.load_pairwise_dists()
        else:
            self.distances = None

        # drop duplicated files, while keeping original order
        unique_files = []
        seen = set()
        for path in args.input_files:
            if path not in seen:
                seen.add(path)
                unique_files.append(path)
        self.files = unique_files

        # load raw data:
        self.dsets = self.load_data()

        # load borders:
        if args.blocks_path:
            self.borders = load_borders(args.blocks_path, self.gr)
        else:
            self.borders = None

        # Generate colors dictionary
        self.num2color_dict = generate_colors_dict(args.color_scheme)

        self.print_all()
        if self.args.plot:
            self.plot_all()
Esempio n. 11
0
def main():
    """
    Calculate the average coverage of one or more beta files.
    Print the results, one '<name>\\t<coverage>' line per file, and
    optionally plot them as a histogram (args.plot).
    """

    args = parse_args()

    sites = GenomicRegion(args).sites

    blocks_df = load_blocks_file(args.bed_file) if args.bed_file else None

    params = [(beta, sites, blocks_df, False) for beta in args.betas]
    # the context manager releases the worker processes even when beta_cov
    # raises; starmap blocks until every result is available
    with Pool(args.threads) as p:
        covs = p.starmap(beta_cov, params)

    for cov, beta_path in zip(covs, args.betas):
        print('{}\t{:.2f}'.format(pretty_name(beta_path), cov))

    if args.plot:
        plot_hist([pretty_name(b) for b in args.betas], covs)
Esempio n. 12
0
 def __init__(self, unq, args, gr=None):
     """Store the inputs and prepare the awk snippets.

     :param unq: stored as ``self.unq``
     :param args: parsed arguments; ``max_frag_size`` is read here
     :param gr: genomic region to use; when falsy, one is built from ``args``
     """
     self.args = args
     self.unq = unq
     self.gr = gr or GenomicRegion(args)
     max_size = args.max_frag_size
     # awk body: clip column 3 at max_size, accumulate column 5 per column-3 value
     self.fill_arr_cmd = ' {if ($3 > %s) {$3 = %s}; arr[$3] += $5}' % (
         max_size, max_size)
     # awk END block: print the accumulated values for 1..max_size
     self.print_arr_cmd = " END {for (x=1; x <= %s; x++) print arr[x]}'" % max_size
Esempio n. 13
0
def main():
    """
    Convert a beta file to a bed file.
    The input path must carry the '.beta' suffix.
    """
    args = parse_args()
    beta_path = args.beta_path
    validate_single_file(beta_path, '.beta')
    region = GenomicRegion(args)
    beta_to_bed(
        beta_path,
        region,
        args.bed_file,
        args.min_cov,
        args.mean,
        args.keep_na,
        args.force,
        args.outpath,
    )
Esempio n. 14
0
    def __init__(self, args):
        """Collect the inputs and derive the rates and output prefix.

        :param args: parsed arguments; ``pat_files``, ``cov``, ``bed_file``,
                     ``labels``, ``rates``, ``out_dir`` and ``prefix`` are
                     read here, and the whole object is handed to
                     GenomicRegion.
        """
        eprint('mixing...')
        self.args = args
        self.gr = GenomicRegion(args)
        self.pats = args.pat_files
        self.dest_cov = args.cov
        if args.bed_file:
            self.bed = load_blocks_file(args.bed_file)
        else:
            self.bed = None

        # stats table: one row per pat file, indexed by the first part
        # splitextgz returns for the file's base name
        sample_names = [splitextgz(op.basename(f))[0] for f in self.pats]
        self.stats = pd.DataFrame(index=sample_names)
        self.nr_pats = len(self.pats)
        self.labels = self.validate_labels(args.labels)

        self.dest_rates = self.validate_rates(args.rates)
        self.covs = self.read_covs()
        self.adj_rates = self.adjust_rates()

        self.prefix = self.generate_prefix(args.out_dir, args.prefix)
Esempio n. 15
0
def main(args):
    """Print the genomic region, then each pat file's name and its PatVis results."""
    pat_files = args.input_files
    validate_files_list(pat_files, '.pat.gz')

    print(GenomicRegion(args))
    for pat_file in pat_files:
        # print file name
        base_name = op.basename(pat_file)
        print(splitextgz(base_name)[0])
        PatVis(args, pat_file).print_results()
Esempio n. 16
0
 def __init__(self, args, bam):
     """Store the configuration, then immediately run the work and clean up.

     Constructing the object does the whole job: after the attributes are
     set, ``start_threads()`` and then ``cleanup()`` are called.

     :param args: parsed arguments; ``verbose`` and ``out_dir`` are read
                  here, and the whole object is handed to GenomicRegion.
     :param bam: stored as ``self.bam_path``
     """
     self.args = args
     self.tmp_dir = None  # starts unset; not assigned anywhere else in this method
     self.verbose = args.verbose
     self.out_dir = args.out_dir
     self.bam_path = bam
     self.gr = GenomicRegion(args)
     self.start_threads()
     self.cleanup()
Esempio n. 17
0
 def merge_pats(self):
     """Build the view flags for every pat file and merge the files.

     The flags depend only on ``self.args``, so the flag string is built
     once and repeated for each pat file. ``fast_merge_pats`` is given a
     list with one flag string per pat file, or None when there are no
     pat files.
     """
     nr_pats = len(self.pats)
     if not nr_pats:
         self.fast_merge_pats(None)
         return

     flags = ' '
     if self.args.strict:
         flags += ' --strict'
     if self.args.min_len:
         flags += ' --min_len {}'.format(self.args.min_len)
     if self.args.bed_file is not None:
         flags += ' -L {}'.format(self.args.bed_file)
     gr = GenomicRegion(self.args)
     if not gr.is_whole():
         flags += ' -s {}-{}'.format(*gr.sites)
     # strings are immutable, so sharing one object between entries is safe
     self.fast_merge_pats([flags] * nr_pats)
Esempio n. 18
0
 def __init__(self, args):
     """Keep the run arguments, check the output directory and load chromosome sizes.

     :param args: parsed arguments; ``outdir`` and ``genome`` are read here,
                  and the whole object is handed to GenomicRegion.
     :raises IllegalArgumentError: if ``args.outdir`` is not an existing directory
     """
     self.args = args
     self.gr = GenomicRegion(args)
     outdir = args.outdir
     self.outdir = outdir
     self.name = ''
     if not op.isdir(outdir):
         raise IllegalArgumentError('Invalid output directory: ' + outdir)
     genome_paths = GenomeRefPaths(args.genome)
     self.chrom_sizes = genome_paths.chrom_sizes
Esempio n. 19
0
def main():
    """
    Compare pairs of beta files by plotting a 2d histogram
    for every pair.
    Sites with low coverage (< cov_thresh argument) are dropped,
    for performance and robustness.
    At least two '.beta' files are required.
    """
    args = parse_args()
    betas = args.betas
    validate_files_list(betas, '.beta', min_len=2)
    min_cov = args.min_cov
    sites = GenomicRegion(args).sites
    compare_all_paires(betas, min_cov, sites)
Esempio n. 20
0
 def __init__(self, args, bam):
     """Store the configuration, then immediately start the work.

     Constructing the object runs the job: ``start_threads()`` is called
     once all attributes are set.

     :param args: parsed arguments; ``verbose``, ``out_dir``,
                  ``homog_prop`` and ``min_cpg`` are read here, and the
                  whole object is handed to GenomicRegion.
     :param bam: stored as ``self.bam_path``
     """
     self.args = args
     self.tmp_dir = None  # starts unset; not assigned anywhere else in this method
     self.verbose = args.verbose
     self.out_dir = args.out_dir
     self.bam_path = bam
     self.homog_prop = args.homog_prop
     self.min_cpg = args.min_cpg
     self.gr = GenomicRegion(args)
     self.start_threads()
Esempio n. 21
0
 def __init__(self, args, pat_path):
     """Store the view settings for one pat file and load its block.

     :param args: parsed arguments; ``max_reps``, ``blocks_path`` and
                  ``uxm`` are read here, and the whole object is handed to
                  GenomicRegion.
     :param pat_path: stored as ``self.pat_path``
     """
     self.gr = GenomicRegion(args)
     self.args = args
     # a non-positive max_reps means "no limit"
     if args.max_reps > 0:
         self.max_reps = args.max_reps
     else:
         self.max_reps = sys.maxsize
     self.start, self.end = self.gr.sites
     self.pat_path = pat_path
     self.blocks_path = args.blocks_path
     self.uxm = args.uxm
     self.uxm_counts = dict(U=0, X=0, M=0)
     self.fullres = self.get_block()
Esempio n. 22
0
def view_pat_mult_proc(input_file, strict, sub_sample, grs, i, step):
    """Run the ViewPat awk command for one slice of the regions list.

    Handles regions ``grs[i]`` .. ``grs[i + step - 1]``, clipped to the end
    of the list.

    :return: list with the raw output (as returned by
             subprocess.check_output) of each region's command, in order
    """
    outputs = []
    last = min(len(grs), i + step)
    for idx in range(i, last):
        region = GenomicRegion(region=grs[idx])
        viewer = ViewPat(input_file, sys.stdout, region, strict, sub_sample)
        outputs.append(
            subprocess.check_output(viewer.compose_awk_cmd(), shell=True))
    return outputs
Esempio n. 23
0
def main(args):
    """Print the genomic region, then each distinct pat file's name and PatVis results."""
    validate_file_list(args.input_files, '.pat.gz')

    # drop duplicated files, while keeping original order
    unique_files = drop_dup_keep_order(args.input_files)

    print(GenomicRegion(args))
    for pat_file in unique_files:
        # print file name
        base_name = op.basename(pat_file)
        print(splitextgz(base_name)[0])
        PatVis(args, pat_file).print_results()
Esempio n. 24
0
    def __init__(self, args, file):
        """Store the view settings for one file and load its block.

        :param args: parsed arguments; ``max_reps``, ``strict``, ``min_len``,
                     ``no_color``, ``blocks_path`` and ``no_dense`` are read
                     here, and the whole object is handed to GenomicRegion.
        :param file: stored as ``self.file``
        """
        self.gr = GenomicRegion(args)
        # a non-positive max_reps means "no limit"
        if args.max_reps > 0:
            self.max_reps = args.max_reps
        else:
            self.max_reps = sys.maxsize
        self.strict = args.strict
        self.min_len = args.min_len
        first_site, last_site = self.gr.sites
        self.start = first_site
        self.end = last_site
        self.file = file
        self.no_color = args.no_color
        # maximal width of the output (in characters)
        self.max_width = last_site - first_site + 2 * MAX_PAT_LEN
        self.blocks_path = args.blocks_path
        self.no_dense = args.no_dense

        self.fullres = self.get_block()
Esempio n. 25
0
def main():
    """
    Test whether a region is bimodal.
    With a bed file, every region of the file is tested; otherwise only
    the single region described by the arguments.
    """
    parser = add_args()
    args = parse_args(parser)

    if args.bed_file is None:
        region = GenomicRegion(args)
        test_single_region(args.pat, region.chrom, region.sites,
                           args.strict, args.min_len)
        return

    test_multiple_regions(args.bed_file, args.pat, args.threads,
                          args.out_file, args.strict, args.min_len,
                          args.verbose)
Esempio n. 26
0
def slow_conversion(df, genome):
    """Add CpG-index columns to a table of genomic regions, row by row.

    The first three columns of ``df`` are formatted as '<col0>:<col1>-<col2>'
    and converted with GenomicRegion, one row at a time. Rows that
    GenomicRegion rejects with IllegalArgumentError get missing values.

    :param df: table whose first three columns describe the regions;
               it is not modified
    :param genome: genome name passed on to GenomicRegion
    :return: a new DataFrame holding the first three columns plus the
             nullable-integer columns 'startCpG' and 'endCpG'
    """
    # explicit copy: adding columns to an iloc slice would otherwise
    # trigger pandas' SettingWithCopyWarning
    df = df.iloc[:, :3].copy()
    startCpGs = []
    endCpGs = []
    for _, row in df.iterrows():
        try:
            sites = GenomicRegion(region='{}:{}-{}'.format(*row),
                                  genome_name=genome).sites
        except IllegalArgumentError:
            sites = (np.nan, np.nan)
        startCpGs.append(sites[0])
        endCpGs.append(sites[1])
    # 'Int64' (nullable integer) keeps integer sites alongside missing values
    df['startCpG'] = pd.Series(startCpGs, dtype='Int64').values
    df['endCpG'] = pd.Series(endCpGs, dtype='Int64').values
    return df
Esempio n. 27
0
def main():
    """
    Convert a genomic region to a CpG index range and vice versa.
    A bed file (-L) cannot be combined with a region (-r) or sites (-s).
    """
    args = parse_args()

    if args.bed_path:
        if args.region or args.sites:
            eprint('-L, -s and -r are mutually exclusive')
        else:
            convert_bed_file(args)
        return

    print(GenomicRegion(args))
Esempio n. 28
0
    def __init__(self, args):
        """Collect the inputs and derive the rates and output prefix.

        :param args: parsed arguments; ``pat_files``, ``cov``, ``bed_file``,
                     ``labels``, ``rates``, ``out_dir`` and ``prefix`` are
                     read here, and the whole object is handed to
                     GenomicRegion.
        """
        print('in mixer')
        self.args = args
        self.gr = GenomicRegion(args)
        self.pats = args.pat_files
        self.dest_cov = args.cov
        if args.bed_file:
            self.bed = BedFileWrap(args.bed_file)
        else:
            self.bed = None

        # stats table: one row per pat file, indexed by the first part
        # splitextgz returns for the file's base name
        sample_names = [splitextgz(op.basename(f))[0] for f in self.pats]
        self.stats = pd.DataFrame(index=sample_names)
        self.nr_pats = len(self.pats)
        self.labels = self.validate_labels(args.labels)

        self.dest_rates = self.validate_rates(args.rates)
        self.covs = self.read_covs()
        self.adj_rates = self.adjust_rates()

        self.prefix = self.generate_prefix(args.out_dir, args.prefix)
Esempio n. 29
0
def multi_FragLen(args):
    """Run ``run_single_unq`` on every unq file, for the requested regions.

    The regions come from -r/-s (a single region) or from the bed file;
    with neither, an empty regions list is passed on. A bed file combined
    with -r/-s is reported as an error and nothing is run.
    """
    has_bed = bool(args.bed_file)
    has_region = bool(args.region or args.sites)
    if has_bed and has_region:
        eprint('-L, -s and -r are mutually exclusive')
        return

    if has_region:
        grs = [GenomicRegion(args)]
    elif has_bed:
        grs = BedFileWrap(args.bed_file).iter_grs()
    else:
        grs = []

    for unq_path in args.unq_paths:
        run_single_unq(unq_path, grs, args)

    if args.display:
        plt.show()
Esempio n. 30
0
def main():
    """
    Print the content of the input file (pat/unq/beta) as plain text.
    The output may be filtered by genomic region or sites range.
    Output goes to stdout by default.
    """
    args = parse_args()
    # validate input file
    input_file = args.input_file
    validate_single_file(input_file)

    rate = args.sub_sample
    if rate is not None and not (0 < rate < 1):
        eprint('sub-sampling rate must be within (0.0, 1.0)')
        return

    if args.bed_file and (args.region or args.sites):
        eprint('-L, -s and -r are mutually exclusive')
        return

    if args.bed_file:
        bed_wrapper = BedFileWrap(args.bed_file)
    else:
        bed_wrapper = None
    gr = GenomicRegion(args)

    # dispatch on the file suffix
    try:
        if input_file.endswith(('.beta', '.bin')):
            view_beta(input_file, gr, args.out_path)
        elif input_file.endswith('.pat.gz'):
            if bed_wrapper:
                view_pat_bed_multiprocess(args, bed_wrapper)
            else:
                viewer = ViewPat(input_file, args.out_path, gr, args.strict,
                                 args.sub_sample, bed_wrapper, args.min_len)
                viewer.view_pat(args.awk_engine)
        elif input_file.endswith('.unq.gz'):
            # view every bed region, or just the single requested one
            regions = bed_wrapper.iter_grs() if bed_wrapper else [gr]
            for region in regions:
                ViewUnq(input_file, args.out_path, region, args.inflate).view()
        else:
            raise IllegalArgumentError('Unknown input format:', input_file)

    except BrokenPipeError:
        # Python flushes standard streams on exit; redirect remaining output
        # to devnull to avoid another BrokenPipeError at shutdown
        null_fd = os.open(os.devnull, os.O_WRONLY)
        os.dup2(null_fd, sys.stdout.fileno())
        sys.exit(1)  # Python exits with error code 1 on EPIPE