Exemple #1
0
def main():
    """
    Entry point of the merge command.

    Parses the command line, validates the list of input files (at least
    two are required), builds the output path from args.prefix and the
    extension of the first input file, and dispatches to the merger that
    matches that extension.
    Returns None in every case; nothing is merged when the existing output
    should not be overwritten or when the input format is unknown.
    """
    args = parse_args()

    # a merge needs at least two input files
    files = args.input_files
    validate_files_list(files, min_len=2)

    # the output keeps the extension of the first input file
    out_path = args.prefix + splitextgz(args.input_files[0])[1]
    if not delete_or_skip(out_path, args.force):
        return

    # extension of the first input, without its leading character (the dot)
    files_type = splitextgz(files[0])[1][1:]

    if files_type == 'pat.gz':
        merger = MergePats(files, args.prefix + '.pat', args.labels, args)
        merger.merge_pats()
    elif files_type == 'unq.gz':
        merge_unqs()
    elif files_type in ('beta', 'bin'):
        merge_betas(files, out_path)
    else:
        print('Unknown input format:', files[0])
Exemple #2
0
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple input files,
    and output a single file of the same format.
    Supported formats: pat.gz, beta (bin files are handled like beta).

    The output path is args.prefix plus the extension of the first input
    file. Nothing is merged (and None is returned) when that path resolves
    to one of the input files, when the existing output should not be
    overwritten, or when the format of the first input file is unknown.
    """
    args = parse_args()

    input_files = args.input_files

    # construct output path
    out_path = args.prefix + splitextgz(input_files[0])[1]

    # refuse to write over an input file; resolved paths are compared so
    # that relative paths and symlinks to the same file are caught as well
    if op.realpath(out_path) in [op.realpath(p) for p in input_files]:
        # the second literal must be an f-string, otherwise the text
        # "{out_path}" is printed verbatim instead of the offending path
        eprint('[wt merge] Error output path is identical '
               f'to one of the input files {out_path}')
        return

    if not delete_or_skip(out_path, args.force):
        return

    # extension of the first input, without its leading character (the dot)
    files_type = splitextgz(input_files[0])[1][1:]

    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
    elif files_type == 'pat.gz':
        MergePats(input_files, args.prefix + '.pat.gz', args.labels,
                  args).merge_pats()
    else:
        print('Unknown input format:', input_files[0])
        return
Exemple #3
0
 def __init__(self, input_file, args):
     """
     Keep the input path and arguments, choose the file-type handler from
     the file suffix, and validate the file.

     A suffix containing 'bed' or 'tsv' selects BedTsv; anything else
     selects PatUnq.
     """
     self.args = args
     self.in_file = input_file
     # extension of the input file, without its leading character
     self.suff = splitextgz(self.in_file)[1][1:]
     if 'bed' in self.suff or 'tsv' in self.suff:
         handler_cls = BedTsv
     else:
         handler_cls = PatUnq
     self.ftype = handler_cls(input_file)
     self.validate_file()
Exemple #4
0
def bed2betas(args):
    """
    Convert every bed file in args.bed_paths to a '.beta' file in args.outdir.

    Each bed is left-merged onto the reference CpG table (on 'chr' and
    'start'), so the output has one row per reference site; sites missing
    from the bed are filled with 0. The 'meth' and 'total' columns are then
    passed through trim_to_uint8 and dumped to the output path.
    Beds whose output should not be overwritten (see delete_or_skip and
    args.force) are skipped. A pandas ParserError stops the conversion with
    an error message.
    """
    # row limit handed to the loaders; only set in debug mode
    nrows = None
    if args.debug:
        nrows = 100000

    ref_table = None  # reference dictionary, loaded lazily and at most once
    try:
        for bed_path in args.bed_paths:
            bed_name = op.basename(bed_path)
            eprint('Converting {}...'.format(bed_name))

            # skip this bed if its output exists and must be kept
            out_name = splitextgz(bed_name)[0]
            outpath = op.join(args.outdir, out_name) + '.beta'
            if not delete_or_skip(outpath, args.force):
                continue

            if ref_table is None:
                ref_table = load_dict(nrows=nrows, genome_name=args.genome)
            bed_table = load_bed(bed_path, nrows, args.genome == 'mm9')

            # align the bed to the reference sites, then dump the counts
            merged = ref_table.merge(bed_table, how='left', on=['chr', 'start'])
            merged = merged.fillna(0)
            counts = np.array(merged[['meth', 'total']])
            trim_to_uint8(counts).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint('Invalid input file.\n{}'.format(e))
Exemple #5
0
def bed2betas(args):
    """
    Convert every bed file in args.bed_paths to a '.beta' file in args.outdir.

    Each bed is left-merged onto the reference CpG table (on 'chr' and
    'start'), so the output has one row per reference site; sites missing
    from the bed are filled with 0. The 'meth' and 'total' columns are then
    passed through trim_to_uint8 and dumped to the output path.
    Beds whose output should not be overwritten (see delete_or_skip and
    args.force) are skipped. A pandas ParserError stops the conversion with
    an error message.
    """
    # debug mode restricts the reference to a fixed region and caps the
    # number of rows handed to the bed loader
    if args.debug:
        region, nrows = 'chr1:10469-876225', 10000
    else:
        region, nrows = None, None

    ref_table = None  # reference dictionary, loaded lazily and at most once
    try:
        for bed_path in args.bed_paths:
            bed_name = op.basename(bed_path)
            eprint(f'[wt bed] Converting {bed_name}...')

            # skip this bed if its output exists and must be kept
            out_name = splitextgz(bed_name)[0] + '.beta'
            outpath = op.join(args.outdir, out_name)
            if not delete_or_skip(outpath, args.force):
                continue

            if ref_table is None:
                ref_table = load_dict_section(region, args.genome)
            bed_table = load_bed(bed_path, nrows, args.add_one)

            # todo: implement in C++.
            # align the bed to the reference sites, then dump the counts
            merged = ref_table.merge(bed_table, how='left', on=['chr', 'start'])
            merged = merged.fillna(0)
            counts = np.array(merged[['meth', 'total']])
            trim_to_uint8(counts).tofile(outpath)

    except pd.errors.ParserError as e:
        eprint(f'[wt bed] Invalid input file.\n{e}')
Exemple #6
0
def homog_process(pat, blocks, args, outdir, prefix):
    """
    Run the homog computation for a single pat file over the given blocks.

    Outputs are <prefix>.uxm (binary) and/or <prefix>.uxm.bed.gz (bed);
    prefix defaults to outdir joined with the base name of the pat file.
    The binary file is written when args.binary is set or when args.bed is
    not set; the bed file is written when args.bed is set.
    Returns the resulting DataFrame, or None when the sample is skipped.
    """
    sample = splitextgz(op.basename(pat))[0]
    if prefix is None:
        prefix = op.join(outdir, sample)
    bin_path = prefix + '.uxm'
    bed_path = prefix + '.uxm.bed.gz'
    bed = args.bed
    # binary output is the fallback when no bed output was requested
    binary = args.binary or not bed
    if should_be_skipped(args.force, bin_path, bed_path, binary, bed):
        homog_log(f'skipping {sample}. Use -f to overwrite')
        return

    # build the rate part of the command line
    rlen = args.rlen
    rate_cmd = f' -l {rlen} -r '
    if args.thresholds:
        rate_cmd += f'0,{args.thresholds},1'
    else:
        # default thresholds are derived from the read length
        low = round(1 - (rlen - 1) / rlen, 3) + 0.001
        high = round((rlen - 1) / rlen, 3)
        rate_cmd += f'0,{low},{high},1 '

    # for a long marker file (>10K markers),
    # parse the whole pat file instead of running "cview -L BED"
    view_full = blocks.shape[0] > 1e4

    df = ctool_wrap(pat, sample, args.blocks_file, rate_cmd, view_full,
                    args.verbose)
    # NOTE(review): the result is first concatenated column-wise with the
    # blocks and then left-merged onto the blocks again - confirm that both
    # steps are intended
    df = pd.concat([blocks.reset_index(drop=True), df], axis=1)
    df = blocks.merge(df, how='left', on=COORDS_COLS5)

    if binary:
        uxm = trim_uxm_to_uint8(df[list('UXM')].values, args.nr_bits)
        uxm.tofile(bin_path)
    if bed:
        df.to_csv(bed_path, sep='\t', header=None, index=None)
    return df
Exemple #7
0
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Convert a single pat file to a beta (or lbeta) file in out_dir.

    The output is named after the pat file, with the suffix '.lbeta' when
    args.lbeta is set and '.beta' otherwise. A '.pat.gz' file with a '.csi'
    index next to it is handled by mult_pat2beta when args.threads > 1;
    otherwise the file is piped through the external pat2beta tool in a
    shell and its output is parsed as whitespace-separated integers.

    Returns the output path, or None when the existing output is kept.
    Raises IllegalArgumentError if pat_path ends with neither '.pat.gz'
    nor '.pat'.
    """
    validate_single_file(pat_path)

    # command used to stream the pat file into the converter
    is_gzipped = pat_path.endswith('.pat.gz')
    if is_gzipped:
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError(f'Invalid pat suffix: {pat_path}')

    if args.lbeta:
        suff = '.lbeta'
    else:
        suff = '.beta'
    base = splitextgz(op.basename(pat_path))[0]
    out_beta = op.join(out_dir, base + suff)
    if not delete_or_skip(out_beta, force):
        return

    if args.threads > 1 and is_gzipped and op.isfile(pat_path + '.csi'):
        arr = mult_pat2beta(pat_path, args)
    else:
        nr_sites = GenomeRefPaths(args.genome).get_nr_sites()
        cmd += f' {pat_path} | {pat2beta_tool} {1} {nr_sites + 1}'
        raw = subprocess.check_output(cmd, shell=True).decode()
        # two integers per row
        arr = np.fromstring(raw, dtype=int, sep=' ').reshape((-1, 2))

    trim_to_uint8(arr, args.lbeta).tofile(out_beta)
    return out_beta
Exemple #8
0
def main(args):
    """
    Visualize every pat file in args.input_files.

    The input list is validated against the '.pat.gz' suffix, the genomic
    region built from args is printed once, and then, for each file, its
    base name (extension removed) and the PatVis results are printed.
    """
    validate_files_list(args.input_files, '.pat.gz')

    region = GenomicRegion(args)
    print(region)
    for path in args.input_files:
        sample_name = splitextgz(op.basename(path))[0]
        print(sample_name)     # print file name
        PatVis(args, path).print_results()
Exemple #9
0
 def __init__(self, input_file, force=True,
              threads=multiprocessing.cpu_count()):
     """
     Keep the input path and options, choose the file-type handler from
     the file suffix, and validate the file.

     A suffix containing 'bed' selects Bed(); anything else selects Pat().
     The default for threads is the CPU count, evaluated once when the
     method is defined.
     """
     self.force = force
     self.threads = threads
     self.in_file = input_file
     # extension of the input file, without its leading character
     self.suff = splitextgz(self.in_file)[1][1:]
     if 'bed' in self.suff:
         self.ftype = Bed()
     else:
         self.ftype = Pat()
     self.validate_file()
Exemple #10
0
def compose_fig_path(unq, outdir, grs):
    """
    Build the path of the figure generated for an unq file.

    The path is outdir joined with the base name of unq (extension
    removed), followed by '.<region_str>' when exactly one region is given,
    and ending with '.png'.
    Returns None when outdir is empty or None.
    """
    if not outdir:
        return
    pieces = [op.join(outdir, op.basename(splitextgz(unq)[0]))]
    # the region is part of the name only when it is unambiguous
    if grs and len(grs) == 1:
        pieces.append('.{}'.format(grs[0].region_str))
    pieces.append('.png')
    return ''.join(pieces)
Exemple #11
0
def filter_existing_files(files, out_dir, lbeta):
    """
    Return the files whose output does not exist yet in out_dir.

    The output of a file is its base name (extension removed) plus '.lbeta'
    when lbeta is set, '.bin' otherwise. Files whose output already exists
    are logged and left out; the order of the remaining files is kept.
    """
    ext = '.lbeta' if lbeta else '.bin'
    pending = []
    for path in files:
        target = op.join(out_dir, splitextgz(op.basename(path))[0]) + ext
        if op.isfile(target):
            b2b_log(f'Skipping {path}. Use -f flag to overwrite')
            continue
        pending.append(path)
    return pending
Exemple #12
0
    def validate_labels(self, labels):
        """
        Return one label per pat file.

        When labels is None, each label is derived from the pat file: its
        base name with the extension removed, cut at the first '-' and
        lower-cased.
        Raises IllegalArgumentError if the number of labels differs from
        self.nr_pats.
        """
        if labels is None:
            labels = []
            for pat in self.pats:
                base = splitextgz(op.basename(pat))[0]
                labels.append(base.split('-')[0].lower())

        if len(labels) == self.nr_pats:
            return labels
        raise IllegalArgumentError('len(labels) != len(files)')
Exemple #13
0
def main(args):
    """
    Visualize every distinct pat file in args.input_files.

    The input list is validated against the '.pat.gz' suffix and duplicated
    entries are dropped (original order kept). The genomic region built
    from args is printed once, and then, for each file, its base name
    (extension removed) and the PatVis results are printed.
    """
    validate_file_list(args.input_files, '.pat.gz')

    # each file is shown once, in the order it was first given
    unique_files = drop_dup_keep_order(args.input_files)

    region = GenomicRegion(args)
    print(region)
    for path in unique_files:
        sample_name = splitextgz(op.basename(path))[0]
        print(sample_name)  # print file name
        PatVis(args, path).print_results()
Exemple #14
0
def plot_hist(data, max_frag_size, pat):
    """
    Plot fragment-length data of a pat file on a new figure.

    data is plotted against the positions 1..data.size. Ticks on the x axis
    start at 1 and stop before max_frag_size: major ticks every 5, minor
    ticks every 1. The title holds the base name of pat with its extension
    removed. The figure is neither shown nor saved here.
    """
    positions = np.arange(1, data.size + 1)

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.plot(positions, data)

    axes.set_xticks(np.arange(1, max_frag_size, 5))
    axes.set_xticks(np.arange(1, max_frag_size, 1), minor=True)

    # minor grid lines are fainter than major ones
    axes.grid(which='minor', alpha=0.2)
    axes.grid(which='major', alpha=0.5)
    plt.ylim(bottom=0)
    plt.xlim(left=1)

    sample_name = op.basename(splitextgz(pat)[0])
    plt.title('Fragment lengths (CpGs)\n' + sample_name)
Exemple #15
0
 def generate_prefix(self, outdir, prefix):
     """
     Return the output prefix of the mix.

     An explicit prefix is returned unchanged, after validating its
     directory part when it has one. Otherwise outdir is validated and the
     prefix is composed inside it as
     <name1>_<rate1>_<name2>_<rate2>..._cov_<dest_cov>[_<region>],
     where the names are the pat base names (extension removed), dest_cov
     is formatted with two decimals, and the region is appended only when
     self.gr.sites is not None.
     """
     if prefix:
         parent = op.dirname(prefix)
         if parent:
             validate_dir(parent)
         return prefix

     validate_dir(outdir)
     # compose output path:
     names = [splitextgz(op.basename(p))[0] for p in self.pats]
     # interleave names and rates: name1, rate1, name2, rate2, ...
     tokens = []
     for name, rate in zip(names, self.dest_rates):
         tokens.append(str(name))
         tokens.append(str(rate))
     res = '_'.join(tokens)
     if self.gr.sites is None:
         region = ''
     else:
         region = '_{}'.format(self.gr.region_str)
     res += '_cov_{:.2f}{}'.format(self.dest_cov, region)
     return op.join(outdir, res)
Exemple #16
0
    def __init__(self, args):
        """
        Set up a mix from the parsed command-line arguments.

        Stores the arguments, genomic region, pat files and target coverage,
        wraps the optional bed file, creates an empty stats table with one
        row per pat file, and then validates labels and rates, reads the
        coverages, adjusts the rates and generates the output prefix.
        The steps run in this order because later ones read attributes set
        by earlier ones.
        """
        print('in mixer')
        self.args = args
        self.gr = GenomicRegion(args)
        self.pats = args.pat_files
        self.dest_cov = args.cov
        if args.bed_file:
            self.bed = BedFileWrap(args.bed_file)
        else:
            self.bed = None
        # stats rows are indexed by pat base name (extension removed)
        sample_names = [splitextgz(op.basename(p))[0] for p in self.pats]
        self.stats = pd.DataFrame(index=sample_names)
        self.nr_pats = len(self.pats)
        self.labels = self.validate_labels(args.labels)

        self.dest_rates = self.validate_rates(args.rates)
        self.covs = self.read_covs()
        self.adj_rates = self.adjust_rates()

        self.prefix = self.generate_prefix(args.out_dir, args.prefix)
Exemple #17
0
    def __init__(self, args):
        """
        Set up a mix from the parsed command-line arguments.

        Stores the arguments, genomic region, pat files and target coverage,
        loads the optional blocks (bed) file, creates an empty stats table
        with one row per pat file, and then validates labels and rates,
        reads the coverages, adjusts the rates and generates the output
        prefix. The steps run in this order because later ones read
        attributes set by earlier ones.
        """
        eprint('mixing...')
        self.args = args
        self.gr = GenomicRegion(args)
        self.pats = args.pat_files
        self.dest_cov = args.cov
        if args.bed_file:
            self.bed = load_blocks_file(args.bed_file)
        else:
            self.bed = None
        # stats rows are indexed by pat base name (extension removed)
        sample_names = [splitextgz(op.basename(p))[0] for p in self.pats]
        self.stats = pd.DataFrame(index=sample_names)
        self.nr_pats = len(self.pats)
        self.labels = self.validate_labels(args.labels)

        self.dest_rates = self.validate_rates(args.rates)
        self.covs = self.read_covs()
        self.adj_rates = self.adjust_rates()

        self.prefix = self.generate_prefix(args.out_dir, args.prefix)
Exemple #18
0
def run_single_unq(unq, grs, args):
    """
    Compute and plot the fragment-length values of a single unq file.

    Without regions the whole file is processed; with regions the
    per-region results are summed element-wise. The values are printed to
    stdout when args.verbose is set, plotted on a new figure, and the
    figure is saved when args.outdir is set.
    """
    eprint(unq)
    if grs:
        per_region = [FragLen(unq, args, gr).run_small_region() for gr in grs]
        lengths = np.sum(per_region, axis=0)
    else:
        # no -L,-s,-r was specified: process the whole unq file
        lengths = FragLen(unq, args).run_whole_genome()

    # print values to stdout, as a single space-separated row
    if args.verbose:
        row = lengths.reshape((1, -1))
        np.savetxt(sys.stdout, row, fmt='%s', delimiter=' ')

    # plot:
    plt.figure()
    plt.plot(np.arange(lengths.size), lengths.flatten())
    plt.title('Fragment lengths\n' + op.basename(splitextgz(unq)[0]))

    # dump figure:
    if args.outdir:
        fig_path = compose_fig_path(unq, args.outdir, grs)
        plt.savefig(fig_path)
Exemple #19
0
def main():
    """
    Visualize wgbs files.

    Possible inputs:
        - a pat.gz file
        - One or more beta files

    The visualizer is chosen by the extension of the first input file:
    '.beta' and '.bin' go to beta_vis_main, '.pat.gz' goes to pat_vis_main.
    Any other extension is reported as unsupported. When args.title is set
    it is printed first.
    """
    args = parse_args()
    extension = splitextgz(args.input_files[0])[1]

    # print title
    if args.title:
        print('{}'.format(args.title))

    if extension == '.pat.gz':
        pat_vis_main(args)
    elif extension in ('.beta', '.bin'):
        beta_vis_main(args)
    else:
        print('Unsupported file type:', extension)
Exemple #20
0
def pat2beta(pat_path, out_dir, args, force=True):
    """
    Convert a single pat file to a '.beta' file in out_dir.

    The output is named after the pat file. A '.pat.gz' file with a '.csi'
    index next to it is handed to mult_pat2beta when args.threads > 1 (its
    return value is returned as is); otherwise the file is piped through
    the external PAT2BETA_TOOL in a shell.

    Returns the output path in the single-process case, or None when the
    existing output is kept.
    Raises IllegalArgumentError if pat_path ends with neither '.pat.gz'
    nor '.pat'.
    """
    validate_single_file(pat_path)

    # command used to stream the pat file into the converter
    is_gzipped = pat_path.endswith('.pat.gz')
    if is_gzipped:
        cmd = 'gunzip -cd'
    elif pat_path.endswith('.pat'):
        cmd = 'cat'
    else:
        raise IllegalArgumentError('Invalid pat suffix: {}'.format(pat_path))

    base = splitextgz(op.basename(pat_path))[0]
    out_beta = op.join(out_dir, base + '.beta')
    if not delete_or_skip(out_beta, force):
        return
    nr_sites = GenomeRefPaths(args.genome).nr_sites

    if args.threads > 1 and is_gzipped and op.isfile(pat_path + '.csi'):
        return mult_pat2beta(pat_path, out_beta, nr_sites, args)

    cmd += ' {} | {} {} {}'.format(pat_path, PAT2BETA_TOOL, out_beta, nr_sites)
    subprocess.check_call(cmd, shell=True)
    return out_beta
Exemple #21
0
def compose_fig_path(pat, outdir):
    """
    Build the path of the figure generated for a pat file.

    The path is outdir joined with the base name of pat (extension
    removed), followed by '.png'. Returns None when outdir is empty or None.
    """
    if not outdir:
        return None
    fig_name = op.basename(splitextgz(pat)[0])
    return op.join(outdir, fig_name) + '.png'