Esempio n. 1
0
def main():
    """
    Merge files.
    Accumulate all reads / observations from multiple (>=2) input files,
    and output a single file of the same format.
    Formats dispatched below: beta, bin, pat.gz, unq.gz

    The output path is args.prefix followed by the extension of the first
    input file. The input format is checked before the output path is
    handed to delete_or_skip(), so an unsupported input never touches an
    existing output file.
    """
    args = parse_args()

    # validate input files
    input_files = args.input_files
    validate_files_list(input_files, min_len=2)

    # the extension of the first input file selects the merge routine
    # and is also the suffix of the output file
    ext = splitextgz(input_files[0])[1]
    files_type = ext[1:]    # drop the first character (presumably the leading dot)

    # reject unsupported formats before doing anything with the output path
    if files_type not in ('beta', 'bin', 'pat.gz', 'unq.gz'):
        print('Unknown input format:', input_files[0])
        return

    # construct output path
    out_path = args.prefix + ext
    if not delete_or_skip(out_path, args.force):
        return

    if files_type in ('beta', 'bin'):
        merge_betas(input_files, out_path)
    elif files_type == 'pat.gz':
        # MergePats is given the prefix with a '.pat' suffix, not out_path
        MergePats(input_files, args.prefix + '.pat', args.labels, args).merge_pats()
    elif files_type == 'unq.gz':
        # NOTE(review): merge_unqs() is called without the input files or
        # the output path - confirm this is intended and not a stub.
        merge_unqs()
Esempio n. 2
0
def main():
    """
    Collapse beta file to blocks binary file, of the same beta format.

    The first five columns of args.blocks_file are read as
    (chr, sloc, eloc, ssite, esite). Only blocks with ssite < esite are
    kept. One apply_filter_wrapper job per input beta file is submitted to
    a process pool. A job that raises is reported to stderr instead of
    being silently dropped, and the remaining files are still processed.
    """

    args = parse_args()
    files = args.input_files
    validate_files_list(files, '.beta')

    if not op.isfile(args.blocks_file):
        eprint('Invalid blocks file:', args.blocks_file)
        return

    names = ['chr', 'sloc', 'eloc', 'ssite', 'esite']
    df = pd.read_csv(args.blocks_file, sep='\t', usecols=[0, 1, 2, 3, 4], header=None, names=names)

    # blocks whose start site equals their end site are reported as
    # containing no CpGs; select them once for both the count and the dump
    empty_blocks = df[df.ssite == df.esite]
    nr_removed = empty_blocks.shape[0]
    if nr_removed:
        eprint('removed {} regions with no CpGs'.format(nr_removed))

    if args.debug:
        eprint(empty_blocks)

    df = df[df.ssite < df.esite]
    blocks_bins, filtered_indices = get_bins(df)

    # the coordinates sub-frame is the same for every job; build it once
    coords = df[['chr', 'sloc', 'eloc']]

    with Pool() as p:
        jobs = []
        for beta_path in files:
            params = (args, blocks_bins, filtered_indices, beta_path, coords)
            jobs.append((beta_path, p.apply_async(apply_filter_wrapper, params)))
        p.close()
        p.join()

        # apply_async stores a worker's exception in its AsyncResult;
        # without get() the failure would go unnoticed
        for beta_path, job in jobs:
            try:
                job.get()
            except Exception as e:
                eprint('Failed processing {}: {}'.format(beta_path, e))
Esempio n. 3
0
def main(args):
    """
    Print the requested genomic region, then, for every input pat file,
    print its name followed by the PatVis results for that file.
    Input files must have the '.pat.gz' suffix.
    """
    validate_files_list(args.input_files, '.pat.gz')

    print(GenomicRegion(args))
    for pat_path in args.input_files:
        # print file name: first component returned by splitextgz
        # for the path's basename
        base_name = op.basename(pat_path)
        print(splitextgz(base_name)[0])
        visualizer = PatVis(args, pat_path)
        visualizer.print_results()
Esempio n. 4
0
def main():
    """
    Entry point: plot a histogram of the read lengths of unq file[s].
    The histogram values are written to stdout if requested.
    """
    args = parse_args()
    unq_paths = args.unq_paths
    # NOTE(review): the suffix is given without a leading dot, unlike the
    # '.beta' / '.pat.gz' callers - confirm validate_files_list accepts both.
    validate_files_list(unq_paths, 'unq.gz')
    multi_FragLen(args)
Esempio n. 5
0
def main():
    """
    Entry point: convert every input beta file to a bed file.
    All paths are validated as '.beta' files before any conversion starts.
    """
    args = parse_args()
    validate_files_list(args.beta_paths, '.beta')

    # a single converter instance serves all of the input files
    converter = BetaToBigWig(args)
    for beta_path in args.beta_paths:
        converter.run_beta_to_bed(beta_path)
Esempio n. 6
0
def main():
    """
    Entry point: compare pairs of beta files by plotting a 2d histogram
    for every pair.
    Sites with low coverage (< cov_thresh argument) are dropped,
    for performance and robustness.
    At least two '.beta' files are required.
    """
    args = parse_args()
    validate_files_list(args.betas, '.beta', min_len=2)

    betas, min_cov = args.betas, args.min_cov
    # restrict the comparison to the sites of the requested genomic region
    sites = GenomicRegion(args).sites
    compare_all_paires(betas, min_cov, sites)
Esempio n. 7
0
def main():
    """
    Entry point: convert beta file[s] to Illumina-450K format.
    Output: a csv file with ~480K rows (one per Illumina site) and
            one column per beta file.
            Values are in range [0, 1], or NaN.
    """
    args = parse_args()
    beta_files = args.input_files
    validate_files_list(beta_files, '.beta')
    betas2csv(args)
Esempio n. 8
0
def main():
    """
    Entry point: convert bed[.gz] file[s] to beta file[s].
    Expected bed layout (tab-separated columns):
    chr    start    end    meth    total
    """
    # todo: bed or bedGraph?
    args = parse_args()
    bed_files = args.bed_paths
    # no suffix is enforced for the bed inputs
    validate_files_list(bed_files)
    bed2betas(args)
Esempio n. 9
0
def main():
    """
    Entry point: convert every input beta file to a bigwig file.
    Assumes bedGraphToBigWig is installed and in PATH.
    """
    args = parse_args()
    validate_files_list(args.beta_paths, '.beta')

    # a single converter instance serves all of the input files
    converter = BetaToBigWig(args)
    for beta_path in args.beta_paths:
        converter.run_beta_to_bw(beta_path)
Esempio n. 10
0
def main():
    """
    Entry point: mix samples from K different pat files.
    The output is a single mixed pat.gz[.csi] file - sorted, bgzipped and
    indexed - with an informative name.
    At least two pat files are required, and a bed file may not be
    combined with a region or with sites.
    """
    args = parse_args()
    validate_files_list(args.pat_files, 'pat.gz', 2)

    # happy path: no conflict between the bed file and region / sites
    if not (args.bed_file and (args.region or args.sites)):
        mult_mix(args)
        return

    eprint('-L, -s and -r are mutually exclusive')
Esempio n. 11
0
def main(args):
    """
    Validate that the input files are '.beta' files, then construct
    BetaVis from the parsed arguments.
    """
    beta_files = args.input_files
    validate_files_list(beta_files, '.beta')
    # the instance is not kept; presumably the constructor does the work
    BetaVis(args)