Example #1
0
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    lhome = opts.lobstr_home
    store = opts.output_path

    # A single VCF path is handled directly; anything else is treated as a
    # list file containing one VCF path per line.
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [line.strip() for line in must_open(samples)]

    # Skip files already produced by a previous filtering run
    vcffiles = [vcf for vcf in vcffiles if ".filtered." not in vcf]

    # The AWS store is forwarded only for S3-hosted inputs (False otherwise)
    run_args = [
        (vcf, lhome, store if vcf.startswith("s3://") else False)
        for vcf in vcffiles
    ]
    workers = min(opts.cpus, len(run_args))
    pool = Pool(processes=workers)
    # .get() blocks until every worker has finished
    pool.map_async(run_filter, run_args).get()
Example #2
0
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.
    """
    p = OptionParser(filterdata.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)
    # Loci to drop and loci to keep; together they must account for every locus
    with open(remove) as fp:
        remove = [x.strip() for x in fp]
    removes = set(remove)
    with open(final) as fp:
        final = [x.strip() for x in fp]
    assert len(loci) == len(remove) + len(final)

    # Precompute the allele-count percentile for each locus name
    percentiles = {}
    with open(af) as fp:
        for row in fp:
            sname, counts = row.split()
            countsd = af_to_counts(counts)
            percentiles[sname] = counts_to_percentile(countsd)

    # One task per retained locus: (column index, column data, percentile)
    run_args = []
    for i, sname in enumerate(loci):
        if sname in removes:
            continue
        run_args.append((i, m[:, i], percentiles[sname]))

    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    # Sort by column index so output columns line up with `final`
    res = sorted(pool.map_async(convert_to_percentile, run_args).get())

    # Write mask (P-value) matrix: rows = samples, columns = retained loci
    _, pvalues = zip(*res)
    mask = np.vstack(pvalues).T
    write_csv("final.mask.tsv", mask, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # .values replaces df.as_matrix(), which was removed in pandas 1.0
    m = df.values
    m[m < 0] = -1  # normalize all negative (missing) entries to -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
Example #3
0
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples = args[0]
    lhome = opts.lobstr_home
    store = opts.output_path

    # Direct VCF input vs. a file listing one VCF per line
    is_vcf = samples.endswith((".vcf", ".vcf.gz"))
    if is_vcf:
        vcffiles = [samples]
    else:
        vcffiles = [row.strip() for row in must_open(samples)]

    # Drop outputs of previous filtering runs
    vcffiles = [f for f in vcffiles if ".filtered." not in f]

    # store is only passed along for inputs living on S3 (False otherwise)
    run_args = [(f, lhome, f.startswith("s3://") and store) for f in vcffiles]
    pool = Pool(processes=min(opts.cpus, len(run_args)))
    for _ in pool.map_async(run_filter, run_args).get():
        pass
Example #4
0
File: str.py Project: qiao-xin/jcvi
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.
    """
    p = OptionParser(filterdata.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)
    # IDs to drop vs. IDs to keep; together they must cover every locus
    with open(remove) as fh:
        remove = [x.strip() for x in fh]
    removes = set(remove)
    with open(final) as fh:
        final = [x.strip() for x in fh]
    assert len(loci) == len(remove) + len(final)

    # Map each locus name to its allele-count percentile
    percentiles = {}
    with open(af) as fh:
        for row in fh:
            sname, counts = row.split()
            countsd = af_to_counts(counts)
            percentiles[sname] = counts_to_percentile(countsd)

    # Build one worker task per retained locus
    run_args = []
    for i, sname in enumerate(loci):
        if sname in removes:
            continue
        a = m[:, i]
        run_args.append((i, a, percentiles[sname]))

    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    res = pool.map_async(convert_to_percentile, run_args).get()
    # Sort by column index so the mask columns match `final`
    res.sort()

    # Write mask (P-value) matrix
    _, pvalues = zip(*res)
    mask = np.vstack(pvalues).T
    write_csv("final.mask.tsv", mask, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # df.as_matrix() was removed in pandas 1.0; .values is the
    # backward-compatible replacement
    m = df.values
    m[m < 0] = -1  # normalize all negative (missing) entries to -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
Example #5
0
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option("--nofilter", default=False, action="store_true",
                 help="Do not filter the variants")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples = args[0]
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    # Resolve samples relative to the starting directory before chdir
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # A single VCF is used directly; otherwise read one VCF path per line
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [line.strip() for line in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect STR ids from all requested lobSTR databases, then dedupe
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            print("\n".join(uids), file=fw)

    # One task per VCF: (path, apply_filter, cleanup, AWS store)
    run_args = [(vcf, filtered, cleanup, store) for vcf in vcffiles]
    pool = Pool(processes=min(opts.cpus, len(run_args)))
    for _ in pool.map_async(run_compile, run_args).get():
        pass
Example #6
0
File: str.py Project: ascendo/jcvi
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named",
                 help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    # samples is a list file with one VCF path per line
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect STR locus ids from every requested lobSTR db, then dedupe
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # NOTE: Python 2 print-to-file syntax
        print >> fw, "\n".join(uids)
        fw.close()

        # Generate two alleles
        # Each diploid locus contributes two columns: <id>.1 and <id>.2
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        fw = open("header.ids", "w")
        print >> fw, ",".join(dipuids)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    #run(run_args[0])
    # Drain results to block until every worker finishes
    for res in p.map_async(run, run_args).get():
        continue
Example #7
0
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option("--nofilter", default=False, action="store_true",
                 help="Do not filter the variants")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    # Remember the starting directory so the samples path stays valid
    # after chdir into the work directory
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # A single VCF is used directly; otherwise samples is a list file
    # with one VCF path per line
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect STR ids from every requested lobSTR db, then dedupe
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # NOTE: Python 2 print-to-file syntax
        print >> fw, "\n".join(uids)
        fw.close()

    # One task per VCF: (path, apply_filter, cleanup, AWS store)
    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    # Drain results to block until every worker finishes
    for res in p.map_async(run_compile, run_args).get():
        continue
Example #8
0
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")
    # Remember the starting directory so the samples path stays valid
    # after chdir into the work directory
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    # samples is a list file with one VCF path per line
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Collect STR ids from every requested lobSTR db, then dedupe
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        # NOTE: Python 2 print-to-file syntax
        print >> fw, "\n".join(uids)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    # Drain results to block until every worker finishes
    for res in p.map_async(run, run_args).get():
        continue
Example #9
0
def compare(args):
    """
    %prog compare NA12878_array_hg38.bed *.seg

    Compare cnv output to known ground truths.
    """
    p = OptionParser(compare.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    truths, cnvoutputs = args[0], args[1:]
    # One worker per output file, capped at the requested CPU count
    workers = min(len(cnvoutputs), opts.cpus)
    pool = Pool(processes=workers)
    collected = []
    tasks = [(output, truths) for output in cnvoutputs]
    # callback appends the completed result list once the map finishes
    async_result = pool.map_async(compare_worker, tasks, callback=collected.append)
    async_result.wait()

    for res in collected:
        print("\n".join(res))
Example #10
0
File: cnv.py Project: xuanblo/jcvi
def compare(args):
    """
    %prog compare NA12878_array_hg38.bed *.seg

    Compare cnv output to known ground truths.
    """
    p = OptionParser(compare.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    truths = args[0]  # ground-truth BED file
    cnvoutputs = args[1:]  # one or more CNV caller outputs
    # Cap the worker count at the number of files to compare
    cpus = min(len(cnvoutputs), opts.cpus)
    p = Pool(processes=cpus)
    results = []
    files = [(x, truths) for x in cnvoutputs]
    # callback appends the completed result list once the map finishes
    r = p.map_async(compare_worker, files, callback=results.append)
    r.wait()

    for res in results:
        # NOTE: Python 2 print statement
        print "\n".join(res)