Example #1
0
def spa(args):
    """
    %prog spa spafiles

    Convert chromosome ordering from SPA to simple lists. First column is the
    reference order.
    """
    from jcvi.algorithms.graph import merge_paths
    from jcvi.utils.cbook import uniqify

    p = OptionParser(spa.__doc__)
    p.add_option(
        "--unmapped",
        default=False,
        action="store_true",
        help="Include unmapped scaffolds in the list [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    spafiles = args
    paths = []
    mappings = []
    missings = []
    for spafile in spafiles:
        fp = open(spafile)
        path = []
        mapping = []
        missing = []
        for row in fp:
            if row[0] == '#' or not row.strip():
                continue

            atoms = row.rstrip().split('\t')
            if len(atoms) == 2:
                a, c2 = atoms
                assert a == "unmapped"
                missing.append(c2)
                continue

            c1, c2, orientation = atoms
            path.append(c1)
            mapping.append(c2)

        paths.append(uniqify(path))
        mappings.append(mapping)
        missings.append(missing)

    ref = merge_paths(paths)
    print "ref", len(ref), ",".join(ref)
    for spafile, mapping, missing in zip(spafiles, mappings, missings):
        mapping = [x for x in mapping if "random" not in x]
        mapping = uniqify(mapping)
        if len(mapping) < 50 and opts.unmapped:
            mapping = uniqify(mapping + missing)

        print spafile, len(mapping), ",".join(mapping)
Example #2
0
def spa(args):
    """
    %prog spa spafiles

    Convert chromosome ordering from SPA to simple lists. First column is the
    reference order.
    """
    from jcvi.algorithms.graph import merge_paths
    from jcvi.utils.cbook import uniqify

    p = OptionParser(spa.__doc__)
    p.add_option("--unmapped", default=False, action="store_true",
                 help="Include unmapped scaffolds in the list [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    spafiles = args
    paths = []
    mappings = []
    missings = []
    for spafile in spafiles:
        fp = open(spafile)
        path = []
        mapping = []
        missing = []
        for row in fp:
            if row[0] == '#' or not row.strip():
                continue

            atoms = row.rstrip().split('\t')
            if len(atoms) == 2:
                a, c2 = atoms
                assert a == "unmapped"
                missing.append(c2)
                continue

            c1, c2, orientation = atoms
            path.append(c1)
            mapping.append(c2)

        paths.append(uniqify(path))
        mappings.append(mapping)
        missings.append(missing)

    ref = merge_paths(paths)
    print "ref", len(ref), ",".join(ref)
    for spafile, mapping, missing in zip(spafiles, mappings, missings):
        mapping = [x for x in mapping if "random" not in x]
        mapping = uniqify(mapping)
        if len(mapping) < 50 and opts.unmapped:
            mapping = uniqify(mapping + missing)

        print spafile, len(mapping), ",".join(mapping)
Example #3
0
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples, ) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print("\n".join(uids), file=fw)
        fw.close()

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for _ in p.map_async(run_compile, run_args).get():
        continue
Example #4
0
File: str.py Project: ascendo/jcvi
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named",
                 help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print >> fw, "\n".join(uids)
        fw.close()

        # Generate two alleles
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        fw = open("header.ids", "w")
        print >> fw, ",".join(dipuids)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    #run(run_args[0])
    for res in p.map_async(run, run_args).get():
        continue
Example #5
0
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option("--nofilter", default=False, action="store_true",
                 help="Do not filter the variants")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print >> fw, "\n".join(uids)
        fw.close()

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    p = Pool(processes=cpus)
    for res in p.map_async(run_compile, run_args).get():
        continue
Example #6
0
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        print >> fw, "\n".join(uids)
        fw.close()

    p = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    for res in p.map_async(run, run_args).get():
        continue