Example #1
def main(arguments=None):
    """Main method"""
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = generate_argparser()
    args = parser.parse_args(args=arguments)
    # HELP MENU
    if args.morehelp:
        modulehelp(MODULENAMES)
        sys.exit()
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.mvf, 'read')
    # Argument Pre-processing
    if args.allele_groups:
        groups = {}
        for elem in args.allele_groups:
            elem = elem.split(':')
            groups[elem[0]] = mvf.get_sample_indices(labels=elem[1].split(','))
        args.allele_groups = groups.copy()
        for grp0, grp1 in combinations(groups, 2):
            if set(groups[grp0]) & set(groups[grp1]):
                raise RuntimeError("Groups contain same element",
                                   set(groups[grp0]) & set(groups[grp1]))
    if args.speciesgroups:
        groups = {}
        for elem in args.speciesgroups:
            elem = elem.split(':')
            groups[elem[0]] = mvf.get_sample_indices(labels=elem[1].split(','))
        args.speciesgroups = groups.copy()
        for specgroup in groups:
            ngroup = 0
            for allelegroup in args.allele_groups.values():
                if set(allelegroup) & set(groups[specgroup]):
                    ngroup += 1
                    if ngroup > 1:
                        raise RuntimeError(specgroup, "split across 2+ groups")
    # MODULES
    if args.module == 'Coverage':
        module = Coverage(params=vars(args))
    elif args.module == 'GroupUniqueAlleleWindow':
        module = GroupUniqueAlleleWindow(params=vars(args))
    elif args.module == 'PiDiversityWindow':
        module = PiDiversityWindow(params=vars(args))
    elif args.module == 'PairwiseNS':
        module = PairwiseNS(params=vars(args))
    else:
        raise RuntimeError("Unknown module: {}".format(args.module))
    # RUN MODULE
    module.analyze(mvf)

    return ''
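A minimal invocation sketch; generate_argparser() is not shown here, so the flag names below (--mvf, --module) are assumptions inferred from the args attributes used above, and the script name is a placeholder.

if __name__ == "__main__":
    # Hypothetical command line, assuming generate_argparser() exposes
    # --mvf and --module options matching the attributes read above:
    #   python mvf_analyze.py --mvf input.mvf --module Coverage
    main()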
Example #2
def main(arguments=None):
    """Main method"""
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = generate_argparser()
    args = parser.parse_args(args=arguments)
    mvf = MultiVariantFile(args.mvf, 'read')
    flavor = mvf.metadata['flavor']
    if (flavor in ("dna", "rna") and args.outdata == "prot") or (
            flavor == "prot" and args.outdata in ("dna", "rna")):
        raise RuntimeError(
            "--outdata {} incompatible with '{}' flavor mvf".format(
                args.outdata, flavor))
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)
    current_contig = ''
    seqs = {}
    for contig, _, allelesets in mvf.iterentries(quiet=args.quiet,
                                                 decode=True):
        if contig != current_contig:
            if seqs:
                # flush the previous contig's sequences under its own label
                with open(
                        "{}.{}.fa".format(
                            args.outprefix,
                            mvf.metadata['contigs'][current_contig]['label']),
                        'wt') as outfile:
                    for seqname in sorted(seqs):
                        outfile.write(">{}\n{}\n".format(
                            seqname, ''.join(seqs[seqname])))
            seqs = {}
            current_contig = contig[:]
        for col, label in zip(sample_cols, labels):
            if label not in seqs:
                seqs[label] = []
            if flavor in ('dna', 'rna'):
                seqs[label].append(
                    'N' if allelesets[0][col] == 'X' else allelesets[0][col])
            elif flavor in ('codon', 'prot') and args.outdata == 'prot':
                seqs[label].append(allelesets[0][col])
            elif flavor == 'codon' and args.outdata == 'dna':
                seqs[label].extend(
                    'N' if allelesets[x][col] == 'X' else allelesets[x][col]
                    for x in (1, 2, 3))
    if seqs:
        # flush the final contig's sequences
        with open(
                "{}.{}.fa".format(args.outprefix,
                                  mvf.metadata['contigs'][current_contig]['label']),
                'wt') as outfile:
            for seqname in sorted(seqs):
                outfile.write(">{}\n{}\n".format(seqname,
                                                 ''.join(seqs[seqname])))
    return ''
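A usage sketch; the argument parser is also not shown for this example, so --mvf, --outprefix, and --outdata are assumptions based on the attributes accessed above, and the script name is a placeholder. The function writes one "<outprefix>.<contig label>.fa" file per contig.

if __name__ == "__main__":
    # Hypothetical invocation (flag names assumed, file names placeholders):
    #   python mvf_to_fasta_contigs.py --mvf input.mvf --outprefix out --outdata dna
    main()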
Example #3
def main(arguments=None):
    """Main method for mvf_filter"""
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(description="""
    Filters and Transforms MVF files""")
    parser.add_argument("--mvf", help="input MVF file")
    parser.add_argument("--out", help="output MVF file")
    parser.add_argument("--actions", nargs='*',
                        help=("set of actions:args to perform,"
                              " note these are done in order as listed"))
    parser.add_argument("--test", help="manually input a line for testing")
    parser.add_argument("--testnchar", type=int,
                        help="total number of samples for test string")
    parser.add_argument("--modulehelp", action="store_true",
                        help="prints full module list and descriptions")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to write at once to MVF")
    parser.add_argument("--verbose", action="store_true",
                        help="report every line (for debugging)")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    time0 = time()
    if args.modulehelp:
        modulehelp()
    if not args.mvf and not args.test:
        raise RuntimeError("No input file specified with --mvf")
    if not args.out and not args.test:
        raise RuntimeError("No output file specified with --outs")
    if not args.actions:
        raise RuntimeError("No --actions specified!")
    ## Establish Input MVF
    if args.test:
        ncol = args.testnchar or len(args.test)
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    ## Create Actionset
    actionset = build_actionset(args.actions, ncol)
    ## TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                else:
                    sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                else:
                    sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if not actionfunc([int(x) for x in loc.split(':')]):
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                else:
                    sys.stdout.write("Location Pass\n")
        if not linefail:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    ## MAIN MODE
    ## Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    ### reprocess header if actions are used that filter columns
    if any(x == y[0] for x in ('columns', 'collapsepriority')
           for y in actionset):
        labels = outmvf.metadata['labels'][:]
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg]
            elif actionname == 'collapsepriority':
                labels = [labels[x] for x in range(len(labels))
                          if x not in actionarg[1:]]
        oldindices = mvf.get_sample_indices(labels)
        newsamples = {}
        for i, _ in enumerate(labels):
            newsamples[i] = mvf.metadata['samples'][oldindices[i]]
        outmvf.metadata['samples'] = newsamples.copy()
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    ## End header editing
    linebuffer = []
    nbuffer = 0
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if not actionfunc([chrom, pos]):
                    linefail = True
            if linefail:
                break
        if not linefail:
            if transformed:
                if linetype == 'full':
                    alleles = mvf.encode(alleles)
                if not alleles:
                    linefail = True
        if not linefail:
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles,)))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.linebuffer:
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    if not args.quiet:
        print("Completed in {} seconds".format(time() - time0))
    return ''
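A usage sketch; --mvf, --out, and --actions are defined by the parser above, but the exact "action:args" encoding is interpreted by build_actionset(), so the action string below is only an illustrative assumption.

if __name__ == "__main__":
    # 'columns:0,1,2' is a hypothetical action string; build_actionset()
    # defines the real syntax.
    #   python mvf_filter.py --mvf input.mvf --out filtered.mvf --actions columns:0,1,2
    main()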
Example #4
def main(arguments=None):
    """Main MVF Chromoplot method"""
    arguments = sys.argv[1:] if arguments is None else arguments
    pallette = Pallette()
    parser = argparse.ArgumentParser(description="""
    Makes chromoplots from MVF format""")
    parser.add_argument("--mvf", help="Input MVF file", required=True)
    parser.add_argument("--outprefix", help="output prefix (not required)")
    parser.add_argument("--samples", nargs='*', required=True,
                        help="3 or more taxa to use for quartets")
    parser.add_argument("--outgroup", nargs='*', required=True,
                        help="1 or more outgroups to use for quartets")
    parser.add_argument("--windowsize", type=int, default=100000)
    parser.add_argument("--contigs", nargs='*',
                        help="""order of contigs/chromosomes
                                defaults to order present in MVF
                                """)
    parser.add_argument("--majority", action="store_true",
                        help="call majority pattern in each window")
    parser.add_argument("--infotrack", action="store_true",
                        help="""additional coverage information track
                                on the bottom""")
    parser.add_argument("--emptymask", choices=pallette.colornames,
                        default="none",
                        help="mask empty regions with color (default=none)")
    parser.add_argument("--yscale", default=20, type=int,
                        help="number of pixels tall for each track")
    parser.add_argument("--xscale", default=1, type=int,
                        help="number of pixels wide for each window")
    parser.add_argument("--colors", nargs=3, choices=pallette.colornames,
                        help="three colors to use for chromoplot")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    if args.colors:
        pallette.basecolors = args.colors
    ## Establish MVF and parse chromosome information
    mvf = MultiVariantFile(args.mvf, 'read')
    contignames = args.contigs or []
    master_contigs = []
    for contigname in contignames:
        contig_found = False
        for contigid in mvf.metadata['contigs']:
            if (contigname == contigid or
                    contigname == mvf.metadata['contigs'][contigid]['label']):
                master_contigs.append((
                    contigid, mvf.metadata['contigs'][contigid]['label'],
                    mvf.metadata['contigs'][contigid]['length']))
                contig_found = True
        if contig_found:
            continue
        raise RuntimeError(contigname, "not found in MVF contig ids or labels")
    quartets = [(x, y, z, outgroup) for x, y, z in
                combinations(args.samples, 3) for outgroup in args.outgroup]
    ## Begin iterations
    for quartet in quartets:
        params = {'contigs': master_contigs[:],
                  'outpath': args.outprefix or '_'.join(quartet) + ".png",
                  'labels': quartet,
                  'windowsize': args.windowsize,
                  'majority': args.majority,
                  'infotrack': args.infotrack,
                  'quiet': args.quiet,
                  'yscale': args.yscale,
                  'xscale': args.xscale}
        chromoplot = Chromoplot(params=params, pallette=pallette)
        quartet_indices = mvf.get_sample_indices(labels=quartet)
        for contig, pos, allelesets in mvf.iterentries(
                subset=quartet_indices, decode=True,
                quiet=args.quiet, contigs=[x[0] for x in master_contigs]):
            alleles = allelesets[0]
            if '-' in alleles:
                site_code = 'gap'
            elif any(x not in 'ATGCatgc' for x in alleles):
                site_code = 'ambiguous'
            elif alleles[3] not in alleles[:3]:
                site_code = 'nonpolar'
            elif len(set(alleles)) > 2:
                site_code = 'triallelic'
            else:
                site_code = sum([2**(3 - j) * (alleles[j] != alleles[3])
                                 for j in range(3)])
            chromoplot.add_data(contig, int(pos // args.windowsize), site_code)
        chromoplot.plot_chromoplot()
        chromoplot.write_total_log()
    return ''
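A usage sketch; --mvf, --samples (three or more taxa), and --outgroup are required by the parser above, and the taxon names and script name here are placeholders.

if __name__ == "__main__":
    #   python mvf_chromoplot.py --mvf input.mvf \
    #       --samples taxonA taxonB taxonC --outgroup taxonD
    main()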
Example #5
def main(arguments=None):
    """Main MVF Treemaker"""
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(
        description="""
    Process MVF into alignment"""
    )
    parser.add_argument("--mvf", help="inputmvf")
    parser.add_argument("--out", help="tree list output file")
    parser.add_argument("--samples", nargs="*", help="one or more taxon labels, default=all")
    parser.add_argument("--raxml_outgroups", nargs="*", help="select outgroups to use in RAxML")
    parser.add_argument(
        "--rootwith",
        nargs="*",
        help="""root output trees with
                                these taxa after RAxML""",
    )
    parser.add_argument("--contigs", nargs="*", help="choose one or more contigs, default=all")
    parser.add_argument("--outputcontiglabels", action="store_true", help="output contig labels instead of ids")
    parser.add_argument("--outputempty", action="store_true", help="output entries of windows with no data")
    parser.add_argument(
        "--hapmode",
        default="none",
        choices=["none", "randomone", "randomboth", "major", "minor", "majorminor"],
        help="""haplotype splitting mode.
                                'none' = no splitting;
                                'randomone' = pick one allele randomly
                                              (recommended);
                                'randomboth = pick alleles randomly,
                                              keep both;
                                'major' = pick the more common allele;
                                'minor' = pick the less common allele;
                                'majorminor' = put the major in 'a' and
                                               minor in 'b'
                            """,
    )
    parser.add_argument(
        "--windowsize",
        type=int,
        default=10000,
        help="""specify genomic region size,
                                or use -1 for whole contig""",
    )
    parser.add_argument("--minsites", type=int, default=100, help="""minimum number of sites [100]""")
    parser.add_argument(
        "--minsitedepth",
        type=int,
        default=1,
        help="""mininum depth of sites to use in alignment
                                [1]""",
    )
    parser.add_argument(
        "--minseqcoverage",
        type=float,
        default=0.1,
        help="""proportion of total alignment a sequence
                                must cover to be retained [0.1]""",
    )
    parser.add_argument("--mindepth", type=int, default=4, help="""minimum number of sequences [4]""")
    parser.add_argument(
        "--bootstrap",
        type=int,
        help="""turn on rapid bootstrapping for RAxML and
                             perform specified number of replicates""",
    )
    parser.add_argument("--raxml_model", default="GTRGAMMA", help="""choose custom RAxML model [GTRGAMMA]""")
    parser.add_argument("--raxmlpath", help="manually specify RAxML path")
    parser.add_argument("--raxmlopts", default="", help="specify additional RAxML arguments")
    parser.add_argument(
        "--duplicateseq",
        default="dontuse",
        choices=["dontuse", "keep", "remove"],
        help="""[dontuse] remove for tree making,
                                replace as zero-branch-length sister taxa;
                                keep=keep in for tree making,
                                may cause errors for RAxML;
                                remove=remove entirely from alignment""",
    )
    parser.add_argument("--tempdir", default="raxmltemp", help="""temporary dir. location default=./tempdir""")
    parser.add_argument("--tempprefix", default="mvftree", help="""temporary file prefix, default=mvftree""")
    parser.add_argument("--quiet", action="store_true", help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true", help="display version information")

    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    ## ESTABLISH FILE OBJECTS
    args.contigs = args.contigs or []
    mvf = MultiVariantFile(args.mvf, "read")
    treefile = OutputFile(
        args.out,
        headers=[
            "contig",
            "windowstart",
            "windowsize",
            "tree",
            "topology",
            "topoid",
            # 'templabels', ### USED FOR DEBUGGING ###
            "alignlength",
            "aligndepth",
            "status",
        ],
    )
    topofile = OutputFile(args.out + ".counts", headers=["rank", "topology", "count"])
    sample_cols = mvf.get_sample_indices(args.samples) if args.samples else []
    if args.tempdir:
        tmpdir = os.path.abspath(args.tempdir)
    else:
        tmpdir = os.path.abspath("./raxmltemp")
    if not os.path.exists(tmpdir):
        os.mkdir(tmpdir)
    os.chdir(tmpdir)
    ## SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_cols)
    if args.hapmode in ["randomboth", "majorminor"]:
        main_labels = [label + x for x in ["a", "b"] for label in main_labels]
    params = {
        "outgroups": args.raxml_outgroups or [],
        "rootwith": args.rootwith or [],
        "minsites": args.minsites,
        "minseqcoverage": args.minseqcoverage,
        "mindepth": args.mindepth,
        "raxmlpath": args.raxmlpath,
        "raxmlopts": args.raxmlopts,
        "duplicateseq": args.duplicateseq,
        "model": args.raxml_model,
        "bootstrap": args.bootstrap,
        "windowsize": args.windowsize,
        "hapmode": args.hapmode,
        "tempdir": tmpdir,
        "tempprefix": args.tempprefix,
    }
    ## WINDOW START ITERATION
    current_contig = ""
    window_start = 0
    window = None
    topo_ids = {}
    topo_counts = {}
    for contig, pos, allelesets in mvf.iterentries(
        contigs=args.contigs,
        subset=sample_cols,
        quiet=args.quiet,
        no_invariant=False,
        no_ambig=False,
        no_gap=False,
        decode=True,
    ):
        if contig != current_contig or (args.windowsize != -1 and (pos > window_start + args.windowsize)):
            if window:
                entry = window.maketree_raxml(params)
                if entry["status"] != "ok":
                    if args.outputempty:
                        treefile.write_entry(entry)
                else:
                    topo = entry["topology"]
                    topo_counts[topo] = topo_counts.get(topo, 0) + 1
                    if topo not in topo_ids:
                        topo_ids[topo] = (
                            max(topo_ids.values()) + 1 if topo_ids else 0)
                    entry["topoid"] = topo_ids[topo]
                    treefile.write_entry(entry)
                window_start = (
                    window_start + args.windowsize
                    if (contig == current_contig and args.windowsize != -1)
                    else 0
                )
            current_contig = contig[:]
            window = None
            window = WindowData(
                window_params={
                    "contigname": (
                        args.outputcontiglabels and mvf.get_contig_label(current_contig) or current_contig[:]
                    ),
                    "windowstart": (args.windowsize == -1 and "-1" or window_start + 0),
                    "windowsize": args.windowsize,
                    "labels": main_labels[:],
                }
            )
        ## ADD ALLELES
        if args.hapmode != "none":
            allelesets[0] = hapsplit(allelesets[0], args.hapmode)
        window.append_alleles(allelesets[0], minsitedepth=args.minsitedepth)
    ## LAST LOOP
    if window is not None:
        entry = window.maketree_raxml(params)
        if entry["status"] != "ok":
            if args.outputempty:
                treefile.write_entry(entry)
        else:
            topo = entry["topology"]
            topo_counts[topo] = topo_counts.get(topo, 0) + 1
            if topo not in topo_ids:
                topo_ids[topo] = (
                    max(topo_ids.values()) + 1 if topo_ids else 0)
            entry["topoid"] = topo_ids[topo]
            treefile.write_entry(entry)
    window = None
    ## END WINDOW ITERATION
    topo_list = sorted([(v, k) for k, v in topo_counts.items()], reverse=True)
    for rank, [value, topo] in enumerate(topo_list):
        topofile.write_entry({"rank": rank, "count": value, "topology": topo})
    return ""
Example #6
def main(arguments=None):
    """Main method for mvf2fasta"""
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(description="""
    Process MVF into FASTA alignment""")
    parser.add_argument("--mvf", help="input MVF file", required=True)
    parser.add_argument("--out", help="target FASTA file", required=True)
    parser.add_argument("--labeltype", choices=['long', 'short'],
                        default='long',
                        help="long labels with all metadata or short ids")
    parser.add_argument("--regions", nargs='*',
                        help="one or more regions id,start,stop (inclusive)")
    parser.add_argument("--samples", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--outgroups", nargs="*")
    parser.add_argument("--contigs", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--buffer", type=int, default=10,
                        help="size (Mbp) of write buffer for each sample")
    parser.add_argument("--tmpdir", default=".",
                        help="directory to write temporary fasta files")
    parser.add_argument("--quiet", action="store_true", default=True,
                        help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.contigs:
        contigs = dict((c, mvf.metadata['contigs'][c]) for c in args.contigs)
    else:
        contigs = dict(mvf.metadata['contigs'])
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)
    current_contig = None
    tmp_files = dict(
        (fn, open(os.path.join(args.tmpdir, fn + '.tmp'), 'w+', args.buffer))
        for fn in labels)
    for contig, _, allelesets in mvf.iterentries(
            contigs=args.contigs, subset=sample_cols,
            quiet=args.quiet, decode=True):
        alleles = allelesets[0]  # entries are already decoded (decode=True)
        if current_contig != contig:
            current_contig = contig
            for col, label in zip(sample_cols, labels):
                if args.labeltype == 'long':
                    tmp_files[label].write(
                        '\n>{} contig={}  length={}\n{}'.format(
                            label,
                            contigs[current_contig]['label'],
                            contigs[current_contig]['length'],
                            alleles[col]))
                elif args.labeltype == 'short':
                    tmp_files[label].write(
                        '\n>{}_{}\n{}'.format(
                            label, contigs[current_contig]['label'],
                            alleles[col]))
        else:
            for col, label in zip(sample_cols, labels):
                tmp_files[label].write(alleles[col])
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff:
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            filehandler.close()
            os.remove(filehandler.name)
    return ''
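A usage sketch; --mvf and --out are required by the parser above, and the remaining values are placeholders.

if __name__ == "__main__":
    #   python mvf2fasta.py --mvf input.mvf --out alignment.fa \
    #       --labeltype short --samples taxonA taxonB
    main()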