Exemple #1
0
def computeFit(options):
    """Run phyloFit on the temporary SS file, then delete that file.

    Reads ``tree``, ``hal``, ``substMod``, ``outMafSS``, ``outMod`` and
    ``error`` from *options*.  The ``--out-root`` handed to phyloFit is
    ``options.outMod`` with its extension stripped.
    """
    # An explicit tree takes precedence over the one stored in the HAL file.
    tree = options.tree if options.tree is not None else getHalTree(options.hal)
    outRoot = os.path.splitext(options.outMod)[0]

    fitCmd = "phyloFit --tree \"%s\" --subst-mod %s --sym-freqs %s --out-root %s" % (
        tree, options.substMod, options.outMafSS, outRoot)
    # Only ask for the confidence-interval file when one was requested.
    wantsErrorFile = options.error is not None
    if wantsErrorFile:
        fitCmd = fitCmd + " --error %s " % options.error

    runShellCommand(fitCmd)
    # The SS file is an intermediate; remove it once the fit has run.
    runShellCommand("rm -f %s" % options.outMafSS)
Exemple #2
0
def main(argv=None):
    """Parse the command line, validate the inputs and train the model.

    After validation, a set of derived attributes (``bedFiles``,
    ``halGenomes``, ``outDir``, ``outName`` and the temporary MAF/SS paths)
    is stored on the parsed namespace, which is then handed to
    ``computeModel``.

    Args:
        argv: Full argument vector with the program name first (same layout
            as ``sys.argv``).  Defaults to ``sys.argv`` when None.

    Raises:
        RuntimeError: if the HAL file or BED path does not exist, the
            substitution model is not recognised, the output path does not
            end in ``.mod``, or the reference / a target genome name is not
            found in the tree text.

    The writability probe on the output path lets whatever error ``open``
    raises propagate unchanged.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compute a neutral substitution model for use with "
        "phyloP or halPhlyoP")

    parser.add_argument("hal", help="input hal")
    parser.add_argument("refGenome", help="Name of reference genome")
    parser.add_argument("bedDir", help="BED file or directory containing BED "
                        "files.  By "
                        "default, these files are interpreted to contain only"
                        " coordinates of coding exons, and fourfold degenerate"
                        " sites will automatically be extracted from them."
                        " To disable this behaviour and train on the entire "
                        " file, use the --no4d option.", default=None)
    parser.add_argument("outMod", help="Path to output model file")
    parser.add_argument("--no4d", help="Do not extract fourfold degenerate"
                        " positions from the input bed files.  Rather use "
                        "all bases they contain.",
                        default=False, action="store_true")
    parser.add_argument("--numProc",
                        help="Maximum number of processes for hal2maf.",
                        type=int, default=1)
    parser.add_argument("--noAncestors",
                        help="Don't write ancestral genomes in hal2maf",
                        action="store_true", default=False)
    parser.add_argument("--maxBedLines",
                        help="Split bed files so they have at most this many"
                        " lines",
                        type=int, default=None)
    parser.add_argument("--sliceSize",
                        help="Slice size for hal2maf.",
                        type=int, default=None)
    parser.add_argument("--tree",
                        help="String describing phylogeny in NEWICK format "
                        "that will be used instead of the tree stored in the"
                        " HAL file.  This tree should contain all the species"
                        " in the alignment. Note that it is best to enclose"
                        " this string in quotes",
                        default=None)
    parser.add_argument("--targetGenomes", default=None, nargs='+',
                        help="space separated list of targetGenomes to pass to "
                        "hal2maf. If used, the tree given to --tree should match.")
    parser.add_argument("--substMod", help="Substitution model for phyloFit"
                        ": valid options are JC69|F81|HKY85|HKY85+Gap|REV|"
                        "SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S",
                        default="SSREV")
    parser.add_argument("--noModFreqs", help="By default, equilibrium "
                        "frequencies for the nucleotides of the trained model"
                        " are corrected with the observed frequencies of "
                        "the reference genome (using the PHAST modFreqs"
                        " tool.  This flag disables this step, and keeps the"
                        " trained frequencies", action="store_true",
                        default=False)
    parser.add_argument("--error", help="File in which to output confidence"
                        " intervals for the parameters in the model",
                        default=None)
    # Honour the argv parameter; argv[0] is the program name.
    args = parser.parse_args(argv[1:])

    # validate inputs
    if not os.path.isfile(args.hal):
        raise RuntimeError("Input hal file %s not found" % args.hal)
    if not os.path.exists(args.bedDir):
        raise RuntimeError("%s not found" % args.bedDir)

    # validate substitution model
    validModels = "JC69|F81|HKY85|HKY85+Gap|REV|SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S".split("|")
    if args.substMod not in validModels:
        raise RuntimeError("Invalid substitution model: %s" % args.substMod)

    # collect BEDs: every regular file in a directory, or the single file
    if os.path.isdir(args.bedDir):
        args.bedFiles = [os.path.join(args.bedDir, f) for f
                         in os.listdir(args.bedDir)
                         if os.path.isfile(os.path.join(args.bedDir, f))]
    else:
        args.bedFiles = [args.bedDir]

    # Check the extension BEFORE touching the path: opening with "w"
    # creates/truncates the file, so a mistyped output name must be rejected
    # first.
    if os.path.splitext(args.outMod)[1] != ".mod":
        raise RuntimeError("Output model must have .mod extension")
    # Writability probe; open() raises on failure, and the handle is closed
    # immediately instead of being leaked.
    with open(args.outMod, "w"):
        pass

    # if targetGenomes is set, use those. Otherwise, extract from HAL
    if args.targetGenomes is not None:
        args.halGenomes = args.targetGenomes
    else:
        args.halGenomes = getHalGenomes(args.hal)

    # Fetch the HAL tree once; it doubles as the default for --tree.
    halTree = getHalTree(args.hal)
    if args.tree is None:
        args.tree = halTree

    # Make sure that all members of halGenomes and tree are in the actual HAL
    # NOTE(review): these are substring tests against the tree text
    # (presumably a NEWICK string from getHalTree), so a name that is
    # a substring of another genome's name also passes -- confirm acceptable.
    if args.refGenome not in halTree:
        raise RuntimeError("Reference genome %s not found." % args.refGenome)
    for targetGenome in args.halGenomes:
        if targetGenome not in halTree:
            raise RuntimeError("Target genome %s not in HAL." % targetGenome)
        if targetGenome not in args.tree:
            raise RuntimeError("Target genome %s not in --tree." % targetGenome)
    args.halGenomes = ','.join(args.halGenomes)

    # Temporary MAF / SS files live next to the output model.
    args.outDir = os.path.dirname(args.outMod)
    args.outName = os.path.splitext(os.path.basename(args.outMod))[0]
    args.outMafName = args.outName + "_halPhyloPTrain_temp.maf"
    args.outMafPath = os.path.join(args.outDir, args.outMafName)
    args.outMafAllPaths = args.outMafPath.replace("_halPhyloPTrain_temp.maf",
                                                  "_halPhyloPTrain_temp*.maf")
    args.outMafSS = args.outMafPath.replace("_halPhyloPTrain_temp.maf",
                                            "_halPhyloPTrain_temp.ss")
    computeModel(args)
Exemple #3
0
def main(argv=None):
    """Parse the command line, validate the inputs and train the model.

    After validation, a set of derived attributes (``bedFiles``,
    ``halGenomes``, ``outDir``, ``outName`` and the temporary MAF/SS paths)
    is stored on the parsed namespace, which is then handed to
    ``computeModel``.  The temporary file names carry a suffix of seven
    random uppercase letters so that two runs do not collide.

    Args:
        argv: Full argument vector with the program name first (same layout
            as ``sys.argv``).  Defaults to ``sys.argv`` when None.

    Raises:
        RuntimeError: if the HAL file or BED path does not exist, the
            substitution model is not recognised, the output path does not
            end in ``.mod``, or the reference / a target genome name is not
            found in the tree text.

    The writability probe on the output path lets whatever error ``open``
    raises propagate unchanged.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compute a neutral substitution model for use with "
        "phyloP or halPhlyoP")

    parser.add_argument("hal", help="input hal")
    parser.add_argument("refGenome", help="Name of reference genome")
    parser.add_argument("bedDir",
                        help="BED file or directory containing BED "
                        "files.  By "
                        "default, these files are interpreted to contain only"
                        " coordinates of coding exons, and fourfold degenerate"
                        " sites will automatically be extracted from them."
                        " To disable this behaviour and train on the entire "
                        " file, use the --no4d option.",
                        default=None)
    parser.add_argument("outMod", help="Path to output model file")
    parser.add_argument("--no4d",
                        help="Do not extract fourfold degenerate"
                        " positions from the input bed files.  Rather use "
                        "all bases they contain.",
                        default=False,
                        action="store_true")
    parser.add_argument("--numProc",
                        help="Maximum number of processes for hal2maf.",
                        type=int,
                        default=1)
    parser.add_argument("--noAncestors",
                        help="Don't write ancestral genomes in hal2maf",
                        action="store_true",
                        default=False)
    parser.add_argument("--maxBedLines",
                        help="Split bed files so they have at most this many"
                        " lines",
                        type=int,
                        default=None)
    parser.add_argument("--tree",
                        help="String describing phylogeny in NEWICK format "
                        "that will be used instead of the tree stored in the"
                        " HAL file.  This tree should contain all the species"
                        " in the alignment. Note that it is best to enclose"
                        " this string in quotes",
                        default=None)
    parser.add_argument(
        "--targetGenomes",
        default=None,
        nargs='+',
        help="space separated list of targetGenomes to pass to "
        "hal2maf. If used, the tree given to --tree should match.")
    parser.add_argument("--substMod",
                        help="Substitution model for phyloFit"
                        ": valid options are JC69|F81|HKY85|HKY85+Gap|REV|"
                        "SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S",
                        default="SSREV")
    parser.add_argument("--noModFreqs",
                        help="By default, equilibrium "
                        "frequencies for the nucleotides of the trained model"
                        " are corrected with the observed frequencies of "
                        "the reference genome (using the PHAST modFreqs"
                        " tool.  This flag disables this step, and keeps the"
                        " trained frequencies",
                        action="store_true",
                        default=False)
    parser.add_argument("--precision",
                        help="Precision to pass to phyloFit (default MED)",
                        choices=["HIGH", "MED", "LOW"],
                        default="MED")
    parser.add_argument("--error",
                        help="File in which to output confidence"
                        " intervals for the parameters in the model",
                        default=None)
    # Honour the argv parameter; argv[0] is the program name.
    args = parser.parse_args(argv[1:])

    # validate inputs
    if not os.path.isfile(args.hal):
        raise RuntimeError("Input hal file %s not found" % args.hal)
    if not os.path.exists(args.bedDir):
        raise RuntimeError("%s not found" % args.bedDir)

    # validate substitution model
    validModels = "JC69|F81|HKY85|HKY85+Gap|REV|SSREV|UNREST|R2|R2S|U2|U2S|R3|R3S|U3|U3S".split(
        "|")
    if args.substMod not in validModels:
        raise RuntimeError("Invalid substitution model: %s" % args.substMod)

    # collect BEDs: every regular file in a directory, or the single file
    if os.path.isdir(args.bedDir):
        args.bedFiles = [
            os.path.join(args.bedDir, f) for f in os.listdir(args.bedDir)
            if os.path.isfile(os.path.join(args.bedDir, f))
        ]
    else:
        args.bedFiles = [args.bedDir]

    # Check the extension BEFORE touching the path: opening with "w"
    # creates/truncates the file, so a mistyped output name must be rejected
    # first.
    if os.path.splitext(args.outMod)[1] != ".mod":
        raise RuntimeError("Output model must have .mod extension")
    # Writability probe; open() raises on failure, and the handle is closed
    # immediately instead of being leaked.
    with open(args.outMod, "w"):
        pass

    # if targetGenomes is set, use those. Otherwise, extract from HAL
    if args.targetGenomes is not None:
        args.halGenomes = args.targetGenomes
    else:
        args.halGenomes = getHalGenomes(args.hal)

    # Fetch the HAL tree once; it doubles as the default for --tree.
    halTree = getHalTree(args.hal)
    if args.tree is None:
        args.tree = halTree

    # Make sure that all members of halGenomes and tree are in the actual HAL
    # NOTE(review): these are substring tests against the tree text
    # (presumably a NEWICK string from getHalTree), so a name that is
    # a substring of another genome's name also passes -- confirm acceptable.
    if args.refGenome not in halTree:
        raise RuntimeError("Reference genome %s not found." % args.refGenome)
    for targetGenome in args.halGenomes:
        if targetGenome not in halTree:
            raise RuntimeError("Target genome %s not in HAL." % targetGenome)
        if targetGenome not in args.tree:
            raise RuntimeError("Target genome %s not in --tree." %
                               targetGenome)
    args.halGenomes = ','.join(args.halGenomes)

    # Temporary MAF / SS files live next to the output model.
    args.outDir = os.path.dirname(args.outMod)
    args.outName = os.path.splitext(os.path.basename(args.outMod))[0]
    # Random suffix so two runs don't collide.  range() rather than xrange()
    # so this also runs on Python 3; for 7 items the result is the same.
    suffix = "".join(
        [random.choice(string.ascii_uppercase) for _ in range(7)])
    args.outMafName = args.outName + "_halPhyloPTrain_temp_%s.maf" % suffix
    args.outMafPath = os.path.join(args.outDir, args.outMafName)
    args.outMafAllPaths = args.outMafPath.replace(
        "_halPhyloPTrain_temp_%s.maf" % suffix,
        "_halPhyloPTrain_temp_%s*.maf" % suffix)
    # replace .maf suffix with .ss
    args.outMafSS = args.outMafPath[:-4] + ".ss"
    computeModel(args)