Esempio n. 1
0
def main():
    """Prune a Newick tree to the leaves named in a sample file and output it.

    Command-line arguments:
        treeFile: Newick file with IDs similar to Nextstrain.
        sampleFile: file with one sample ID per line.

    The pruned tree is output with newick.printTree.  A warning is logged for
    every lookup key that does not map to exactly one sample, and for samples
    that did not end up in the pruned tree (with up to 10 random examples).
    """
    parser = argparse.ArgumentParser(description="""
Read in tree, read in samples from sampleFile, attempt to match tree's leaf IDs with samples,
prune tree to only branches with leaves found in sampleFile, output pruned tree with sample IDs.
"""
    )
    parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
    parser.add_argument('sampleFile', help='File with one sample ID per line')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    # logging.basicConfig(level=logging.DEBUG, filename='debug.log')
    tree = newick.parseFile(args.treeFile)
    samples = utils.listFromFile(args.sampleFile)
    idLookup = virusNames.makeIdLookup(samples)
    for key in idLookup:
        values = idLookup[key]
        if len(values) != 1:
            logging.warning('Duplicate name/component in %s: %s -> %s',
                            args.sampleFile, key, ", ".join(values))
    # Passed in empty and inspected afterwards, so treeIntersectIds is expected
    # to record the matched sample IDs here.
    foundSampleSet = set()
    tree = newick.treeIntersectIds(tree, idLookup, foundSampleSet, virusNames.lookupSeqName)
    newick.printTree(tree)
    if len(foundSampleSet) < len(samples):
        logging.warning("%s has %d samples but pruned tree has %d leaves (%d samples not found)",
                        args.sampleFile, len(samples), len(foundSampleSet),
                        len(samples) - len(foundSampleSet))
        sampleFileNotTree = set(samples) - foundSampleSet
        if sampleFileNotTree:
            # random.sample needs a sequence (sets are rejected since Python 3.11)
            # and raises ValueError if asked for more items than are available.
            exampleCount = min(10, len(sampleFileNotTree))
            examples = random.sample(sorted(sampleFileNotTree), exampleCount)
            logging.warning("Example samples not found:\n%s", "\n".join(examples))
Esempio n. 2
0
def main():
    """Output the branch of a tree that contains every listed sample.

    Reads the Newick tree and the sample list named on the command line,
    hands both to treeBranchWithSamples, and outputs the branch it returns
    with newick.printTree.
    """
    usage = """
Read in tree, read samples, find branch of tree that has all of the samples as leaves,
and write out that branch as a new tree.  All samples must exactly match leaf names
and all must be found.
"""
    argParser = argparse.ArgumentParser(description=usage)
    argParser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
    argParser.add_argument('sampleFile', help='File with one sample ID per line')
    opts = argParser.parse_args()
    # Raise the recursion ceiling: deeply nested trees overflow Python's default of 1000.
    sys.setrecursionlimit(100000)
    # For debugging: logging.basicConfig(level=logging.DEBUG, filename='debug.log')
    fullTree = newick.parseFile(opts.treeFile)
    wantedSamples = utils.listFromFile(opts.sampleFile)
    newick.printTree(treeBranchWithSamples(fullTree, wantedSamples))
Esempio n. 3
0
def main():
    """Prune a Newick tree to the samples present in a VCF file and output it.

    Command-line arguments:
        treeFile: Newick file with IDs similar to Nextstrain.
        vcfFile: VCF file with IDs similar to Nextstrain.

    Lookup keys that map to more than 3 VCF samples are dropped before the
    tree is intersected; keys that map to 2 or 3 samples only trigger a
    warning.  The pruned tree is output with newick.printTree.  If some VCF
    samples did not end up in the pruned tree, a warning with up to 10 random
    examples is logged and vcf.pruneToSamples is called to produce
    'intersected.vcf' restricted to the samples that were found.
    """
    parser = argparse.ArgumentParser(description="""
Read in tree, read in samples from VCF, attempt to match tree's leaf IDs with VCF IDs,
prune tree to only branches with leaves found in VCF, output pruned tree with VCF IDs.
""")
    parser.add_argument('treeFile',
                        help='Newick file with IDs similar to Nextstrain')
    parser.add_argument('vcfFile',
                        help='VCF file with IDs similar to Nextstrain')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    # logging.basicConfig(level=logging.DEBUG, filename='intersect.log')
    tree = newick.parseFile(args.treeFile)
    vcfSamples = vcf.readSamples(args.vcfFile)
    idLookup = virusNames.makeIdLookup(vcfSamples)
    # Collect keys first; deleting while iterating over idLookup would be unsafe.
    badKeys = []
    for key in idLookup:
        values = idLookup[key]
        if len(values) > 3:
            # NOTE(review): presumably too ambiguous to be a useful identifier -- confirm.
            badKeys.append(key)
        elif len(values) != 1:
            logging.warning('Duplicate name/component in VCF: %s -> %s',
                            key, ", ".join(values))
    for key in badKeys:
        del idLookup[key]
    # Passed in empty and inspected afterwards, so treeIntersectIds is expected
    # to record the matched VCF sample IDs here.
    sampleSet = set()
    tree = newick.treeIntersectIds(tree, idLookup, sampleSet,
                                   virusNames.lookupSeqName)
    newick.printTree(tree)
    if len(sampleSet) < len(vcfSamples):
        logging.warning(
            "VCF has %d samples but pruned tree has %d leaves (%d VCF samples not found)",
            len(vcfSamples), len(sampleSet), len(vcfSamples) - len(sampleSet))
        vcfNotTree = set(vcfSamples) - sampleSet
        if vcfNotTree:
            # random.sample needs a sequence (sets are rejected since Python 3.11)
            # and raises ValueError if asked for more items than are available.
            exampleCount = min(10, len(vcfNotTree))
            examples = random.sample(sorted(vcfNotTree), exampleCount)
            logging.warning("Example VCF samples not found:\n%s", "\n".join(examples))
        vcfOutName = 'intersected.vcf'
        logging.warning("Writing VCF to %s", vcfOutName)
        vcf.pruneToSamples(args.vcfFile, sampleSet, vcfOutName)
Esempio n. 4
0
def main():
    """Attach lineage information to a Newick tree's leaves and output the tree.

    Command-line arguments:
        treeFile: Newick file with IDs similar to Nextstrain.
        lineageFile: two-column tab-separated file mapping sample to lineage.

    Sample names from the lineage file are translated with
    virusNames.maybeLookupSeqName against a lookup built from the tree's leaf
    names, then handed to lineageColors.addLineagesAsBogusLength together with
    the tree.  A non-zero return value from that call is reported as the number
    of samples without a lineage.  The tree is then output with
    newick.printTree.
    """
    parser = argparse.ArgumentParser(description="""
Read in tree, read in samples and lineages, attempt to match tree's leaf IDs with lineage IDs,
add colors corresponding to lineages.
""")
    parser.add_argument('treeFile',
                        help='Newick file with IDs similar to Nextstrain')
    parser.add_argument(
        'lineageFile',
        help='Two-column tab-sep file mapping sample to lineage')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    tree = newick.parseFile(args.treeFile)
    sampleLineages = utils.dictFromFile(args.lineageFile)
    treeNames = newick.leafNames(tree)
    idLookup = virusNames.makeIdLookup(treeNames)
    # Re-key the lineage assignments by the name maybeLookupSeqName returns for each sample.
    treeLineages = {virusNames.maybeLookupSeqName(name, idLookup): lin
                    for name, lin in sampleLineages.items()}
    noLinCount = lineageColors.addLineagesAsBogusLength(tree, treeLineages)
    if noLinCount:
        logging.warning("%d samples had no lineage in %s",
                        noLinCount, args.lineageFile)
    newick.printTree(tree)