def main():
    """Prune a Newick tree to the samples listed in a sample file.

    Command-line arguments:
        treeFile: Newick file whose leaf IDs are matched against the samples.
        sampleFile: text file with one sample ID per line.

    The pruned tree is printed via newick.printTree. Warnings are logged for
    lookup keys that map to more than one sample, and for samples from
    sampleFile that were not matched in the tree.
    """
    parser = argparse.ArgumentParser(
        description=(
            " Read in tree, read in samples from sampleFile, attempt to match "
            "tree's leaf IDs with samples, prune tree to only branches with "
            "leaves found in sampleFile, output pruned tree with sample IDs. "
        )
    )
    parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
    parser.add_argument('sampleFile', help='File with one sample ID per line')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    # Uncomment to capture debug output in a file:
    # logging.basicConfig(level=logging.DEBUG, filename='debug.log')
    tree = newick.parseFile(args.treeFile)
    samples = utils.listFromFile(args.sampleFile)
    idLookup = virusNames.makeIdLookup(samples)
    for key, values in idLookup.items():
        if len(values) != 1:
            logging.warning('Duplicate name/component in %s: %s -> %s',
                            args.sampleFile, key, ", ".join(values))
    # Passed in empty; its size is compared with the sample count after the call.
    foundSampleSet = set()
    tree = newick.treeIntersectIds(tree, idLookup, foundSampleSet,
                                   virusNames.lookupSeqName)
    newick.printTree(tree)
    if len(foundSampleSet) < len(samples):
        logging.warning(
            "%s has %d samples but pruned tree has %d leaves (%d samples not found)",
            args.sampleFile, len(samples), len(foundSampleSet),
            len(samples) - len(foundSampleSet))
        sampleFileNotTree = set(samples) - foundSampleSet
        if sampleFileNotTree:
            # random.sample() needs a sequence (sets are rejected since
            # Python 3.11) and raises ValueError if asked for more items than
            # exist, so convert to a list and cap the example count.
            examples = random.sample(list(sampleFileNotTree),
                                     min(10, len(sampleFileNotTree)))
            logging.warning("Example samples not found:\n%s", "\n".join(examples))
def main():
    """Print a tab-separated sample / lineage / lineageColor table.

    Command-line arguments:
        sampleFile: file containing the sample IDs to report on.
        vcfFile: VCF whose sample IDs are used to build the ID lookup.
        lineageFile: two-column tab-separated file mapping sample to lineage.

    One line is printed per entry of sampleFile; a sample without a lineage
    gets an empty lineage column.
    """
    argParser = argparse.ArgumentParser(
        description=(
            " Read sample names from sampleFile. Read sample IDs that are a "
            "concatenation of EPI ID, sample name and approximate date, for "
            "resolving sampleFile IDs and lineageFile IDs, from a VCF file. "
            "Read lineage assignments from lineageFile. Write out 3 tab-sep "
            "columns: sample, lineage, lineageColor. "
        )
    )
    argParser.add_argument('sampleFile', help='File containing sample IDs')
    argParser.add_argument(
        'vcfFile',
        help='VCF file with genotype columns for the sample samples')
    argParser.add_argument(
        'lineageFile',
        help='Two-column tab-sep file mapping sample to lineage')
    opts = argParser.parse_args()

    sampleNames = utils.listFromFile(opts.sampleFile)
    lookup = virusNames.makeIdLookup(vcf.readSamples(opts.vcfFile))
    rawLineages = utils.dictFromFile(opts.lineageFile)
    # Re-key the lineage table by the resolved ID so it can be joined to samples.
    lineageByResolvedId = {
        virusNames.maybeLookupSeqName(rawName, lookup): lin
        for rawName, lin in rawLineages.items()
    }

    for sampleName in sampleNames:
        resolvedId = virusNames.maybeLookupSeqName(sampleName, lookup)
        # Missing (or otherwise falsy) lineages are reported as an empty string.
        sampleLineage = lineageByResolvedId.get(resolvedId) or ''
        hexColor = "#%06x" % lineageColors.lineageToColor(sampleLineage)
        print('\t'.join((sampleName, sampleLineage, hexColor)))
def main():
    """Prune a Newick tree to the samples present in a VCF file.

    Command-line arguments:
        treeFile: Newick file whose leaf IDs are matched against VCF sample IDs.
        vcfFile: VCF file providing the sample IDs.

    The pruned tree is printed via newick.printTree. Lookup keys that map to
    more than three VCF samples are dropped from the lookup before matching;
    keys mapping to two or three samples are kept but logged as duplicates.
    When some VCF samples are not matched, a warning with examples is logged
    and a VCF restricted to the matched samples is written to
    'intersected.vcf'.
    """
    parser = argparse.ArgumentParser(
        description=(
            " Read in tree, read in samples from VCF, attempt to match tree's "
            "leaf IDs with VCF IDs, prune tree to only branches with leaves "
            "found in VCF, output pruned tree with VCF IDs. "
        )
    )
    parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
    parser.add_argument('vcfFile', help='VCF file with IDs similar to Nextstrain')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    # Uncomment to capture debug output in a file:
    # logging.basicConfig(level=logging.DEBUG, filename='intersect.log')
    tree = newick.parseFile(args.treeFile)
    vcfSamples = vcf.readSamples(args.vcfFile)
    idLookup = virusNames.makeIdLookup(vcfSamples)
    # Collect keys first: entries cannot be deleted while iterating the dict.
    badKeys = []
    for key, values in idLookup.items():
        if len(values) > 3:
            badKeys.append(key)
        elif len(values) != 1:
            logging.warning('Duplicate name/component in VCF: %s -> %s',
                            key, ", ".join(values))
    for key in badKeys:
        del idLookup[key]
    # Passed in empty; its size is compared with the VCF sample count afterwards.
    sampleSet = set()
    tree = newick.treeIntersectIds(tree, idLookup, sampleSet,
                                   virusNames.lookupSeqName)
    newick.printTree(tree)
    if len(sampleSet) < len(vcfSamples):
        logging.warning(
            "VCF has %d samples but pruned tree has %d leaves (%d VCF samples not found)",
            len(vcfSamples), len(sampleSet), len(vcfSamples) - len(sampleSet))
        vcfNotTree = set(vcfSamples) - sampleSet
        if vcfNotTree:
            # random.sample() needs a sequence (sets are rejected since
            # Python 3.11) and raises ValueError if asked for more items than
            # exist, so convert to a list and cap the example count.
            examples = random.sample(list(vcfNotTree), min(10, len(vcfNotTree)))
            logging.warning("Example VCF samples not found:\n%s",
                            "\n".join(examples))
        # NOTE(review): the pruned VCF is written only when some samples were
        # dropped -- confirm this nesting against the original file's layout.
        vcfOutName = 'intersected.vcf'
        logging.warning("Writing VCF to %s", vcfOutName)
        vcf.pruneToSamples(args.vcfFile, sampleSet, vcfOutName)
def main():
    """Write a copy of a VCF restricted to the samples named in a sample file.

    Command-line arguments:
        vcfFile: VCF file with sample genotype columns.
        sampleFile: text file with one sample ID per line.
        vcfOutFile: path of the pruned VCF to write.

    VCF samples for which virusNames.lookupSeqName returns a truthy value are
    kept and passed to vcf.pruneToSamples. Warnings are logged for lookup keys
    that map to more than one sample, and when fewer VCF samples were kept
    than sampleFile lists.
    """
    parser = argparse.ArgumentParser(
        description=(
            " Read in VCF, read in samples from sampleFile, attempt to match "
            "VCF IDs with samples, remove VCF genotype columns for samples not "
            "found in sampleFile, output VCF with updated AC and AN counts. "
        )
    )
    parser.add_argument('vcfFile', help='VCF file with sample genotype columns')
    parser.add_argument('sampleFile', help='File with one sample ID per line')
    parser.add_argument('vcfOutFile',
                        help='VCF output file with only samples in sampleFile')
    args = parser.parse_args()
    samples = utils.listFromFile(args.sampleFile)
    idLookup = virusNames.makeIdLookup(samples)
    for key, values in idLookup.items():
        if len(values) != 1:
            logging.warning('Duplicate name/component in %s: %s -> %s',
                            args.sampleFile, key, ", ".join(values))
    vcfSamples = vcf.readSamples(args.vcfFile)
    foundSampleSet = {sample for sample in vcfSamples
                      if virusNames.lookupSeqName(sample, idLookup)}
    vcf.pruneToSamples(args.vcfFile, foundSampleSet, args.vcfOutFile)
    if len(foundSampleSet) < len(samples):
        logging.warning(
            "%s has %d samples but pruned VCF has %d samples (%d samples not found)",
            args.sampleFile, len(samples), len(foundSampleSet),
            len(samples) - len(foundSampleSet))
        allSampleSet = {virusNames.maybeLookupSeqName(sample, idLookup)
                        for sample in samples}
        sampleFileNotVcf = allSampleSet - foundSampleSet
        if sampleFileNotVcf:
            # random.sample() needs a sequence (sets are rejected since
            # Python 3.11) and raises ValueError if asked for more items than
            # exist, so convert to a list and cap the example count.
            examples = random.sample(list(sampleFileNotVcf),
                                     min(10, len(sampleFileNotVcf)))
            logging.warning("Example samples not found:\n%s", "\n".join(examples))
def main():
    """Print a tab-separated sample / clade / lineage table for a VCF.

    Command-line arguments:
        vcfFile: VCF file from which samples and their clades are read.
        lineageFile: two-column tab-separated file mapping sample to lineage.

    One line is printed per sample returned with a clade; a sample without a
    lineage gets an empty lineage column.
    """
    argParser = argparse.ArgumentParser(
        description=(
            " Read samples and clade assignments from a Nextstrain VCF file. "
            "Read lineage assignments from lineageFile. Write out 3 tab-sep "
            "columns: NS sample ID, clade, lineage. "
        )
    )
    argParser.add_argument('vcfFile', help='VCF file derived from Nextstrain data')
    argParser.add_argument(
        'lineageFile',
        help='Two-column tab-sep file mapping sample to lineage')
    opts = argParser.parse_args()

    sampleList, cladeBySample = nextstrainVcf.readVcfSampleClades(opts.vcfFile)
    lookup = virusNames.makeIdLookup(sampleList)
    rawLineages = utils.dictFromFile(opts.lineageFile)
    # Re-key the lineage table by the resolved ID so it lines up with VCF IDs.
    lineageByResolvedId = {
        virusNames.maybeLookupSeqName(rawName, lookup): lin
        for rawName, lin in rawLineages.items()
    }

    for sampleId, clade in cladeBySample.items():
        # Missing (or otherwise falsy) lineages are reported as an empty string.
        sampleLineage = lineageByResolvedId.get(sampleId) or ''
        print('\t'.join((sampleId, clade, sampleLineage)))
def main():
    """Annotate a Newick tree with lineage information and print it.

    Command-line arguments:
        treeFile: Newick file whose leaf names are matched to lineage IDs.
        lineageFile: two-column tab-separated file mapping sample to lineage.

    Lineage names are re-keyed through an ID lookup built from the tree's leaf
    names, applied to the tree with lineageColors.addLineagesAsBogusLength,
    and the tree is printed via newick.printTree. If that call returns a
    non-zero count, it is logged as the number of samples with no lineage.
    """
    parser = argparse.ArgumentParser(
        description=(
            " Read in tree, read in samples and lineages, attempt to match "
            "tree's leaf IDs with lineage IDs, add colors corresponding to "
            "lineages. "
        )
    )
    parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
    parser.add_argument(
        'lineageFile',
        help='Two-column tab-sep file mapping sample to lineage')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    tree = newick.parseFile(args.treeFile)
    sampleLineages = utils.dictFromFile(args.lineageFile)
    treeNames = newick.leafNames(tree)
    idLookup = virusNames.makeIdLookup(treeNames)
    treeLineages = {
        virusNames.maybeLookupSeqName(name, idLookup): lin
        for name, lin in sampleLineages.items()
    }
    noLinCount = lineageColors.addLineagesAsBogusLength(tree, treeLineages)
    if noLinCount:
        # logging.warn is a deprecated alias that newer Pythons no longer provide.
        logging.warning("%d samples had no lineage in %s",
                        noLinCount, args.lineageFile)
    newick.printTree(tree)
def main():
    """Load a tree, VCF sample IDs and lineages, then hand off to assignColors.

    Command-line arguments:
        treeFile: Newick tree whose leaf labels are sample IDs.
        vcfFile: VCF whose sample IDs are used to build the ID lookup.
        lineageFile: two-column tab-separated file mapping sample to lineage.

    The parsed tree, the ID lookup and the re-keyed lineage table are passed
    to assignColors, which does the remaining work.
    """
    argParser = argparse.ArgumentParser(
        description=(
            " Read tree from Newick treeFile. Read sample IDs that are a "
            "concatenation of EPI ID, sample name and approximate date, for "
            "resolving sampleFile IDs and lineageFile IDs, from a VCF file. "
            "Read lineage assignments from lineageFile. Figure out what "
            "lineage and color (if any) are assigned to each leaf, and then "
            "work back towards root assigning color to each named node whose "
            "descendants all have same color. Write out 3 tab-sep columns: "
            "sampleOrNode, lineage, lineageColor. "
        )
    )
    argParser.add_argument(
        'treeFile',
        help='Newick tree whose leaf labels are sample IDs')
    argParser.add_argument(
        'vcfFile',
        help='VCF file with genotype columns for the sample samples')
    argParser.add_argument(
        'lineageFile',
        help='Two-column tab-sep file mapping sample to lineage')
    opts = argParser.parse_args()

    parsedTree = newick.parseFile(opts.treeFile)
    lookup = virusNames.makeIdLookup(vcf.readSamples(opts.vcfFile))
    rawLineages = utils.dictFromFile(opts.lineageFile)
    # Re-key the lineage table by the resolved ID before coloring the tree.
    lineageByResolvedId = {
        virusNames.maybeLookupSeqName(rawName, lookup): lin
        for rawName, lin in rawLineages.items()
    }
    assignColors(parsedTree, lookup, lineageByResolvedId)