def beadplot_serial(lineage, features, args, callback=None):
    """ Compute distance matrices and reconstruct NJ trees """
    # bootstrap sampling and NJ tree reconstruction, serial mode
    trees, labels = clustering.build_trees(features, args, callback=callback)
    if trees is None:
        # lineage only has one variant, no meaningful tree
        beaddict = {'lineage': lineage, 'nodes': {}, 'edges': []}

        # use earliest sample as variant label
        intermed = [label.split('|')[::-1] for label in labels[0]]
        intermed.sort()
        variant = intermed[0][1]
        beaddict['nodes'].update({variant: []})
        for coldate, accn, label1 in intermed:
            beaddict['nodes'][variant].append([coldate, accn, label1])
        return beaddict

    # generate majority consensus tree
    ctree = clustering.consensus(iter(trees), cutoff=args.boot_cutoff)

    # collapse polytomies and label internal nodes
    label_dict = dict([(str(idx), lst) for idx, lst in enumerate(labels)])
    ctree = beadplot.annotate_tree(ctree, label_dict, callback=callback)

    # convert to JSON format
    beaddict = beadplot.serialize_tree(ctree)
    beaddict.update({'lineage': lineage})
    return beaddict
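def _demo_beadplot_serial(by_lineage, args):
    # Illustrative sketch only, not part of covizu: drives beadplot_serial()
    # over lineage-stratified feature vectors. Assumes `by_lineage` maps
    # lineage name -> feature vectors, and that `args` carries the attributes
    # referenced above (e.g., args.boot_cutoff, args.nboot).
    results = []
    for lineage, features in by_lineage.items():
        results.append(beadplot_serial(lineage, features, args))
    return results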
help="output, path to file to write JSON, defaults to stdout") parser.add_argument("--boot", action="store_true", help="option, indicates that input file contains bootstrap trees") parser.add_argument("--cutoff", type=float, default=0.5, help="option, if user sets --boot, specifies bootstrap support " "threshold parameter (default 0.5)") parser.add_argument("--minlen", type=float, default=0.5, help="option, minimum branch length. Branches below this cutoff " "are collapsed into polytomies (default 0.5).") return parser.parse_args() if __name__ == "__main__": args = parse_args() if args.boot: trees = Phylo.parse(args.tree, 'newick') ctree = consensus(trees, cutoff=args.cutoff) else: try: ctree = Phylo.read(args.tree, 'newick') except: print("Detected multiple trees in file, handling as bootstrap") trees = Phylo.parse(args.tree, 'newick') ctree = consensus(trees, cutoff=args.cutoff) # sequence labels keyed by integers mapping to tips label_dict = parse_labels(args.labels) tree = annotate_tree(ctree, label_dict, minlen=args.minlen) obj = serialize_tree(tree) args.outfile.write(json.dumps(obj, indent=2))
def make_beadplots(by_lineage, args, callback=None, t0=None):
    """
    Wrapper for beadplot_serial - divert to clustering.py in MPI mode if
    lineage has too many genomes.

    :param by_lineage:  dict, feature vectors stratified by lineage
    :param args:  Namespace, from argparse.ArgumentParser()
    :param t0:  float, datetime.timestamp.
    :return:  list, beadplot data by lineage
    """
    result = []
    for lineage, features in by_lineage.items():
        if callback:
            callback('start {}, {} entries'.format(lineage, len(features)))

        if len(features) < args.mincount:
            # serial processing
            if len(features) == 0:
                continue  # empty lineage, skip (should never happen)
            beaddict = beadplot_serial(lineage, features, args)
        else:
            # call out to MPI
            cmd = [
                "mpirun", "--machinefile", args.machine_file,
                "python3", "covizu/clustering.py",
                args.bylineage, lineage,  # positional arguments <JSON file>, <str>
                "--nboot", str(args.nboot), "--outdir", "data"
            ]
            if t0:
                cmd.extend(["--timestamp", str(t0)])
            subprocess.check_call(cmd)

            # import trees
            outfile = open('data/{}.nwk'.format(lineage))
            trees = Phylo.parse(outfile, 'newick')  # note this returns a generator

            # import label map
            with open('data/{}.labels.csv'.format(lineage)) as handle:
                label_dict = import_labels(handle)

            # generate beadplot data
            ctree = clustering.consensus(trees, cutoff=args.boot_cutoff, callback=callback)
            outfile.close()  # done with Phylo.parse generator

            ctree = beadplot.annotate_tree(ctree, label_dict)
            beaddict = beadplot.serialize_tree(ctree)
            beaddict.update({'lineage': lineage})

        result.append(beaddict)
    return result
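# For reference, the subprocess call above assembles a command line like the
# following (host file, lineage name, and timestamp values are hypothetical):
#
#   mpirun --machinefile hosts.txt python3 covizu/clustering.py \
#       by_lineage.json B.1.1.7 --nboot 100 --outdir data --timestamp 1609459200.0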
def make_beadplots(by_lineage, args, callback=None, t0=None,
                   txtfile='minor_lineages.txt', recode_file="recoded.json"):
    """
    Wrapper for beadplot_serial - divert to clustering.py in MPI mode if
    lineage has too many genomes.

    :param by_lineage:  dict, feature vectors stratified by lineage
    :param args:  Namespace, from argparse.ArgumentParser()
    :param t0:  float, datetime.timestamp.
    :param txtfile:  str, path to file to write minor lineage names
    :param recode_file:  str, path to JSON file to write recoded lineage data
    :return:  list, beadplot data by lineage
    """
    # recode data into variants and serialize
    if callback:
        callback("Recoding features, compressing variants..")
    recoded = {}
    for lineage, records in by_lineage.items():
        union, labels, indexed = clustering.recode_features(records, limit=args.max_variants)
        # serialize tuple keys (features of union), #335
        union = dict([("{0}|{1}|{2}".format(*feat), idx) for feat, idx in union.items()])
        indexed = [list(s) for s in indexed]  # sets cannot be serialized to JSON, #335
        recoded.update({lineage: {'union': union, 'labels': labels, 'indexed': indexed}})

    with open(recode_file, 'w') as handle:
        json.dump(recoded, handle)

    # partition lineages into major and minor categories
    intermed = [(len(features), lineage) for lineage, features in by_lineage.items()
                if len(features) < args.mincount]
    intermed.sort(reverse=True)  # descending order
    minor = dict([(lineage, None) for _, lineage in intermed if lineage is not None])

    # export minor lineages to text file
    with open(txtfile, 'w') as handle:
        for lineage in minor:
            handle.write('{}\n'.format(lineage))

    # launch MPI job across minor lineages
    if callback:
        callback("start MPI on minor lineages")
    cmd = [
        "mpirun", "--machinefile", args.machine_file,
        "python3", "covizu/clustering.py",
        recode_file, txtfile,  # positional arguments <JSON file>, <str>
        "--mode", "flat", "--max-variants", str(args.max_variants),
        "--nboot", str(args.nboot), "--outdir", args.outdir,
        "--binpath", args.binpath  # RapidNJ
    ]
    if t0:
        cmd.extend(["--timestamp", str(t0)])
    subprocess.check_call(cmd)

    # process major lineages
    for lineage, features in by_lineage.items():
        if lineage in minor:
            continue
        if callback:
            callback('start {}, {} entries'.format(lineage, len(features)))
        cmd = [
            "mpirun", "--machinefile", args.machine_file,
            "python3", "covizu/clustering.py",
            recode_file, lineage,  # positional arguments <JSON file>, <str>
            "--mode", "deep", "--max-variants", str(args.max_variants),
            "--nboot", str(args.nboot), "--outdir", args.outdir,
            "--binpath", args.binpath
        ]
        if t0:
            cmd.extend(["--timestamp", str(t0)])
        subprocess.check_call(cmd)

    # parse output files
    if callback:
        callback("Parsing output files")
    result = []
    for lineage in recoded:
        # import trees
        lineage_name = lineage.replace('/', '_')  # issue #297
        outfile = open('data/{}.nwk'.format(lineage_name))
        trees = Phylo.parse(outfile, 'newick')  # note this returns a generator

        label_dict = recoded[lineage]['labels']
        if len(label_dict) == 1:
            # lineage only has one variant, no meaningful tree
            outfile.close()  # tree file is not needed in this case
            beaddict = {'nodes': {}, 'edges': []}

            # use earliest sample as variant label
            intermed = [label.split('|')[::-1] for label in label_dict['0']]
            intermed.sort()
            variant = intermed[0][1]
            beaddict['nodes'].update({variant: []})
            for coldate, accn, label1 in intermed:
                beaddict['nodes'][variant].append([coldate, accn, label1])
        else:
            # generate beadplot data
            ctree = clustering.consensus(trees, cutoff=args.boot_cutoff, callback=callback)
            outfile.close()  # done with Phylo.parse generator
            ctree = beadplot.annotate_tree(ctree, label_dict)
            beaddict = beadplot.serialize_tree(ctree)

        beaddict.update({'sampled_variants': len(label_dict)})
        beaddict.update({'lineage': lineage})
        result.append(beaddict)

    return result
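# Shape of the JSON written to recode_file, reconstructed from the recoding
# loop above; all keys and values are illustrative (feature tuples are joined
# into "field|field|field" strings, sample labels follow the
# "label1|accession|coldate" order implied by the split above):
#
#   {
#     "B.1.1.7": {
#       "union": {"~|3037|T": 0, "-|11288|9": 1},
#       "labels": {"0": ["hCoV-19/Canada/ON-1234/2021|EPI_ISL_123456|2021-01-01"]},
#       "indexed": [[0, 1]]
#     }
#   }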
            intermed.sort()
            variant = intermed[0][1]
            beaddict['nodes'].update({variant: []})
            for coldate, accn, label1 in intermed:
                beaddict['nodes'][variant].append({
                    'accession': accn,
                    'label1': label1,
                    'country': label1.split('/')[1],
                    'coldate': coldate
                })
            result.append(beaddict)
            continue

        # generate majority consensus tree
        ctree = clustering.consensus(trees, cutoff=args.cutoff)

        # collapse polytomies and label internal nodes
        label_dict = dict([(str(idx), lst) for idx, lst in enumerate(labels)])
        ctree = beadplot.annotate_tree(ctree, label_dict)

        # convert to JSON format
        beaddict = beadplot.serialize_tree(ctree)
        beaddict.update({'lineage': lineage})
        result.append(beaddict)

    args.outfile.write(json.dumps(result, indent=2))
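# Note the node records here are dicts rather than the [coldate, accn, label1]
# lists used elsewhere; the 'country' field assumes labels of the form
# "hCoV-19/Country/ID/Year", e.g. (hypothetical):
#
#   label1 = "hCoV-19/Canada/ON-1234/2021"
#   label1.split('/')[1]  # -> "Canada"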