Exemple #1
0
def beadplot_serial(lineage, features, args, callback=None):
    """
    Build the beadplot record for one lineage in serial (non-MPI) mode.

    :param lineage:  lineage name, stored under the 'lineage' key of the result
    :param features:  feature data handed to clustering.build_trees()
    :param args:  Namespace; args.boot_cutoff is the consensus tree cutoff
                  (args is also passed through to build_trees)
    :param callback:  optional function, forwarded to build_trees() and
                      beadplot.annotate_tree()
    :return:  beadplot data for this lineage, with a 'lineage' entry added
    """
    # bootstrap sampling and NJ tree reconstruction
    trees, labels = clustering.build_trees(features, args, callback=callback)

    if trees is not None:
        # majority consensus over the bootstrap trees
        consensus_tree = clustering.consensus(iter(trees),
                                              cutoff=args.boot_cutoff)

        # map stringified variant index -> list of sample labels, then
        # collapse polytomies and label internal nodes
        labels_by_index = {str(i): members for i, members in enumerate(labels)}
        annotated = beadplot.annotate_tree(consensus_tree, labels_by_index,
                                           callback=callback)

        # convert to JSON-ready structure
        serialized = beadplot.serialize_tree(annotated)
        serialized.update({'lineage': lineage})
        return serialized

    # No trees: the lineage has a single variant, so there is nothing to
    # draw but one node.  Each label is split on '|' and reversed so the
    # collection date comes first; sorting then puts the earliest sample
    # at the front, and its accession becomes the variant name.
    samples = sorted(label.split('|')[::-1] for label in labels[0])
    variant = samples[0][1]
    beads = [[coldate, accn, label1] for coldate, accn, label1 in samples]
    return {'lineage': lineage, 'nodes': {variant: beads}, 'edges': []}
Exemple #2
0
def make_beadplots(by_lineage, args, callback=None, t0=None):
    """
    Wrapper for beadplot_serial - divert to clustering.py in MPI mode if
    lineage has too many genomes.

    :param by_lineage:  dict, feature vectors stratified by lineage
    :param args:  Namespace, from argparse.ArgumentParser().  Reads
                  mincount, machine_file, bylineage, nboot and boot_cutoff.
    :param callback:  optional function, called with progress messages and
                      forwarded to clustering.consensus()
    :param t0:  float, datetime.timestamp.
    :return:  list, beadplot data by lineage
    :raises subprocess.CalledProcessError:  if the mpirun job exits non-zero
    """
    result = []
    for lineage, features in by_lineage.items():
        if callback:
            callback('start {}, {} entries'.format(lineage, len(features)))

        if len(features) < args.mincount:
            # serial processing
            if len(features) == 0:
                continue  # empty lineage, skip (should never happen)
            beaddict = beadplot_serial(lineage, features, args)
        else:
            # call out to MPI
            cmd = [
                "mpirun",
                "--machinefile",
                args.machine_file,
                "python3",
                "covizu/clustering.py",
                args.bylineage,
                lineage,  # positional arguments <JSON file>, <str>
                "--nboot",
                str(args.nboot),
                "--outdir",
                "data"
            ]
            if t0:
                cmd.extend(["--timestamp", str(t0)])
            subprocess.check_call(cmd)

            # Phylo.parse returns a lazy generator, so the tree file has to
            # stay open until consensus() has consumed it.  The context
            # manager guarantees the handle is closed even when parsing or
            # consensus building raises.
            with open('data/{}.nwk'.format(lineage)) as outfile:
                trees = Phylo.parse(outfile, 'newick')

                # import label map
                with open('data/{}.labels.csv'.format(lineage)) as handle:
                    label_dict = import_labels(handle)

                # generate beadplot data
                ctree = clustering.consensus(trees,
                                             cutoff=args.boot_cutoff,
                                             callback=callback)

            ctree = beadplot.annotate_tree(ctree, label_dict)
            beaddict = beadplot.serialize_tree(ctree)

        beaddict.update({'lineage': lineage})
        result.append(beaddict)

    return result
Exemple #3
0
def _clustering_cmd(args, recode_file, target, mode, t0=None):
    """ Assemble the mpirun command line that runs covizu/clustering.py. """
    cmd = [
        "mpirun",
        "--machinefile",
        args.machine_file,
        "python3",
        "covizu/clustering.py",
        recode_file,
        target,  # positional arguments <JSON file>, <str>
        "--mode",
        mode,
        "--max-variants",
        str(args.max_variants),
        "--nboot",
        str(args.nboot),
        "--outdir",
        args.outdir,
        "--binpath",
        args.binpath  # RapidNJ
    ]
    if t0:
        cmd.extend(["--timestamp", str(t0)])
    return cmd


def _single_variant_beaddict(labels):
    """ Beadplot data for a lineage with one variant (no meaningful tree). """
    # reverse the '|'-separated fields so that sorting orders by the last
    # field of the label first (unpacked below as the collection date)
    intermed = [label.split('|')[::-1] for label in labels]
    intermed.sort()

    # use earliest sample as variant label
    variant = intermed[0][1]
    samples = [[coldate, accn, label1] for coldate, accn, label1 in intermed]
    return {'nodes': {variant: samples}, 'edges': []}


def make_beadplots(by_lineage,
                   args,
                   callback=None,
                   t0=None,
                   txtfile='minor_lineages.txt',
                   recode_file="recoded.json"):
    """
    Recode lineages into variants, run clustering.py under MPI ("flat" mode
    across all minor lineages at once, "deep" mode for each major lineage),
    then convert the resulting trees into beadplot data.

    :param by_lineage:  dict, feature vectors stratified by lineage
    :param args:  Namespace, from argparse.ArgumentParser().  Reads
                  max_variants, mincount, machine_file, nboot, outdir,
                  binpath and boot_cutoff.
    :param callback:  optional function, called with progress messages and
                      forwarded to clustering.consensus()
    :param t0:  float, datetime.timestamp.
    :param txtfile:  str, path to file to write minor lineage names
    :param recode_file:  str, path to JSON file to write recoded lineage data

    :return:  list, beadplot data by lineage
    :raises subprocess.CalledProcessError:  if an mpirun job exits non-zero
    """

    # recode data into variants and serialize
    if callback:
        callback("Recoding features, compressing variants..")
    recoded = {}
    for lineage, records in by_lineage.items():
        union, labels, indexed = clustering.recode_features(
            records, limit=args.max_variants)

        recoded[lineage] = {
            # serialize tuple keys (features of union), #335
            'union': {"{0}|{1}|{2}".format(*feat): idx
                      for feat, idx in union.items()},
            'labels': labels,
            # sets cannot be serialized to JSON, #335
            'indexed': [list(s) for s in indexed]
        }

    with open(recode_file, 'w') as handle:
        json.dump(recoded, handle)

    # partition lineages into major and minor categories
    intermed = [(len(features), lineage)
                for lineage, features in by_lineage.items()
                if len(features) < args.mincount]
    intermed.sort(reverse=True)  # descending order
    # dict rather than set: keeps the size-descending order for the export
    minor = {lineage: None for _, lineage in intermed if lineage is not None}

    # export minor lineages to text file
    with open(txtfile, 'w') as handle:
        for lineage in minor:
            handle.write('{}\n'.format(lineage))

    # launch MPI job across minor lineages
    if callback:
        callback("start MPI on minor lineages")
    subprocess.check_call(
        _clustering_cmd(args, recode_file, txtfile, "flat", t0))

    # process major lineages
    for lineage, features in by_lineage.items():
        if lineage in minor:
            continue

        if callback:
            callback('start {}, {} entries'.format(lineage, len(features)))
        subprocess.check_call(
            _clustering_cmd(args, recode_file, lineage, "deep", t0))

    # parse output files
    if callback:
        callback("Parsing output files")
    result = []
    for lineage, data in recoded.items():
        label_dict = data['labels']

        if len(label_dict) == 1:
            # lineage only has one variant, no meaningful tree; the tree
            # file is not needed, so it is not opened at all
            beaddict = _single_variant_beaddict(label_dict['0'])
        else:
            lineage_name = lineage.replace('/', '_')  # issue #297

            # Read from the directory passed to clustering.py as --outdir
            # (previously hard-coded to 'data/').
            # NOTE(review): assumes clustering.py writes
            # <outdir>/<lineage>.nwk - confirm against that script.
            nwk_path = '{}/{}.nwk'.format(args.outdir, lineage_name)

            # Phylo.parse returns a lazy generator, so the file must stay
            # open until consensus() has consumed it; the context manager
            # closes it even if parsing or consensus building raises.
            with open(nwk_path) as outfile:
                trees = Phylo.parse(outfile, 'newick')
                ctree = clustering.consensus(trees,
                                             cutoff=args.boot_cutoff,
                                             callback=callback)

            ctree = beadplot.annotate_tree(ctree, label_dict)
            beaddict = beadplot.serialize_tree(ctree)

        beaddict.update({'sampled_variants': len(label_dict)})
        beaddict.update({'lineage': lineage})
        result.append(beaddict)

    return result
Exemple #4
0
            intermed.sort()
            variant = intermed[0][1]
            beaddict['nodes'].update({variant: []})
            for coldate, accn, label1 in intermed:
                beaddict['nodes'][variant].append({
                    'accession':
                    accn,
                    'label1':
                    label1,
                    'country':
                    label1.split('/')[1],
                    'coldate':
                    coldate
                })
            result.append(beaddict)
            continue

        # generate majority consensus tree
        ctree = clustering.consensus(trees, cutoff=args.cutoff)

        # collapse polytomies and label internal nodes
        label_dict = dict([(str(idx), lst) for idx, lst in enumerate(labels)])
        ctree = beadplot.annotate_tree(ctree, label_dict)

        # convert to JSON format
        beaddict = beadplot.serialize_tree(ctree)
        beaddict.update({'lineage': lineage})
        result.append(beaddict)

    args.outfile.write(json.dumps(result, indent=2))