Ejemplo n.º 1
0
    parser.add_argument(
        '--outdir',
        default='data/',
        help='optional, directory to write TreeTime output files')
    parser.add_argument('--ft2bin',
                        default='fasttree2',
                        help='optional, path to fasttree2 binary executable')
    parser.add_argument('--ttbin',
                        default='treetime',
                        help='optional, path to treetime binary executable')
    return parser.parse_args()


if __name__ == '__main__':
    # Script entry point: retrieve genomes, reconstruct a tree with the
    # fasttree2 binary, then a time-scaled tree with the treetime binary.
    # Progress messages are emitted through the Callback instance.
    args = parse_args()
    cb = Callback()

    cb.callback("Retrieving genomes")
    fasta = retrieve_genomes(args.db, ref_file=args.ref, misstol=args.misstol)

    cb.callback("Reconstructing tree with {}".format(args.ft2bin))
    nwk = fasttree(fasta, binpath=args.ft2bin)

    # The message must be formatted BEFORE it is handed to the callback.
    # Previously .format() was applied to the callback's return value, so the
    # literal "{}" was logged and the call fails if the callback returns None.
    cb.callback("Reconstructing time-scaled tree with {}".format(args.ttbin))
    nexus_file = treetime(nwk,
                          fasta,
                          outdir=args.outdir,
                          binpath=args.ttbin,
                          clock=args.clock)

    cb.callback("")
Ejemplo n.º 2
0
    parser.add_argument('--ft2bin', default='fasttree2',
                        help='optional, path to fasttree2 binary executable')
    parser.add_argument('--ttbin', default='treetime',
                        help='optional, path to treetime binary executable')
    parser.add_argument('--lineages', type=str,
                        default=os.path.join(covizu.__path__[0], "data/pango-designation/lineages.csv"),
                        help="optional, path to CSV file containing Pango lineage designations.")

    parser.add_argument('--outfile', default='data/timetree.nwk',
                        help='output, path to write Newick tree string')
    return parser.parse_args()


if __name__ == '__main__':
    # Script entry point: load the per-lineage JSON, then build a
    # {taxon: lineage} lookup from the Pango lineage designation CSV.
    args = parse_args()
    cb = Callback()

    cb.callback("Retrieving genomes")
    with open(args.json) as handle:
        by_lineage = json.load(handle)

    cb.callback("Parsing Pango lineage designations")
    lineages = {}
    # Context manager guarantees the CSV handle is closed, including on the
    # early exit below (the original left the file open).
    with open(args.lineages) as handle:
        # Default of '' makes an empty file fail the header check with the
        # error message below instead of raising StopIteration.
        header = next(handle, '')
        if header != 'taxon,lineage\n':
            cb.callback("Error: {} does not contain expected header row 'taxon,lineage'".format(args.lineages))
            # non-zero status so calling shells/pipelines can detect the failure
            sys.exit(1)
        for line in handle:
            row = line.strip()
            if not row:
                # tolerate blank lines (e.g. trailing newline at end of file)
                continue
            taxon, lineage = row.split(',')
            lineages[taxon] = lineage
    with open(args.ref) as handle:
        reflen = len(seq_utils.convert_fasta(handle)[0][1])

    loader = stream_local(args.infile, args.lineages, minlen=args.minlen,
                          mindate=args.mindate, callback=callback)
    batcher = gisaid_utils.batch_fasta(loader, size=args.batchsize)
    aligned = gisaid_utils.extract_features(batcher, ref_file=args.ref, binpath=args.mmbin,
                                            nthread=args.mmthreads, minlen=args.minlen)
    filtered = gisaid_utils.filter_problematic(aligned, vcf_file=args.vcf, cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point: verify mpirun is available, process the local input
    # into per-lineage data, write it to disk, then build the time-scaled tree.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'], stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`", level='ERROR')
        # exit with a non-zero status; a bare sys.exit() reports success (0)
        # to the calling shell even though the run was aborted
        sys.exit(1)

    by_lineage = process_local(args, cb.callback)
    with open(args.bylineage, 'w') as handle:
        # export to file to process large lineages with MPI
        json.dump(by_lineage, handle)

    # reconstruct time-scaled tree
    timetree, residuals = build_timetree(by_lineage, args, cb.callback)
Ejemplo n.º 4
0
    with open(args.ref) as handle:
        reflen = len(seq_utils.convert_fasta(handle)[0][1])

    loader = stream_local(args.infile, args.pangolineages, minlen=args.minlen,
                          mindate=args.mindate, callback=callback)
    batcher = gisaid_utils.batch_fasta(loader, size=args.batchsize)
    aligned = gisaid_utils.extract_features(batcher, ref_file=args.ref, binpath=args.mmbin,
                                            nthread=args.mmthreads, minlen=args.minlen)
    filtered = gisaid_utils.filter_problematic(aligned, vcf_file=args.vcf, cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point: verify mpirun is available and that the data files
    # provided by the git submodules exist, fetching the submodules if not.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'], stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`", level='ERROR')
        # exit with a non-zero status; a bare sys.exit() reports success (0)
        # to the calling shell even though the run was aborted
        sys.exit(1)

    # check that the user has included submodules
    submodule_files = (
        "data/pango-designation/lineages.csv",
        "data/ProblematicSites_SARS-CoV2/problematic_sites_sarsCov2.vcf",
    )
    if not all(os.path.exists(os.path.join(covizu.__path__[0], relpath))
               for relpath in submodule_files):
        try:
            subprocess.check_call("git submodule init; git submodule update", shell=True)
        except (subprocess.CalledProcessError, OSError):
            # Only failures of the git invocation are handled here; a bare
            # `except:` would also swallow KeyboardInterrupt and SystemExit.
            cb.callback("Error adding the required submodules")
Ejemplo n.º 5
0
    Called by batch.py via subprocess to handle lineages with excessive
    numbers of genomes, to process via MPI
    """
    try:
        from mpi4py import MPI
    except ModuleNotFoundError:
        print("Script requires mpi4py - https://pypi.org/project/mpi4py/")
        sys.exit()

    comm = MPI.COMM_WORLD
    my_rank = comm.Get_rank()
    nprocs = comm.Get_size()

    # command-line execution
    args = parse_args()
    cb = Callback(t0=args.timestamp, my_rank=my_rank, nprocs=nprocs)

    # import lineage data from file
    with open(args.json) as handle:
        recoded = json.load(handle)

    if args.mode == 'deep':
        union, labels, indexed = unpack_recoded(recoded,
                                                args.lineage,
                                                callback=cb.callback)

        # export map of sequence labels to tip indices
        lineage_name = args.lineage.replace('/', '_')  # issue #297

        outfile = os.path.join(args.outdir, '{}.nwk'.format(lineage_name))
        if len(indexed) == 1:
Ejemplo n.º 6
0
    if args.url is None and "GISAID_URL" in os.environ:
        args.url = os.environ["GISAID_URL"]
    if args.user is None and "GISAID_USER" in os.environ:
        args.user = os.environ["GISAID_USER"]
        # otherwise download_feed() will prompt for username
    if args.password is None and "GISAID_PSWD" in os.environ:
        args.password = os.environ["GISAID_PSWD"]
        # otherwise download_feed() will prompt for password

    return args


if __name__ == '__main__':
    # Script entry point: parse the command line, obtain the GISAID feed file
    # (downloading it when no input path was given) and start building the
    # processing chain on top of it.
    args = parse_args()
    cb = Callback()

    cb.callback("Processing GISAID feed data")

    # download xz file if not specified by user
    if args.infile is None:
        args.infile = download_feed(args.url, args.user, args.password)

    # Each stage takes the previous stage's result as its first argument.
    # NOTE(review): presumably these are lazy iterators - confirm in
    # load_gisaid() / batch_fasta().
    loader = load_gisaid(args.infile,
                         minlen=args.minlen,
                         mindate=args.mindate,
                         debug=args.debug)
    batcher = batch_fasta(loader, size=args.batchsize)
    aligned = extract_features(batcher,
                               ref_file=args.ref,
                               binpath=args.binpath,
Ejemplo n.º 7
0
    Called by batch.py via subprocess to handle lineages with excessive
    numbers of genomes, to process via MPI
    """
    try:
        from mpi4py import MPI
    except ModuleNotFoundError:
        print("Script requires mpi4py - https://pypi.org/project/mpi4py/")
        sys.exit()

    comm = MPI.COMM_WORLD
    my_rank = comm.Get_rank()
    nprocs = comm.Get_size()

    # command-line execution
    args = parse_args()
    cb = Callback(t0=args.timestamp, my_rank=my_rank, nprocs=nprocs)

    # import lineage data from file
    cb.callback('loading JSON')
    with open(args.json) as handle:
        by_lineage = json.load(handle)

    records = by_lineage.get(args.lineage, None)
    if records is None:
        cb.callback("ERROR: JSON did not contain lineage {}".format(
            args.lineage))
        sys.exit()

    # generate distance matrices from bootstrap samples [[ MPI ]]
    union, labels, indexed = recode_features(records,
                                             callback=cb.callback,
Ejemplo n.º 8
0
    batcher = gisaid_utils.batch_fasta(loader, size=args.batchsize)
    aligned = gisaid_utils.extract_features(batcher,
                                            ref_file=args.ref,
                                            binpath=args.mmbin,
                                            nthread=args.mmthreads,
                                            minlen=args.minlen)
    filtered = gisaid_utils.filter_problematic(aligned,
                                               vcf_file=args.vcf,
                                               cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point: parse arguments and verify that mpirun is available
    # before doing any further work.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'],
                              stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`",
                    level='ERROR')
        # exit with a non-zero status; a bare sys.exit() reports success (0)
        # to the calling shell even though the run was aborted
        sys.exit(1)

    # check that the user has included submodules
    if (not os.path.exists(
            os.path.join(covizu.__path__[0],
                         "data/pango-designation/lineages.csv")
    ) or not os.path.exists(
Ejemplo n.º 9
0
                        help="Write data to disk for lineages above this "
                        "threshold; otherwise work in RAM.  Override "
                        "with `--threads 1`.")
    parser.add_argument(
        "--cutoff",
        type=float,
        default=0.5,
        help="Bootstrap cutoff for consensus tree (default 0.5). "
        "Only used if --cons is specified.")
    return parser.parse_args()


if __name__ == "__main__":
    # command-line execution
    args = parse_args()
    cb = Callback()

    # lineage classifications come from the database given by args.db
    cb.callback('loading lineage classifications from database')
    lineages = db_utils.dump_lineages(args.db)

    # feature data come from the JSON file; the VCF path is passed through
    # to import_json()
    cb.callback('loading JSON')
    features = import_json(args.json, vcf_file=args.vcf, callback=cb.callback)

    # process each lineage's features separately
    by_lineage = split_by_lineage(features, lineages)
    for lineage, lfeatures in by_lineage.items():
        cb.callback('start {}, {} entries'.format(lineage, len(lfeatures)))

        # calculate symmetric difference matrix and run NJ on bootstrap samples
        filtered = seq_utils.filter_outliers(lfeatures)
        trees, labels = build_trees(filtered,
                                    nboot=args.nboot,
    batcher = gisaid_utils.batch_fasta(loader, size=args.batchsize)
    aligned = gisaid_utils.extract_features(batcher,
                                            ref_file=args.ref,
                                            binpath=args.mmbin,
                                            nthread=args.mmthreads,
                                            minlen=args.minlen)
    filtered = gisaid_utils.filter_problematic(aligned,
                                               vcf_file=args.vcf,
                                               cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point: verify mpirun is available, then make sure an input
    # file exists, downloading it from the GISAID feed when none was given.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'],
                              stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`",
                    level='ERROR')
        # exit with a non-zero status; a bare sys.exit() reports success (0)
        # to the calling shell even though the run was aborted
        sys.exit(1)

    # download xz file if not specified by user
    if args.infile is None:
        cb.callback("No input specified, downloading data from GISAID feed...")
        args.infile = gisaid_utils.download_feed(args.url, args.user,
                                                 args.password)
Ejemplo n.º 11
0
        type=float,
        default=0.5,
        help="Bootstrap cutoff for consensus tree (default 0.5). "
        "Only used if --cons is specified.")

    parser.add_argument("outfile",
                        type=argparse.FileType('w'),
                        default='data/clusters.json',
                        help="output, dest for JSON beadplot file")

    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point: parse the command line, then run the tree-building
    # steps from the treetime module, reporting progress via the Callback.
    args = parse_args()
    cb = Callback()

    # Generate time-scaled tree of Pangolin lineages
    cb.callback("Retrieving lineage genomes")
    fasta = treetime.retrieve_genomes(args.db,
                                      nthread=args.mmthreads,
                                      ref_file=args.ref,
                                      misstol=args.misstol,
                                      callback=cb.callback)

    # tree reconstruction uses the binary at args.ft2bin
    cb.callback("Reconstructing tree with {}".format(args.ft2bin))
    nwk = treetime.fasttree(fasta, binpath=args.ft2bin)

    cb.callback("Reconstructing time-scaled tree with {}".format(args.ttbin))
    nexus_file = treetime.treetime(nwk,
                                   fasta,