Ejemplo n.º 1
0
def PS_standard_run(data_obj, ids, shared_blast_folder):
    """
    This is the standard mode for a Physcraper run:
    update aln and tre as long as new seqs are found, no filtering.

    :param data_obj: ATT object
    :param ids: IdDict object
    :param shared_blast_folder: path to folder for shared blast runs
    :return: PS run
    """
    if os.path.isfile("{}/scrape_checkpoint.p".format(data_obj.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        scraper = pickle.load(
            open("{}/scrape_checkpoint.p".format(data_obj.workdir), 'rb'))
        scraper.repeat = 1
    else:
        scraper = PhyscraperScrape(data_obj, ids, ingroup_mrca)
        # run the analyses
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump("scrape_checkpoint.p")
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump()
        write_out_files(scraper)
    writeinfofiles.get_additional_GB_info(scraper)
    return scraper
Ejemplo n.º 2
0
def test_add_local():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist


    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)

        # filteredScrape.make_otu_dict_entry_unpubl()
        filteredScrape.run_blast_wrapper()
        filteredScrape.read_blast_wrapper()
        filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True
Ejemplo n.º 3
0
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """looks for a json file to continue run, or builds and runs
    new analysis for as long as new seqs are found

    This is the wrapper function to start a PhyScraper run with tree and alignment ids from Open Tree of Life.
    You need:
         seqaln = ID of alignment file
         mattype = the format name of you alignment
         trfn = Id of phylogeny to update
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life identifier of the clade of interest

         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    debug("Debugging mode is on")

    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
#        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Mapping identifiers between OpenTree and NCBI requires and identifier dict object
        # ids = IdDicts(conf, workdir="example")
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        # Mapping identifiers between OpenTree and NCBI requires and identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # scraper.write_otu_info()

    return scraper
Ejemplo n.º 4
0
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of you alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         sp_info_jsonfi = a json file which has the otu_dict stored, which is generated by the OtuJsonDict function
                            (usually, just leave it like it is in the example scripts.).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                        can be obtained bu running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """

    debug("Debugging mode is on")

    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln, 
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast_wrapper(delay=14)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1