def load_ids_obj(conf, workdir):
    """
    Load the IdDicts object for a working directory, creating it if needed.

    If ``<workdir>/id_pickle.p`` exists it is unpickled and returned.
    Otherwise a new IdDicts is built from ``conf`` and immediately dumped
    to that path.

    :param conf: Config Object of physcraper class
    :param workdir: working directory that holds (or will hold) ``id_pickle.p``
    :return: the reloaded or newly created IdDicts object
    """
    pickle_path = "{}/id_pickle.p".format(workdir)
    if os.path.isfile(pickle_path):
        sys.stdout.write("Reloading id dicts from {}\n".format(workdir))
        # NOTE: unpickling can execute arbitrary code; workdir must be trusted.
        # The context manager closes the handle even if unpickling fails.
        with open(pickle_path, "rb") as pickle_file:
            ids = pickle.load(pickle_file)
    else:
        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, pickle_path)
        ids.dump(pickle_path)
    return ids
def standard_run(study_id, tree_id, seqaln, mattype, workdir, configfi,
                 ingroup_mrca=None, shared_blast_folder=None):
    """Continue a run from a pickled checkpoint, or build and run a new analysis
    for as long as new seqs are found.

    This is the wrapper function to start a PhyScraper run with tree and
    alignment ids from Open Tree of Life.

    If ``<workdir>/att_checkpoint.p`` exists, the data object is reloaded from
    that pickle. Otherwise the alignment is read, a linked
    Alignment-Tree-Taxa object is generated from phylesystem, short sequences
    are pruned, and the object is written out and dumped. The id dictionaries
    are likewise reloaded from ``conf.id_pickle`` when that file exists, or
    built and dumped otherwise. BLAST/alignment rounds are then repeated while
    ``scraper.repeat == 1``.

    :param study_id: study id, passed to generate_ATT_from_phylesystem
    :param tree_id: id of the phylogeny to update, passed to
        generate_ATT_from_phylesystem
    :param seqaln: alignment file, passed as ``path`` to DnaCharacterMatrix.get
    :param mattype: the format name of your alignment (used as ``schema``)
    :param workdir: define where your analysis files shall be stored
    :param configfi: path to your config file
    :param ingroup_mrca: define the mrca, by supplying the Open Tree of Life
        identifier of the clade of interest
    :param shared_blast_folder: not necessary; if you want to share blast
        searches across runs (see documentation), give the path to the folder
        with the shared runs
    :return: the PhyscraperScrape object once no further round is requested
    """
    debug("Debugging mode is on")

    # read the config file into a configuration object (needed on both paths)
    conf = ConfigObj(configfi, interactive=False)

    checkpoint_path = "{}/att_checkpoint.p".format(workdir)
    if os.path.isfile(checkpoint_path):
        sys.stdout.write("Reloading data object from pickle file\n")
        # NOTE: unpickling can execute arbitrary code; workdir must be trusted.
        with open(checkpoint_path, "rb") as checkpoint_file:
            data_obj = pickle.load(checkpoint_file)
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

    # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        # NOTE: same pickle trust caveat as for the checkpoint above.
        with open(conf.id_pickle, "rb") as id_file:
            ids = pickle.load(id_file)
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()

    # Now combine the data and the ids into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)

    # Any falsy value (e.g. "") is treated as "no shared folder" and handed
    # to read_blast_wrapper as None.
    shared_blast_folder = shared_blast_folder or None

    # run the analyses: always one round, then further rounds while the
    # scraper flags that a repeat is needed
    while True:
        if shared_blast_folder:
            # re-applied every round, before the BLAST search is started
            scraper.blast_subdir = shared_blast_folder
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        if scraper.repeat != 1:
            break
        # write the intermediate labelled output and otu table before the next round
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
    return scraper