def _run_blast_cycle(scraper, shared_blast_folder):
    """Run one full BLAST cycle on *scraper*: search, read results, dedupe, align.

    :param scraper: PhyscraperScrape object to advance by one iteration
    :param shared_blast_folder: path to a shared BLAST folder, or a falsy
        value when each run keeps its own folder
    """
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        # Normalize any falsy value ("" etc.) to None, matching the
        # original code's explicit reset before read_blast_wrapper.
        shared_blast_folder = None
    scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()


def PS_standard_run(data_obj, ids, shared_blast_folder):
    """This is the standard mode for a Physcraper run:
    update aln and tre as long as new seqs are found, no filtering.

    :param data_obj: ATT object
    :param ids: IdDict object
    :param shared_blast_folder: path to folder for shared blast runs
    :return: PS run
    """
    checkpoint = "{}/scrape_checkpoint.p".format(data_obj.workdir)
    if os.path.isfile(checkpoint):
        # Resume a previous run from its pickled checkpoint.
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        with open(checkpoint, 'rb') as fh:  # with-block: don't leak the handle
            scraper = pickle.load(fh)
        scraper.repeat = 1
    else:
        # FIX: the original passed an undefined name `ingroup_mrca` as a
        # third argument (NameError on the fresh-start path); the
        # constructor is called elsewhere as PhyscraperScrape(data_obj, ids).
        scraper = PhyscraperScrape(data_obj, ids)
    # First cycle runs unconditionally, then checkpoint the state.
    _run_blast_cycle(scraper, shared_blast_folder)
    scraper.dump("scrape_checkpoint.p")
    # Keep iterating as long as the previous cycle found new sequences
    # (scraper.repeat is reset by generate_streamed_alignment).
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        _run_blast_cycle(scraper, shared_blast_folder)
        scraper.dump()
    write_out_files(scraper)
    writeinfofiles.get_additional_GB_info(scraper)
    return scraper
def test_filter_length():
    """Smoke-test the select-by-length filtering path on precooked data.

    Loads a pickled ATT data object and accession map, builds a
    PhyscraperScrape, reads precooked BLAST output (no live BLAST run),
    removes identical sequences, and measures sequence counts before and
    after the (currently disabled) filtering step.
    """
    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    # Filtering parameters for the (disabled) filter step below.
    threshold = 2
    blacklist = None
    shared_blast_folder = None

    # Load the precooked test fixtures instead of querying GenBank.
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)

    # Now combine the data, the ids, and the configuration into a single
    # physcraper scrape object.
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # Deliberately NOT running a live BLAST; read precooked results instead.
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()

    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)
    # Filtering step disabled in the original test; re-enable once
    # filter_seqs is available:
    # if threshold is not None:
    #     filteredScrape.filter_seqs()
    length_filtered = len(filteredScrape.new_seqs)
    # TODO(review): this test asserts nothing — length_unfiltered and
    # length_filtered are computed but never compared. Add an assertion
    # once the filtering step above is re-enabled.