def test_remove_identical_seqs():
    """Replay precooked BLAST output and check sequence-ingestion counts.

    Loads a pickled data object and accession map, feeds fixed BLAST files
    through ``read_blast_wrapper``, and verifies the expected number of new
    sequences before and after aligning the query sequences.
    """
    # Fix: open the pickle fixtures with context managers so the file
    # handles are closed deterministically (the original leaked them).
    with open("tests/data/precooked/tiny_dataobj.p", "rb") as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    # Pretend BLAST already ran so read_blast_wrapper consumes the fixtures.
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    assert len(scraper.new_seqs) == 0
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 17
    # Now that the full remote sequences are pulled, this fixture contains no
    # identical sequences, so there is nothing to discard here.
    # TODO: find an example where identical sequences occur and must be
    # dropped, and assert uniqueness / non-substring behavior there.
    scraper.data.write_aln()
    scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22
def _check_csv_lines(path, num_lines=5):
    """Skip the header of the CSV at *path*, then assert that each of the
    next *num_lines* lines has at least two comma-separated fields."""
    with open(path) as fh:
        fh.readline()  # header row
        for _ in range(num_lines):
            line = fh.readline()
            assert isinstance(line, str)
            # Bug fix: the original wrote `line.split(",") >= 2`, comparing a
            # list to an int (TypeError on Python 3); compare the field count.
            # It also only checked the final line read; check every line.
            assert len(line.split(",")) >= 2


def test_write_outputinfo():
    """Run the filtered-scrape pipeline and verify that the two CSV files
    produced by ``wrappers.write_out_files`` contain well-formed rows."""
    workdir = "tests/output/test_write_output_files"
    configfi = "tests/data/test.config"
    downtorank = None
    absworkdir = os.path.abspath(workdir)
    fn_otu = os.path.join(absworkdir, "otu_seq_info.csv")
    fn_sampling = os.path.join(absworkdir, "taxon_sampling.csv")
    conf = physcraper.ConfigObj(configfi, interactive=False)
    # Fix: close the pickle fixtures via context managers (originally leaked).
    with open("tests/data/precooked/tiny_dataobj.p", "rb") as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)
    filteredScrape = PhyscraperScrape(data_obj, ids)
    # Pretend BLAST already ran so read_blast_wrapper consumes the fixtures.
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.align_query_seqs()
    wrappers.write_out_files(filteredScrape, downtorank)
    _check_csv_lines(fn_otu)
    _check_csv_lines(fn_sampling)
def test_internal_mpi():
    """Run the papara/RAxML pipeline, then re-run the RAxML search under MPI.

    Requires a SLURM environment (``SLURM_NTASKS_PER_NODE`` and
    ``SLURM_JOB_NUM_NODES`` must be set — the test fails loudly otherwise)
    plus the ``mpiexec`` and ``raxmlHPC-MPI-AVX2`` binaries on PATH.
    """
    import pickle
    import os
    import shutil
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    # NOTE(review): MPI itself is never used below; the import presumably
    # acts as an mpi4py availability check — confirm before removing.
    from mpi4py import MPI  # noqa: F401

    # set up until test
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    # load precooked data (context managers fix the original's leaked handles)
    with open("tests/data/precooked/tiny_dataobj.p", "rb") as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)
    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # Pretend BLAST already ran so read_blast_wrapper consumes the fixtures.
    scraper._blasted = 1

    # run needed functions
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()
    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))

    prev_run = "{}/previous_run".format(scraper.workdir)
    if not os.path.exists(prev_run):
        os.mkdir(prev_run)
    # Bug fix: the original shelled out to `mv` via os.system, which is
    # non-portable and silently ignores failures; shutil.move raises on error.
    shutil.move(
        "{}/papara_alignment.extended".format(scraper.workdir),
        "{}/papara_alignment.extended".format(prev_run))

    # Total MPI ranks = nodes * tasks-per-node as reported by SLURM.
    ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
    nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
    print(nnodes, ntasks)
    env_var = int(nnodes) * int(ntasks)
    print(env_var)
    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(
        scraper.workdir))
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call([
            "mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2",
            "-m", "GTRCAT",
            "-s", "{}/previous_run/papara_alignment.extended".format(
                scraper.workdir),
            "-p", "1", "-f", "a", "-x", "1",
            "-#", "autoMRE",
            "-n", "all{}".format(scraper.date)
        ])