def test_run_raxml():
    workdir = "tests/output/test_run_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    # load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    # scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    # scraper.align_query_seqs()
    # scraper.place_query_seqs()
    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date))

def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.config.blast_loc = 'remote'
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    a = len(scraper.new_seqs) == 40
    b = len(scraper.data.aln) == 5
    c = len(scraper.new_seqs_otu_id) == 0
    scraper.remove_identical_seqs()
    d = len(scraper.new_seqs) == 40
    e = len(scraper.data.aln) == 5
    f = len(scraper.new_seqs_otu_id) == 38
    g = 1
    for taxon in scraper.data.tre.taxon_namespace:
        h = taxon.label in scraper.data.otu_dict
        g = g * h
        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
        i = status in ('original', 'query')
        g = g * i

    # Second part checks that the seq_len_perc setting affects the results
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))  # reload because the data object is mutable
    data_obj.workdir = absworkdir
    scraper2 = PhyscraperScrape(data_obj, ids)
    scraper2.config.blast_loc = 'remote'
    scraper2.ids.otu_rank = {}
    scraper2.config.gifilename = False
    j = len(scraper2.data.aln) == 5
    # scraper2.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper2.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scraper2.config.seq_len_perc = 0.998  # change seq length percentage from the default of 75%
    k = len(scraper2.new_seqs) == 40
    l = len(scraper2.new_seqs_otu_id) == 0
    scraper2.remove_identical_seqs()
    # print(scraper2.data.otu_dict)
    # print(len(scraper.new_seqs_otu_id), 38)
    # print(len(scraper2.new_seqs_otu_id), 36)
    m = len(scraper2.new_seqs_otu_id) == 36
    assert a * b * c * d * e * f * g * h * i * j * k * l * m == True

def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)

    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)

    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break

    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)

    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after

def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) <= treshold:
            filteredScrape.add_all(key)

    treshold_undermin = 0
    for key in filteredScrape.sp_d:
        for key2 in filteredScrape.sp_d[key]:
            if len(filteredScrape.sp_d[key]) <= treshold:
                if '^physcraper:status' in key2:
                    if key2['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                        if key2['^physcraper:last_blasted'] == '1800/01/01':
                            treshold_undermin += 1
    add_all_thresholdmin = filteredScrape.filtered_seq
    assert treshold_undermin == len(add_all_thresholdmin)

def test_sp_seq_d():
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for key in sp_d:
        v = sp_d[key]
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])

    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])

    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)

def test_add_local():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    sys.stdout.write("\ntest addLocal\n")

    # Prune sequences below a certain length threshold
    data_obj.prune_short()
    data_obj.write_files()
    data_obj.write_labelled(label='^ot:ottTaxonName', add_gb_id=True)
    data_obj.write_otus("otu_info", schema='table')
    data_obj.dump()

    sys.stdout.write("setting up id dictionaries\n")
    sys.stdout.flush()
    ids = IdDicts(conf, workdir=workdir)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist

    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)
        # filteredScrape.make_otu_dict_entry_unpubl()

    filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True

def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1

    #############################
    id_seq = [
        "TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
    ]

    # print("start test")
    tmp_dict = dict((taxon.label, filteredScrape.data.aln[taxon].symbols_as_string()) for taxon in filteredScrape.data.aln)
    old_seqs = tmp_dict.keys()
    avg_seqlen = sum(filteredScrape.data.orig_seqlen) / len(filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen * filteredScrape.config.seq_len_perc

    count = 1
    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id]['^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 0101010101
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id]['^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id]['^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id]['^physcraper:last_blasted'] = "1800/01/01"
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict

    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write("todo: add check that newly added seq are checked. they are, but there is no test")

def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    # print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    # print scraper.ncbi_mrca
    assert (len(scraper.new_seqs) == 0)
    assert (len(scraper.data.aln) == 5)
    assert len(scraper.new_seqs_otu_id) == 17

    # Now that we are pulling the full remote sequences, we don't have any identical sequences in the test.
    # TODO: find an example where we do get identical sequences and need to discard them
    # seqset = set()
    # for otu in scraper.new_seqs_otu_id:
    #     seq = scraper.new_seqs_otu_id[otu]
    #     if seq in seqset:
    #         print otu
    #     seqset.add(seq)

    # Check that every new sequence is unique in the new seqs set, and is not a substring of another sequence.
    # for otu in scraper.new_seqs_otu_id:
    #     qseq = scraper.new_seqs_otu_id[otu]
    #     count = 0
    #     for seq in seqset:
    #         if qseq in seq:
    #             count += 1
    #     assert count == 1

    # for taxon in scraper.data.tre.taxon_namespace:
    #     assert(taxon.label in scraper.data.otu_dict)
    #     status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
    #     assert(status in ('original', 'query'))

    aln_path1 = scraper.data.write_aln()
    aln_path = scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22

def test_id_dicts():
    conf = ConfigObj(configfi, interactive=True)
    ids = IdDicts(conf, workdir=workdir)
    selection = random.sample(ids.ott_to_ncbi.keys(), 10)
    for ott_id in selection:
        ncbi_id = ids.ott_to_ncbi[ott_id]
        assert ids.ncbi_to_ott[ncbi_id] == ott_id

def test_no_mrca():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    ingroup_mrca = None

    # setup the run
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)
    # print(ids.mrca_ott, ids.mrca_ncbi)

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    filteredScrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    filteredScrape.threshold = 5
    assert filteredScrape.mrca_ncbi == 18794

    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    assert len(filteredScrape.new_seqs_otu_id) in [23, 17]  # local vs. remote searches return a different number of sequences

def test_species_translation():
    spn = "Mephitis mephitis"
    info = get_ott_taxon_info(spn)
    if info:
        ottid, ottname, ncbi_id = info
    a = ottid == 231602

    tree_of_life.mrca(ott_ids=[ottid], wrap_response=False)

    ott_ids = [770315, 158484]
    ott_mrca = get_mrca_ott(ott_ids)
    b = ott_mrca == 312031

    workdir = "tests/output/tmp"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    """Tests if your own input files will generate a data object of class AlignTreeTax"""
    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    otu_json = OtuJsonDict(id_to_spn, ids)
    c = otu_json == expected_json  # expected_json (the expected OtuJsonDict output) is assumed to be defined at module scope
    assert a * b * c == 1

def test_owndata():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "tests/data/localblast.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    """Tests if your own input files will generate a data object of class AlignTreeTax"""

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=trfn,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    assert isinstance(data_obj, AlignTreeTax)

def test():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_own_local"
    configfi = "tests/data/test.config"
    # configfi = "tests/data/aws.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    wrappers.own_data_run(seqaln,
                          mattype,
                          trfn,
                          schema_trf,
                          workdir,
                          otu_jsonfi,
                          configfi)

def load_ids_obj(conf, workdir):
    """
    Generates (or reloads) the IdDicts class object.

    :param conf: Config object of the physcraper class
    :param workdir: working directory
    :return: IdDicts object
    """
    if os.path.isfile("{}/id_pickle.p".format(workdir)):
        sys.stdout.write("Reloading id dicts from {}\n".format(workdir))
        ids = pickle.load(open("{}/id_pickle.p".format(workdir), "rb"))
    else:
        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, "{}/id_pickle.p".format(workdir))
        ids.dump("{}/id_pickle.p".format(workdir))
    return ids

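# Minimal usage sketch for load_ids_obj (illustrative only; it assumes the same
# ConfigObj / "tests/data/test.config" setup used by the tests above and that this
# helper is importable where it is called):
#
#     conf = ConfigObj("tests/data/test.config", interactive=False)
#     ids = load_ids_obj(conf, workdir="tests/output/example_run")
#     # The first call builds the IdDicts object and dumps it to <workdir>/id_pickle.p;
#     # subsequent calls reload the pickled id dicts instead of rebuilding them.
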
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

def test_write_blast():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            blast_seq = filteredScrape.sp_seq_d[taxonID].keys()[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_files(workdir, taxonID, seq)
            blast_db = filteredScrape.sp_seq_d[taxonID].keys()[1:]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_files(workdir, blast_key, seq, db=True, fn=str(taxonID))
            break

    blast_file_blast = "{}/blast/{}_tobeblasted".format(workdir, taxonID)
    # print(blast_file_blast)
    blast_file_db = "{}/blast/{}_db".format(workdir, taxonID)
    # print(blast_file_db, blast_file_blast)
    if os.path.exists(blast_file_blast):
        with open(blast_file_blast) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0
    if os.path.exists(blast_file_db):
        with open(blast_file_db) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0

def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

    gi_data_otu_dict_added = []
    for v in filteredScrape.data.otu_dict.values():
        if '^ncbi:gi' in v:
            if (v['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter):
                gi_data_otu_dict_added.append(v['^ncbi:gi'])

    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^ncbi:gi' in v2:
                gi_sp_d.append(v2['^ncbi:gi'])

    user_data_otu_dict = []
    for v in filteredScrape.data.otu_dict.values():
        if '^user:TaxonName' in v:
            user_data_otu_dict.append(v['^user:TaxonName'])

    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^user:TaxonName' in v2:
                user_sp_d.append(v2['^user:TaxonName'])

    assert sorted(gi_data_otu_dict_added) == sorted(gi_sp_d)
    assert sorted(user_data_otu_dict) == sorted(user_sp_d)

def test_filter_length():
    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    threshold = 2
    selectby = "length"
    downtorank = "species"
    add_unpubl_seq = None
    blacklist = None
    id_to_spn_addseq_json = None
    ingroup_mrca = None
    shared_blast_folder = None

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()

    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)

    # if threshold is not None:
    #     filteredScrape.filter_seqs()

    length_filtered = len(filteredScrape.new_seqs)

def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.threshold = threshold
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
    filteredScrape.remove_identical_seqs()
    sp_d = filteredScrape.make_sp_dict(filteredScrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold

def test_add_local():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)
        # filteredScrape.make_otu_dict_entry_unpubl()

    filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True

def test():
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    # that's the main function
    wrappers.own_data_run(seqaln,
                          mattype,
                          trfn,
                          schema_trf,
                          workdir,
                          otu_jsonfi,
                          configfi)

def test_trim():
    # ------------------------
    seqaln = "tests/data/tiny_test_example/test_extralongseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_trim"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)
    assert len_start != len_end

def test():
    # define here your files
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/tiny_filter_own2"
    configfi = "tests/data/remote.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    # change to your filtering criteria
    threshold = 2
    selectby = "blast"
    downtorank = "species"
    ingroup_mrca = 723076

    # setup the run
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    # select a wrapper function, depending on what you want to do; see the short tutorial:
    wrappers.filter_data_run(seqaln,
                             mattype,
                             trfn,
                             schema_trf,
                             workdir,
                             threshold,
                             otu_jsonfi,
                             configfi,
                             selectby=selectby,
                             downtorank=downtorank,
                             ingroup_mrca=ingroup_mrca)

def test():
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)
    assert len_start == len_end

    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.config.trim_perc = 0.5
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)
    assert len_start > len_end

def test_compare_json():
    expected_json = {
        'otuSdoronicum': {
            '^ncbi:taxon': u'462523',
            '^ot:ottTaxonName': u'Senecio doronicum',
            '^ncbi:TaxonName': 'Senecio doronicum',
            '^physcraper:TaxonName': 'Senecio doronicum',
            '^physcraper:status': 'original',
            '^ot:ottId': 318436,
            '^user:TaxonName': 'Senecio_doronicum',
            '^ot:originalLabel': 'S_doronicum',
            '^physcraper:last_blasted': None
        },
        'otuSlagascanus': {
            '^ncbi:taxon': u'1268580',
            '^ot:ottTaxonName': u'Senecio lagascanus',
            '^ncbi:TaxonName': 'Senecio lagascanus',
            '^physcraper:TaxonName': 'Senecio lagascanus',
            '^physcraper:status': 'original',
            '^ot:ottId': 640718,
            '^user:TaxonName': 'Senecio_lagascanus',
            '^ot:originalLabel': 'S_lagascanus',
            '^physcraper:last_blasted': None
        },
        'otu2029doronicum': {
            '^ncbi:taxon': u'462523',
            '^ot:ottTaxonName': u'Senecio doronicum',
            '^ncbi:TaxonName': 'Senecio doronicum',
            '^physcraper:TaxonName': 'Senecio doronicum',
            '^physcraper:status': 'original',
            '^ot:ottId': 318436,
            '^user:TaxonName': 'Senecio_doronicum',
            '^ot:originalLabel': '2029_doronicum',
            '^physcraper:last_blasted': None
        },
        'otuSlopezii': {
            '^ncbi:taxon': u'1268581',
            '^ot:ottTaxonName': u'Senecio lopezii',
            '^ncbi:TaxonName': 'Senecio lopezii',
            '^physcraper:TaxonName': 'Senecio lopezii',
            '^physcraper:status': 'original',
            '^ot:ottId': 688688,
            '^user:TaxonName': 'Senecio_lopezii',
            '^ot:originalLabel': 'S_lopezii',
            '^physcraper:last_blasted': None
        },
        'otuSscopolii': {
            '^ncbi:taxon': u'1268589',
            '^ot:ottTaxonName': u'Senecio scopolii',
            '^ncbi:TaxonName': 'Senecio scopolii',
            '^physcraper:TaxonName': 'Senecio scopolii',
            '^physcraper:status': 'original',
            '^ot:ottId': 688671,
            '^user:TaxonName': 'Senecio_scopolii',
            '^ot:originalLabel': 'S_scopolii',
            '^physcraper:last_blasted': None
        }
    }

    workdir = "tests/output/tmp"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    otu_json = OtuJsonDict(id_to_spn, ids)
    print(otu_json)
    assert otu_json == expected_json

def test_internal_mpi():
    import pickle
    import sys
    import os
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    from mpi4py import MPI

    # set up until test
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    # load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()
    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date))
    # scraper.generate_streamed_alignment()

    if not os.path.exists("{}/previous_run".format(scraper.workdir)):
        os.mkdir("{}/previous_run".format(scraper.workdir))
    os.system("mv {}/papara_alignment.extended {}/previous_run/papara_alignment.extended".format(scraper.workdir, scraper.workdir))

    cwd = os.getcwd()
    # os.chdir(scraper.workdir)

    ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
    nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
    print(nnodes, ntasks)
    env_var = int(nnodes) * int(ntasks)
    # env_var = os.environ.get('SLURM_JOB_CPUS_PER_NODE', 7)
    print(env_var)

    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(scraper.workdir))
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call(["mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2",
                         "-m", "GTRCAT",
                         "-s", "{}/previous_run/papara_alignment.extended".format(scraper.workdir),
                         "-p", "1", "-f", "a", "-x", "1", "-#", "autoMRE",
                         "-n", "all{}".format(scraper.date)])

def test_loop_for_write_blast_files():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.add_setting_to_self(downtorank, threshold)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # This is the first part of the "how many sequences to keep" logic:
    # if the threshold is bigger than the number of sequences for a species, just add all of them.
    # print("run loop which we want to test")
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) > threshold:
            count_dict = filteredScrape.count_num_seq(key)
            if key in filteredScrape.sp_seq_d.keys():
                seq_present = count_dict["seq_present"]
                query_count = count_dict["query_count"]
                if seq_present >= 1 and seq_present < threshold and count_dict["new_taxon"] is False and query_count != 0:
                    if query_count + seq_present > threshold:
                        taxonfn = filteredScrape.loop_for_write_blast_files(key)

    # MAKE TEST FOR loop_for_write_blast_files
    for key in filteredScrape.sp_d:
        count = 0
        count_int = 0
        count_gi_file = 0
        count_str_file = 0
        db = False
        blasted = False
        if len(filteredScrape.sp_d[key]) > threshold:
            for sp_keys in filteredScrape.sp_seq_d[key].keys():
                if isinstance(sp_keys, str):
                    count += 1
                if isinstance(sp_keys, unicode):
                    count += 1
                else:
                    count_int += 1
            folder = '{}/blast/'.format(filteredScrape.workdir)
            for the_file in os.listdir(folder):
                spn = the_file.split("_")[0]
                spn = "_".join(the_file.split("_")[0])
                file_type = the_file.split("_")[1]
                if spn == key and file_type == "db":
                    # db = True
                    f = open('{}/blast/{}'.format(filteredScrape.workdir, the_file))
                    for line in iter(f):
                        if line[0] == ">":
                            count_gi_file += 1
                if spn == key and file_type == "tobeblasted":
                    blasted = True
                    count_str_file += 1
            if blasted:
                if count + count_int != threshold:
                    assert count_str_file == count
            if db:
                if count + count_int != threshold:
                    assert count_gi_file == count_int

trfn = "tests/data/tiny_comb_ets/tiny_comb_ets.tre"
schema_trf = "newick"
id_to_spn = r"tests/data/tiny_comb_ets/nicespl.csv"
workdir = "tiny_comb_ets"
configfi = "tests/data/localblast.config"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

threshold = 2
selectby = "blast"
downtorank = None

if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi)
ids = IdDicts(conf, workdir=workdir)

if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
                         schema_trf,
                         workdir,
                         threshold,
                         otu_jsonfi,
                         configfi,
                         selectby=selectby,
                         downtorank=downtorank)

def test_blacklist():
    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    debug("run without blacklist")
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))

    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    noblackScrape = FilterBlast(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if (os.path.isfile(full_file_name)):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    noblackScrape.generate_streamed_alignment()

    # one run with blacklist
    debug("run with blacklist")
    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if (os.path.isfile(full_file_name)):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    filteredScrape.generate_streamed_alignment()

    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)

    for item in blacklist:
        assert item not in gi_l
        print("RUN TESTS2!")
        assert item in gi_l_2
        # print("seq was not added in blacklist run")
        # print("inbetween step works")

    # test if it removes blacklist gi from an already added aln:
    print("run with later blacklist")
    # else:
    #     print("blacklist gi was added in previous run")
    #     print("now we want to remove it.")
    len_before = (len(noblackScrape.data.tre.taxon_namespace))
    noblackScrape.blacklist = blacklist
    noblackScrape.generate_streamed_alignment()
    assert len_before - 1 == len(noblackScrape.data.tre.taxon_namespace)

configfi = "tests/data/localblast.config"
otu_jsonfi = "{}/otu_dict.json".format(workdir)

# change to your filtering criteria
threshold = 2
selectby = "blast"
downtorank = "species"
shared_blast_folder = "/home/blubb/Documents/gitdata/physcraper/shared_runs/"
ingroup_mrca = 723076

# setup the run
if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi, interactive=False)
ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)

if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

# select a wrapper function, depending on what you want to do; see the short tutorial:
wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
                         schema_trf,
                         workdir,
                         threshold,
                         otu_jsonfi,
                         configfi,
                         selectby=selectby,
                         downtorank=downtorank,
                         ingroup_mrca=ingroup_mrca,
                         shared_blast_folder=shared_blast_folder)