def test_owndata():
    """Tests if your own input files will generate a data object of class AlignTreeTax."""
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "tests/data/localblast.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)
    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=trfn,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    assert isinstance(data_obj, AlignTreeTax)
def test_config():
    from physcraper import ConfigObj
    configfi = "tests/data/test.config"
    conf = ConfigObj(configfi, interactive=False)
    print(conf.__dict__.keys())
    if conf.blast_loc != "remote":
        expected_keys = [
            'seq_len_perc', 'num_threads', 'phylesystem_loc',
            'ncbi_parser_names_fn', 'ncbi_parser_nodes_fn', 'maxlen',
            'id_pickle', 'hitlist_size', 'gb_id_filename', 'delay',
            'unmapped', 'trim_perc', 'url_base', 'ott_ncbi', 'blast_loc',
            'email', 'e_value_thresh', 'blastdb'
        ]
    else:
        expected_keys = [
            'seq_len_perc', 'num_threads', 'phylesystem_loc', 'maxlen',
            'hitlist_size', 'gb_id_filename', 'delay', 'unmapped',
            'trim_perc', 'url_base', 'ott_ncbi', 'blast_loc', 'id_pickle',
            'email', 'e_value_thresh'
        ]
    assert len(conf.email.split("@")) == 2
    # assert conf.url_base == None
    assert set(conf.__dict__.keys()) == set(expected_keys)
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) <= treshold:
            filteredScrape.add_all(key)
    treshold_undermin = 0
    for key in filteredScrape.sp_d:
        for key2 in filteredScrape.sp_d[key]:
            if len(filteredScrape.sp_d[key]) <= treshold:
                if '^physcraper:status' in key2:
                    if key2['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                        if key2['^physcraper:last_blasted'] == '1800/01/01':
                            treshold_undermin += 1
    add_all_thresholdmin = filteredScrape.filtered_seq
    assert treshold_undermin == len(add_all_thresholdmin)
def test_config():
    from physcraper import ConfigObj
    configfi = "tests/data/localblast.config"
    conf = ConfigObj(configfi, interactive=False)
    assert conf.email == '*****@*****.**'
    assert conf.url_base is None
    assert set(conf.__dict__.keys()) == set(expected_keys)
def test():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_own_local"
    configfi = "tests/data/test.config"
    # configfi = "tests/data/aws.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)
    if os.path.exists(otu_jsonfi):
        print("load json")
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    wrappers.own_data_run(seqaln,
                          mattype,
                          trfn,
                          schema_trf,
                          workdir,
                          otu_jsonfi,
                          configfi)
def test_no_mrca():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    ingroup_mrca = None

    # setup the run
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)
    # print(ids.mrca_ott, ids.mrca_ncbi)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    filteredScrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    filteredScrape.threshold = 5
    assert filteredScrape.mrca_ncbi == 18794
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    # Blurgh, local vs. remote searches get a different number of seqs!
    assert len(filteredScrape.new_seqs_otu_id) in [23, 17]
def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)
    # remove a single taxon, then check that alignment, tree and both namespaces shrink
    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break
    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)
    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after
def test_species_translation():
    """Tests the OTT/NCBI id translation and that your own input files
    generate the expected otu_dict."""
    spn = "Mephitis mephitis"
    info = get_ott_taxon_info(spn)
    if info:
        ottid, ottname, ncbi_id = info
    a = ottid == 231602
    tree_of_life.mrca(ott_ids=[ottid], wrap_response=False)
    ott_ids = [770315, 158484]
    ott_mrca = get_mrca_ott(ott_ids)
    b = ott_mrca == 312031
    workdir = "tests/output/tmp"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    otu_json = OtuJsonDict(id_to_spn, ids)
    c = otu_json == expected_json
    assert a and b and c
def test_id_dicts():
    conf = ConfigObj(configfi, interactive=True)
    ids = IdDicts(conf, workdir=workdir)
    # random.sample needs a sequence, not a dict view, under Python 3
    selection = random.sample(list(ids.ott_to_ncbi.keys()), 10)
    for ott_id in selection:
        ncbi_id = ids.ott_to_ncbi[ott_id]
        assert ids.ncbi_to_ott[ncbi_id] == ott_id
def test_run_raxml():
    workdir = "tests/output/test_run_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    # load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1
    # run needed functions
    # scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    # scraper.align_query_seqs()
    # scraper.place_query_seqs()
    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(scraper.workdir, scraper.date))
def test_sp_seq_d():
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for key in sp_d:
        v = sp_d[key]
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])
    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # GenBank accessions carry a version suffix ("."), user/OTT labels do not
            if len(k.split('.')) >= 2:
                gi_sp_seq_d.append(k)
            else:
                ott_sp_seq_d.append(k)
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
def test_add_local():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))
    sys.stdout.write("\ntest addLocal\n")

    # Prune sequences below a certain length threshold
    data_obj.prune_short()
    data_obj.write_files()
    data_obj.write_labelled(label='^ot:ottTaxonName', add_gb_id=True)
    data_obj.write_otus("otu_info", schema='table')
    data_obj.dump()
    sys.stdout.write("setting up id dictionaries\n")
    sys.stdout.flush()
    ids = IdDicts(conf, workdir=workdir)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist
    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)
    filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test
def sync_ncbi(configfi):
    conf = ConfigObj(configfi)
    subprocess.call(
        [
            "rsync",
            "-av",
            "ftp.ncbi.nih.gov::pub/taxonomy/gi_taxid_nucl.dmp.gz",
            "{}/gi_taxid_nucl.dmp.gz".format(conf.ncbi_dmp),
        ]
    )
    subprocess.call(["gunzip", "{}/gi_taxid_nucl.dmp.gz".format(conf.ncbi_dmp)])
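# A minimal usage sketch for sync_ncbi, assuming a config file whose ncbi_dmp
# option points at the download directory; the path below mirrors the test
# configs used elsewhere and is only illustrative.
def example_sync_ncbi():
    sync_ncbi("tests/data/test.config")  # downloads and unpacks gi_taxid_nucl.dmp.gz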
def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    #############################
    id_seq = [
        "TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
    ]
    tmp_dict = dict((taxon.label, filteredScrape.data.aln[taxon].symbols_as_string())
                    for taxon in filteredScrape.data.aln)
    old_seqs = list(tmp_dict.keys())
    avg_seqlen = sum(filteredScrape.data.orig_seqlen) / len(filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen * filteredScrape.config.seq_len_perc
    count = 1
    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            # dummy placeholder metadata for the query sequence
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id]['^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 101010101
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id]['^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id]['^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id]['^physcraper:last_blasted'] = "1800/01/01"
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write("todo: add check that newly added seqs are checked; they are, but there is no test\n")
def test_load_otol_data():
    study_id = "pg_873"
    tree_id = "tree1679"
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree_unmappedtaxa"
    absworkdir = os.path.abspath(workdir)
    configfi = "tests/data/test.config"
    ingroup_mrca = None
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    data_obj = wrappers.load_otol_data(conf, ingroup_mrca, mattype, seqaln,
                                       study_id, tree_id, workdir)
    assert data_obj
def test():
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    # that's the main function
    wrappers.own_data_run(seqaln,
                          mattype,
                          trfn,
                          schema_trf,
                          workdir,
                          otu_jsonfi,
                          configfi)
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
def test_write_blast():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()
    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            # dict views are not subscriptable under Python 3
            keys = list(filteredScrape.sp_seq_d[taxonID].keys())
            blast_seq = keys[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_files(workdir, taxonID, seq)
            blast_db = keys[1:]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_files(workdir, blast_key, seq, db=True, fn=str(taxonID))
            break
    blast_file_blast = "{}/blast/{}_tobeblasted".format(workdir, taxonID)
    blast_file_db = "{}/blast/{}_db".format(workdir, taxonID)
    if os.path.exists(blast_file_blast):
        with open(blast_file_blast) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0
    if os.path.exists(blast_file_db):
        with open(blast_file_db) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0
def test_trim():
    seqaln = "tests/data/tiny_test_example/test_extralongseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_trim"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    # all rows of an alignment have the same length, so any sequence will do
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)
    assert len_start != len_end
def test():
    # define here your files
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/tiny_filter_own2"
    configfi = "tests/data/remote.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    # change to your filtering criteria
    threshold = 2
    selectby = "blast"
    downtorank = "species"
    ingroup_mrca = 723076

    # setup the run
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)
    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    # select a wrapper function, depending on what you want to do; see the short tutorial
    wrappers.filter_data_run(seqaln,
                             mattype,
                             trfn,
                             schema_trf,
                             workdir,
                             threshold,
                             otu_jsonfi,
                             configfi,
                             selectby=selectby,
                             downtorank=downtorank,
                             ingroup_mrca=ingroup_mrca)
def filter_data_run(seqaln, mattype, trfn, schema_trf, workdir, threshold,
                    id_to_spn, configfi, selectby="blast", downtorank=None,
                    blacklist=None, add_unpubl_seq=None,
                    id_to_spn_addseq_json=None, ingroup_mrca=None,
                    shared_blast_folder=None):
    """Looks for a pickled file to continue a run, or builds and runs a new
    analysis for as long as new sequences are found.

    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    license_print()
    debug("Debugging mode is on")
    print(workdir)
    print(os.path.exists(workdir))
    if not os.path.exists(workdir):
        print("make wd")
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)
    make_otujsondict(id_to_spn, workdir, ids)
    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)
    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir, ingroup_mrca)
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj, downtorank,
                                   id_to_spn_addseq_json, ids, selectby,
                                   shared_blast_folder, threshold, ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
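# A minimal usage sketch for filter_data_run, reusing the tiny_test_example
# fixtures that appear throughout the tests; the workdir name is made up for
# this example.
def example_filter_data_run():
    filtered = filter_data_run(seqaln="tests/data/tiny_test_example/test.fas",
                               mattype="fasta",
                               trfn="tests/data/tiny_test_example/test.tre",
                               schema_trf="newick",
                               workdir="tests/output/example_filter_run",  # hypothetical
                               threshold=2,
                               id_to_spn="tests/data/tiny_test_example/test_nicespl.csv",
                               configfi="tests/data/test.config",
                               selectby="blast",
                               downtorank="species")
    return filtered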
def test_filter_length():
    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    threshold = 2
    selectby = "length"
    downtorank = "species"
    add_unpubl_seq = None
    blacklist = None
    id_to_spn_addseq_json = None
    ingroup_mrca = None
    shared_blast_folder = None
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist
    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()
    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)
    # if threshold is not None:
    #     filteredScrape.filter_seqs()
    length_filtered = len(filteredScrape.new_seqs)
def add_unpubl_to_backbone(seqaln, mattype, trfn, schema_trf, workdir,
                           sp_info_jsonfi, configfi, add_unpubl_seq,
                           id_to_spn_addseq_json, selectby=None, downtorank=None,
                           threshold=None, blacklist=None, ingroup_mrca=None,
                           shared_blast_folder=None):
    """Adds unpublished data to an input tree (the e-value should be set higher
    than usual). The backbone tree will not be updated.

    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    license_print()
    # read the config file into a configuration object
    conf = ConfigObj(configfi)
    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir, ingroup_mrca)
    ids = load_ids_obj(conf, workdir)
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj, downtorank,
                                   id_to_spn_addseq_json, ids, selectby,
                                   shared_blast_folder, threshold, ingroup_mrca,
                                   backbone=True)
    save_copy_code(workdir)
    return filteredScrape
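# A minimal usage sketch for add_unpubl_to_backbone; the local-sequence folder
# and the json/csv paths marked below are hypothetical placeholders, the rest
# mirrors the tiny_test_example fixtures used elsewhere in the tests.
def example_add_unpubl_to_backbone():
    filtered = add_unpubl_to_backbone(
        seqaln="tests/data/tiny_test_example/test.fas",
        mattype="fasta",
        trfn="tests/data/tiny_test_example/test.tre",
        schema_trf="newick",
        workdir="tests/output/example_backbone",  # hypothetical
        sp_info_jsonfi="tests/output/example_backbone/otu_dict.json",  # hypothetical
        configfi="tests/data/test.config",
        add_unpubl_seq="tests/data/local_seqs",  # hypothetical folder with unpublished fasta files
        id_to_spn_addseq_json="tests/data/local_spn.csv")  # hypothetical mapping file
    return filtered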
def filter_OTOL(study_id, tree_id, seqaln, mattype, workdir, configfi, threshold,
                selectby="blast", downtorank=None, blacklist=None,
                add_unpubl_seq=None,  # path to local seq
                id_to_spn_addseq_json=None, ingroup_mrca=None,
                shared_blast_folder=None):
    """Looks for a pickled file to continue a run, or builds and runs a new
    analysis for as long as new sequences are found.

    This uses the FilterBlast subclass to be able to filter the blast output,
    using data from OToL.
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    # read the config file into a configuration object
    conf = ConfigObj(configfi)
    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id, tree_id, workdir)
    ids = load_ids_obj(conf, workdir)
    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj, downtorank,
                                   id_to_spn_addseq_json, ids, selectby,
                                   shared_blast_folder, threshold, ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
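# A minimal usage sketch for filter_OTOL, reusing the OpenTree study/tree ids
# and alignment from test_load_otol_data above; the workdir name is made up.
def example_filter_OTOL():
    filtered = filter_OTOL(study_id="pg_873",
                           tree_id="tree1679",
                           seqaln="tests/data/minitest.fas",
                           mattype="fasta",
                           workdir="tests/output/example_filter_otol",  # hypothetical
                           configfi="tests/data/test.config",
                           threshold=2)
    return filtered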
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
    gi_data_otu_dict_added = []
    for v in filteredScrape.data.otu_dict.values():
        if '^ncbi:gi' in v:
            if v['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                gi_data_otu_dict_added.append(v['^ncbi:gi'])
    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^ncbi:gi' in v2:
                gi_sp_d.append(v2['^ncbi:gi'])
    user_data_otu_dict = []
    for v in filteredScrape.data.otu_dict.values():
        if '^user:TaxonName' in v:
            user_data_otu_dict.append(v['^user:TaxonName'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^user:TaxonName' in v2:
                user_sp_d.append(v2['^user:TaxonName'])
    assert sorted(gi_data_otu_dict_added) == sorted(gi_sp_d)
    assert sorted(user_data_otu_dict) == sorted(user_sp_d)
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.threshold = threshold
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]
    filteredScrape.remove_identical_seqs()
    sp_d = filteredScrape.make_sp_dict(filteredScrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold
def test_load_own_data():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/impls_mrcalist_local"
    configfi = "tests/data/test.config"
    ingroup_mrca = None
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    ids = wrappers.load_ids_obj(conf, workdir)
    wrappers.make_otujsondict(id_to_spn, workdir, ids)
    data_obj = wrappers.load_own_data(conf, seqaln, mattype, trfn, schema_trf,
                                      workdir, ingroup_mrca)
    assert data_obj
def test():
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    # with the default trim fraction nothing should be cut off
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)
    assert len_start == len_end
    # with a more aggressive trim fraction the alignment must get shorter
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.config.trim_perc = 0.5
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)
    assert len_start > len_end
def own_data_run(seqaln, mattype, trfn, schema_trf, workdir, id_to_spn,
                 configfi, ingroup_mrca=None, shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper standard run with your own data.

    You need:
        seqaln = path to sequence alignment file
        mattype = the format name of your alignment
        trfn = path to file with the phylogeny to update
        schema_trf = format type of your phylogeny
        workdir = define where your analysis files shall be stored
        id_to_spn = a json file which has the otu_dict stored, which is generated by the
            OtuJsonDict function (usually, just leave it like it is in the example scripts)
        configfi = path to your config file
        ingroup_mrca = optional; if you want to limit your run to a certain clade, give the
            OpenTree ID here, which can be obtained by running:
            python scripts/get_ott.py ingroup_name
        shared_blast_folder = optional; if you want to share blast searches across runs (see
            documentation), give the path to the folder with the shared runs
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)
    make_otujsondict(id_to_spn, workdir, ids)
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir, ingroup_mrca)
    # Mapping identifiers between original data and NCBI requires an identifier dict object
    # scraper = PhyscraperScrape(data_obj, ids)
    scraper = PS_standard_run(data_obj, ids, shared_blast_folder)
    save_copy_code(workdir)
    return 1
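# A minimal usage sketch for own_data_run, matching the parameter list in the
# docstring above; the paths reuse the tiny_test_example fixtures, the workdir
# name is made up for this example.
def example_own_data_run():
    own_data_run(seqaln="tests/data/tiny_test_example/test.fas",
                 mattype="fasta",
                 trfn="tests/data/tiny_test_example/test.tre",
                 schema_trf="newick",
                 workdir="tests/output/example_own_data",  # hypothetical
                 id_to_spn="tests/data/tiny_test_example/test_nicespl.csv",
                 configfi="tests/data/test.config")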
def test_generate_ATT_from_file():
    seqaln = "tests/data/input.fas"
    mattype = "fasta"
    workdir = "tests/fromfile"
    treefile = "tests/data/input.tre"
    otu_jsonfi = "tests/data/otu_dict.json"
    schema_trf = "newick"
    configfi = "tests/data/test.config"
    sys.stdout.write("\nTesting 'generate_ATT_from_files (fromfile.py)'\n")
    conf = ConfigObj(configfi, interactive=False)
    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi)
    assert data_obj