def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)

    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)

    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break

    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)

    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after


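# Note: the setup used above (load the precooked data object and accession map, then
# build a FilterBlast object) is repeated in most tests below. A minimal helper sketch
# that factors it out is given here for illustration only; the name
# `make_filtered_scrape` is hypothetical and assumes the module-level `configfi` and
# `absworkdir` already used by these tests.
def make_filtered_scrape():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    return FilterBlast(data_obj, ids)

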
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

    gi_data_otu_dict_added = []
    for v in filteredScrape.data.otu_dict.values():
        if '^ncbi:gi' in v:
            if v['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                gi_data_otu_dict_added.append(v['^ncbi:gi'])

    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^ncbi:gi' in v2:
                gi_sp_d.append(v2['^ncbi:gi'])

    user_data_otu_dict = []
    for v in filteredScrape.data.otu_dict.values():
        if '^user:TaxonName' in v:
            user_data_otu_dict.append(v['^user:TaxonName'])

    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^user:TaxonName' in v2:
                user_sp_d.append(v2['^user:TaxonName'])

    assert sorted(gi_data_otu_dict_added) == sorted(gi_sp_d)
    assert sorted(user_data_otu_dict) == sorted(user_sp_d)


def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1

    #############################
    id_seq = [
        "TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
    ]

    # print("start test")
    tmp_dict = dict((taxon.label, filteredScrape.data.aln[taxon].symbols_as_string())
                    for taxon in filteredScrape.data.aln)
    old_seqs = tmp_dict.keys()
    avg_seqlen = sum(filteredScrape.data.orig_seqlen) / len(filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen * filteredScrape.config.seq_len_perc
    count = 1
    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id]['^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 0101010101
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id]['^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id]['^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id]['^physcraper:last_blasted'] = "1800/01/01"
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write("todo: add check that newly added seq are checked. they are, but there is no test")


def test_sp_seq_d():
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])

    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])

    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)


def test_blacklist():
    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    debug("run without blacklist")
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    noblackScrape = FilterBlast(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    noblackScrape.generate_streamed_alignment()

    # one run with blacklist
    debug("run with blacklist")
    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    filteredScrape.generate_streamed_alignment()

    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
        print("RUN TESTS2!")
        assert item in gi_l_2
        # print("seq was not added in blacklist run")
        # print("inbetween step works")

    # test if it removes blacklist gi from already added aln:
    print("run with later blacklist")
    # else:
    #     print("blacklist gi was added in previous run")
    #     print("now we want to remove it.")
    len_before = len(noblackScrape.data.tre.taxon_namespace)
    noblackScrape.blacklist = blacklist
    noblackScrape.generate_streamed_alignment()
    assert len_before - 1 == len(noblackScrape.data.tre.taxon_namespace)


def test_add_local():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    sys.stdout.write("\ntest addLocal\n")

    # Prune sequences below a certain length threshold
    data_obj.prune_short()
    data_obj.write_files()
    data_obj.write_labelled(label='^ot:ottTaxonName', add_gb_id=True)
    data_obj.write_otus("otu_info", schema='table')
    data_obj.dump()

    sys.stdout.write("setting up id dictionaries\n")
    sys.stdout.flush()
    ids = IdDicts(conf, workdir=workdir)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist

    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)
        # filteredScrape.make_otu_dict_entry_unpubl()
        filteredScrape.run_blast_wrapper()
        filteredScrape.read_blast_wrapper()
        filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True


def run_with_settings(settings):
    """Looks for a pickled file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output."""
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(settings.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(settings.workdir), "rb"))
        filteredScrape.repeat = 1
    else:
        conf = ConfigObj(settings.configfi)
        # print("config")
        debug(dir(conf))
        debug(conf.email)

        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=settings.seqaln,
                                           mattype=settings.mattype,
                                           workdir=settings.workdir,
                                           treefile=settings.trfn,
                                           schema_trf=settings.schema_trf,
                                           otu_json=settings.spInfoDict,
                                           ingroup_mrca=None)

        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        ids = IdDicts(conf, workdir=settings.workdir)

        filteredScrape = FilterBlast(data_obj, ids, settings)
        filteredScrape.add_setting_to_self(settings.downtorank, settings.threshold)
        filteredScrape.write_otu_info(settings.downtorank)

        if settings.add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.write_unpubl_blastdb(settings.add_unpubl_seq)
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.local_otu_json = settings.id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False

        # run the analyses
        if filteredScrape.unpublished is not True:
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            if settings.threshold is not None:
                filteredScrape.sp_dict(settings.downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
                filteredScrape.replace_new_seq()
            debug("from replace to streamed aln")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()

    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        filteredScrape.run_blast_wrapper(settings.delay)
        filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        if settings.threshold is not None:
            filteredScrape.sp_dict(settings.downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=settings.threshold, selectby=settings.selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(settings.downtorank)
    return filteredScrape


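# A minimal usage sketch for run_with_settings(), for illustration only. It assumes a
# plain attribute container (argparse.Namespace) carrying the attributes the function
# reads above; the file paths and parameter values are hypothetical placeholders, not
# part of the package, and this helper is never called by the pipeline itself.
def _example_run_with_settings():
    import argparse
    example_settings = argparse.Namespace(
        workdir="runs/example",
        configfi="tests/data/test.config",
        seqaln="tests/data/tiny_test_example/test.fas",
        mattype="fasta",
        trfn="tests/data/tiny_test_example/test.tre",
        schema_trf="newick",
        spInfoDict="tests/data/tiny_test_example/otu_dict.json",
        downtorank=None,
        threshold=2,
        selectby="blast",
        add_unpubl_seq=None,
        id_to_spn_addseq_json=None,
        delay=14,
        shared_blast_folder=None,
    )
    return run_with_settings(example_settings)

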
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    spInfoDict,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """Looks for a pickled file to continue a run, or builds and runs a new analysis
    for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    debug("Debugging mode is on")
    # debug(shared_blast_folder)
    # if _DEBUG_MK == 1:
    #     random.seed(3269235691)
    print(workdir)
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)

        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=spInfoDict,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist

        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.unpublished = True
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            print("add unpubl otu json")
            filteredScrape.data.unpubl_otu_json = id_to_spn_addseq_json
            print(filteredScrape.data.unpubl_otu_json)
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        else:
            # run the analysis
            sys.stdout.write("BLASTing input sequences\n")
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.data.write_otus("otu_info", schema="table")
            filteredScrape.write_otu_info(downtorank)
            filteredScrape.dump()

    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        if shared_blast_folder:
            filteredScrape.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(downtorank)
    # print(some)
    filteredScrape.write_otu_info(downtorank)
    return filteredScrape


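# A minimal usage sketch for filter_data_run(), for illustration only. Only the keyword
# names follow the signature above; the input files named here are hypothetical
# placeholders, and this helper is never called by the pipeline itself.
def _example_filter_data_run():
    return filter_data_run(seqaln="tests/data/tiny_test_example/test.fas",
                           mattype="fasta",
                           trfn="tests/data/tiny_test_example/test.tre",
                           schema_trf="newick",
                           workdir="runs/example_filter",
                           threshold=2,
                           spInfoDict="tests/data/tiny_test_example/otu_dict.json",
                           configfi="tests/data/test.config",
                           selectby="blast",
                           downtorank=None)

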
def test_loop_for_write_blast_files():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.add_setting_to_self(downtorank, threshold)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # This is the code of the first part of how many seqs to keep:
    # if threshold is bigger than the number of seqs for a sp, just add all.
    # print("run loop which we want to test")
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) > threshold:
            count_dict = filteredScrape.count_num_seq(key)
            if key in filteredScrape.sp_seq_d.keys():
                seq_present = count_dict["seq_present"]
                query_count = count_dict["query_count"]
                if seq_present >= 1 and seq_present < threshold and count_dict["new_taxon"] is False and query_count != 0:
                    if query_count + seq_present > threshold:
                        taxonfn = filteredScrape.loop_for_write_blast_files(key)

    # MAKE TEST FOR loop_for_write_blast_files
    for key in filteredScrape.sp_d:
        count = 0
        count_int = 0
        count_gi_file = 0
        count_str_file = 0
        db = False
        blasted = False
        if len(filteredScrape.sp_d[key]) > threshold:
            for sp_keys in filteredScrape.sp_seq_d[key].keys():
                if isinstance(sp_keys, str):
                    count += 1
                elif isinstance(sp_keys, unicode):
                    count += 1
                else:
                    count_int += 1
            folder = '{}/blast/'.format(filteredScrape.workdir)
            for the_file in os.listdir(folder):
                spn = the_file.split("_")[0]
                file_type = the_file.split("_")[1]
                if spn == key and file_type == "db":
                    # db = True
                    f = open('{}/blast/{}'.format(filteredScrape.workdir, the_file))
                    for line in iter(f):
                        if line[0] == ">":
                            count_gi_file += 1
                if spn == key and file_type == "tobeblasted":
                    blasted = True
                    count_str_file += 1
            if blasted:
                if count + count_int != threshold:
                    assert count_str_file == count
            if db:
                if count + count_int != threshold:
                    assert count_gi_file == count_int


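# For readability, the condition exercised by the first loop in
# test_loop_for_write_blast_files() can be stated as a small predicate. This helper is a
# sketch for illustration only (`needs_local_blast` is not part of the package); it
# assumes a count_dict with the same keys used above.
def needs_local_blast(num_in_sp_d, count_dict, threshold):
    """Return True when a taxon should get local-blast files written: more candidates
    than the threshold, at least one but fewer than `threshold` seqs already present,
    the taxon is not new, there are queries, and adding all queries would exceed the
    threshold."""
    seq_present = count_dict["seq_present"]
    query_count = count_dict["query_count"]
    return (num_in_sp_d > threshold
            and 1 <= seq_present < threshold
            and count_dict["new_taxon"] is False
            and query_count != 0
            and query_count + seq_present > threshold)

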
def test_write_blast():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            blast_seq = filteredScrape.sp_seq_d[taxonID].keys()[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_files(workdir, taxonID, seq)
            blast_db = filteredScrape.sp_seq_d[taxonID].keys()[1:]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_files(workdir, blast_key, seq, db=True, fn=str(taxonID))
            break

    blast_file_blast = "{}/blast/{}_tobeblasted".format(workdir, taxonID)
    # print(blast_file_blast)
    blast_file_db = "{}/blast/{}_db".format(workdir, taxonID)
    # print(blast_file_db, blast_file_blast)
    if os.path.exists(blast_file_blast):
        with open(blast_file_blast) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0
    if os.path.exists(blast_file_db):
        with open(blast_file_db) as f:
            first_line = f.readline()
        assert len(first_line.strip()) != 0


def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()
    filteredScrape.seq_filter = ['deleted', 'subsequence,', 'not', "removed", "deleted,"]

    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) <= treshold:
            filteredScrape.add_all(key)

    treshold_undermin = 0
    for key in filteredScrape.sp_d:
        for key2 in filteredScrape.sp_d[key]:
            if len(filteredScrape.sp_d[key]) <= treshold:
                if '^physcraper:status' in key2:
                    if key2['^physcraper:status'].split(' ')[0] not in filteredScrape.seq_filter:
                        if key2['^physcraper:last_blasted'] == '1800/01/01':
                            treshold_undermin += 1
    add_all_thresholdmin = filteredScrape.filtered_seq
    assert treshold_undermin == len(add_all_thresholdmin)