def test_prune_short():
    """prune_short(0.9) must remove short sequences, shrinking the tree's taxon namespace.

    Relies on module-level fixtures: workdir, configfi, otu_jsonfi, id_to_spn,
    seqaln, mattype, treefile, schema_trf.
    """
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))
    conf = physcraper.ConfigObj(configfi, interactive=False)
    ids = physcraper.IdDicts(conf, workdir=workdir)
    # Reuse a cached otu dict when present; otherwise build and cache it.
    if os.path.exists(otu_jsonfi):
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = physcraper.OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = physcraper.generate_ATT_from_files(seqaln=seqaln,
                                                  mattype=mattype,
                                                  workdir=workdir,
                                                  # fix: every other generate_ATT_from_files
                                                  # call in this file passes the config object
                                                  config_obj=conf,
                                                  treefile=treefile,
                                                  schema_trf=schema_trf,
                                                  otu_json=otu_jsonfi,
                                                  ingroup_mrca=None)
    len_before = len(data_obj.tre.taxon_namespace)
    # NOTE(review): the newer variant of this test sets data_obj.config.seq_len_perc
    # and calls prune_short() with no argument — confirm prune_short still accepts
    # an explicit threshold before relying on this form.
    data_obj.prune_short(0.9)
    len_after = len(data_obj.tre.taxon_namespace)
    assert len_before > len_after
def test_prune_short():
    """prune_short() driven by config.seq_len_perc should drop short sequences
    and therefore shrink the tree's taxon namespace."""
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))
    conf = physcraper.ConfigObj(configfi, interactive=False)
    # Remote blast skips loading the local names/nodes files, which this test
    # never touches, so setup is faster.
    conf.blast_loc = 'remote'
    ids = physcraper.IdDicts(conf, workdir=workdir)
    # Build the otu dict only when no cached copy exists on disk.
    if os.path.exists(otu_jsonfi):
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = physcraper.OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = physcraper.generate_ATT_from_files(seqaln=seqaln,
                                                  mattype=mattype,
                                                  workdir=workdir,
                                                  config_obj=conf,
                                                  treefile=treefile,
                                                  schema_trf=schema_trf,
                                                  otu_json=otu_jsonfi,
                                                  ingroup_mrca=None)
    # Tighten the length threshold, then prune and compare taxon counts.
    data_obj.config.seq_len_perc = 0.9
    n_taxa_before = len(data_obj.tre.taxon_namespace)
    data_obj.prune_short()
    n_taxa_after = len(data_obj.tre.taxon_namespace)
    assert n_taxa_before > n_taxa_after
def test_owndata():
    """Tests if your own input files will generate a data object of class AlignTreeTax."""
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "tests/data/localblast.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))
    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)
    # Load the cached otu dict if one exists, otherwise create and persist it.
    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = generate_ATT_from_files(seqaln=seqaln, mattype=mattype,
                                       workdir=workdir, config_obj=conf,
                                       treefile=trfn, schema_trf=schema_trf,
                                       otu_json=otu_jsonfi, ingroup_mrca=None)
    assert isinstance(data_obj, AlignTreeTax)
def test_trim():
    """trim() on an alignment containing an extra-long sequence should change
    the recorded sequence length."""
    # ------------------------
    seqaln = "tests/data/tiny_test_example/test_extralongseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_trim"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))
    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    # Record a pre-trim length.  Only the last taxon's length survives the loop;
    # assumes that is representative of the alignment — TODO confirm.
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    # fix: removed a stray bare `next` statement that evaluated the builtin
    # `next` and discarded it — a silent no-op.
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)
    assert len_start != len_end
def load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir, ingroup_mrca):
    """
    Generates ATT object from own data.

    :param conf: conf object from physcraper
    :param seqaln: sequence alignment file
    :param mattype: format of sequence alignment
    :param trfn: tree file
    :param schema_trf: format of tree file
    :param workdir: working directory
    :param ingroup_mrca: mrca of ingroup as OTT ID
    :return: ATT object
    """
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    # The otu dict must already have been written by an earlier step.
    assert os.path.exists(otu_jsonfi)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        # Resume: reload the previously pickled ATT object instead of rebuilding.
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        data_obj = pickle.load(
            open("{}/att_checkpoint.p".format(workdir), "rb"))
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           config_obj=conf,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=otu_jsonfi,
                                           ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        # Checkpoint the freshly built object so later runs take the reload branch.
        data_obj.dump()
    assert isinstance(data_obj, AlignTreeTax)
    return data_obj
def test():
    """With the default trim fraction trim() leaves lengths unchanged; with
    trim_perc = 0.5 it must shorten the alignment."""
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))
    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))
    data_obj = generate_ATT_from_files(seqaln=seqaln, mattype=mattype,
                                       workdir=workdir, config_obj=conf,
                                       treefile=treefile, schema_trf=schema_trf,
                                       otu_json=otu_jsonfi, ingroup_mrca=None)
    # Round 1: default settings — trimming should be a no-op on this input.
    for taxon, sequence in data_obj.aln.items():
        length_before = len(sequence)
    data_obj.trim()
    for taxon, sequence in data_obj.aln.items():
        length_after = len(sequence)
    assert length_before == length_after
    # Round 2: a much stricter fraction must actually cut columns.
    for taxon, sequence in data_obj.aln.items():
        length_before = len(sequence)
    data_obj.config.trim_perc = 0.5
    data_obj.trim()
    for taxon, sequence in data_obj.aln.items():
        length_after = len(sequence)
    assert length_before > length_after
def test_generate_ATT_from_file():
    """generate_ATT_from_files() should succeed on the canned input files and
    return an AlignTreeTax instance."""
    seqaln = "tests/data/input.fas"
    mattype = "fasta"
    workdir = "tests/fromfile"
    treefile = "tests/data/input.tre"
    otu_jsonfi = "tests/data/otu_dict.json"
    schema_trf = "newick"
    configfi = "tests/data/test.config"
    sys.stdout.write("\nTesting 'generate_ATT_from_files (fromfile.py)'\n")
    conf = ConfigObj(configfi, interactive=False)
    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi)
    # fix: the original ended with the bare expression `data_obj == True`, whose
    # result was discarded — it asserted nothing.  Assert the returned type
    # instead, matching test_owndata.
    assert isinstance(data_obj, AlignTreeTax)
def test_reconcile():  # ------------------------
    """Reconciliation of alignment vs. tree: a taxon present in only one of the
    two inputs must end up flagged "deleted in reconciliation" (or, when checked
    directly with dendropy, be the single symmetric-difference taxon)."""
    seqaln = "tests/data/tiny_test_example/test.fas"
    seqalnmiss = "tests/data/tiny_test_example/test_missingseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    treefilemiss = "tests/data/tiny_test_example/test_missingtip.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "example.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    # NOTE(review): the template has no "{}" placeholder, so .format(workdir) is
    # a no-op — possibly "{}/otu_dict.json".format(workdir) was intended; confirm
    # the literal path is the one meant before changing it.
    otu_jsonfi = "tests/data/tmp/owndata/otu_dict.json".format(workdir)
    conf = ConfigObj(configfi, interactive=False)
    # Case 1: the alignment lacks '2029_doronicum' while the tree has it.
    data_obj = generate_ATT_from_files(seqaln=seqalnmiss,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == '2029_doronicum':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"
    # ----------------------------------------------------
    # Case 2: the tree lacks 'S_scopolii' while the alignment has it.
    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefilemiss,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == 'S_scopolii':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"
    # ----------------------------------------------------
    # Re-derive case 1 directly with dendropy: the symmetric difference between
    # tree tips and alignment taxa should be exactly the one missing taxon.
    aln = DnaCharacterMatrix.get(path=seqalnmiss, schema=mattype)
    assert aln.taxon_namespace
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(
            " ", "_")  # Forcing all spaces to underscore UGH
    tre = Tree.get(path=treefile,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)
    # Sharing one TaxonNamespace object is what makes the set comparison valid.
    assert aln.taxon_namespace == tre.taxon_namespace
    assert aln.taxon_namespace is tre.taxon_namespace
    treed_taxa = set()
    for leaf in tre.leaf_nodes():
        treed_taxa.add(leaf.taxon)
    aln_tax = set()
    for tax, seq in aln.items():
        aln_tax.add(tax)
    prune = treed_taxa ^ aln_tax
    assert len(prune) == 1
    assert list(prune)[0].label == '2029_doronicum'
    # ----------------
    # Same direct check for case 2 (tip missing from the tree).
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    assert aln.taxon_namespace
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(
            " ", "_")  # Forcing all spaces to underscore UGH
    tre = Tree.get(path=treefilemiss,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)
    assert aln.taxon_namespace == tre.taxon_namespace
    assert aln.taxon_namespace is tre.taxon_namespace
    treed_taxa = set()
    for leaf in tre.leaf_nodes():
        treed_taxa.add(leaf.taxon)
    aln_tax = set()
    for tax, seq in aln.items():
        aln_tax.add(tax)
    prune = treed_taxa ^ aln_tax
    assert len(prune) == 1
    assert list(prune)[0].label == 'S_scopolii'
    # ----------------------------
    # Case 3: each input is missing one taxon; both must be reconciled away.
    seqaln = "tests/data/tiny_test_example/test.fas"
    seqalnmiss = "tests/data/tiny_test_example/test_missingseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    treefilemiss = "tests/data/tiny_test_example/test_missingtip.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "example.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "tests/data/tmp/owndata/otu_dict.json".format(workdir)
    data_obj = generate_ATT_from_files(seqaln=seqalnmiss,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefilemiss,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)
    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == '2029_doronicum':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"
    for otu in data_obj.otu_dict:
        if data_obj.otu_dict[otu][u'^ot:originalLabel'] == 'S_scopolii':
            assert data_obj.otu_dict[otu][
                '^physcraper:status'] == "deleted in reconciliation"
# Fixture generation: build a small "precooked" data object plus BLAST and
# accession maps for other tests to load.
# NOTE(review): this looks like the tail of a larger script — workdir, configfi,
# id_to_spn, seqaln, mattype, trfn, schema_trf and otu_jsonfi must be defined
# earlier; confirm against the full file.
os.makedirs("{}".format(workdir))
conf = ConfigObj(configfi, interactive=True)
ids = IdDicts(conf, workdir=workdir)
otu_json = OtuJsonDict(id_to_spn, ids)
with open(otu_jsonfi, "w") as outfile:
    json.dump(otu_json, outfile)
# Resolve the MRCA of all input taxa via the Open Tree of Life helpers.
ottids = [otu_json[ite]['^ot:ottId'] for ite in otu_json]
mrca = opentree_helpers.get_mrca_ott(ottids)
data_obj = generate_ATT_from_files(seqaln=seqaln,
                                   mattype=mattype,
                                   workdir=workdir,
                                   config_obj=conf,
                                   treefile=trfn,
                                   schema_trf=schema_trf,
                                   otu_json=otu_jsonfi,
                                   ingroup_mrca=mrca)
data_obj.prune_short()
# Persist the pruned data object for tests that reload it from disk.
data_obj.dump(filename="tests/data/precooked/tiny_dataobj.p")
scraper = PhyscraperScrape(data_obj, ids)
# Mark blasting as already done, then read pre-recorded blast results.
scraper._blasted = 1
scraper.read_blast_wrapper(
    blast_dir="tests/data/precooked/fixed/tte_blast_files")
pickle.dump(ids.acc_ncbi_dict, open("tests/data/precooked/tiny_acc_map.p", "wb"))
def run_with_settings(settings):
    """looks for pickeled file to continue run, or builds and runs new analysis
    for as long as new seqs are found. This uses the FilterBlast subclass to be
    able to filter the blast output.

    :param settings: a settings object carrying workdir, configfi, seqaln,
        mattype, trfn, schema_trf, spInfoDict, downtorank, threshold, selectby,
        delay, shared_blast_folder, add_unpubl_seq and id_to_spn_addseq_json.
    :return: the FilterBlast object after the last iteration.
    """
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(settings.workdir)):
        # Resume a previous run from its pickled scrape checkpoint.
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(
            open("{}/scrape_checkpoint.p".format(settings.workdir), "rb"))
        filteredScrape.repeat = 1
    else:
        conf = ConfigObj(settings.configfi)
        # print("config")
        debug(dir(conf))
        debug(conf.email)
        # Generate an linked Alignment-Tree-Taxa object
        # NOTE(review): unlike the other wrappers in this file, this call does
        # not pass config_obj=conf — confirm generate_ATT_from_files allows that.
        data_obj = generate_ATT_from_files(seqaln=settings.seqaln,
                                           mattype=settings.mattype,
                                           workdir=settings.workdir,
                                           treefile=settings.trfn,
                                           schema_trf=settings.schema_trf,
                                           otu_json=settings.spInfoDict,
                                           ingroup_mrca=None)
        # Prune sequences below a certain length threshold.  This is
        # particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        ids = IdDicts(conf, workdir=settings.workdir)
        filteredScrape = FilterBlast(data_obj, ids, settings)
        filteredScrape.add_setting_to_self(settings.downtorank, settings.threshold)
        filteredScrape.write_otu_info(settings.downtorank)
        if settings.add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.write_unpubl_blastdb(settings.add_unpubl_seq)
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.local_otu_json = settings.id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        # run the ananlyses
        if filteredScrape.unpublished is not True:
            filteredScrape.run_blast_wrapper(settings.delay)
            filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            if settings.threshold is not None:
                filteredScrape.sp_dict(settings.downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=settings.threshold,
                                                   selectby=settings.selectby)
                filteredScrape.replace_new_seq()
            debug("from replace to streamed aln")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
    # fix: was `while filteredScrape.repeat is 1:` — identity comparison with an
    # int literal relies on CPython small-int caching; use equality instead.
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        filteredScrape.run_blast_wrapper(settings.delay)
        filteredScrape.read_blast_wrapper(blast_dir=settings.shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        if settings.threshold is not None:
            filteredScrape.sp_dict(settings.downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=settings.threshold,
                                               selectby=settings.selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
    filteredScrape.write_otu_info(settings.downtorank)
    return filteredScrape
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    spInfoDict,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs new analysis for as long
    as new seqs are found. This uses the FilterBlast subclass to be able to filter the blast output.
    """
    debug("Debugging mode is on")
    # debug(shared_blast_folder)
    # debug(some)
    # if _DEBUG_MK == 1:
    #     random.seed(3269235691)
    print(workdir)
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        # Resume a previous run from its pickled scrape checkpoint.
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)
        # Generate an linked Alignment-Tree-Taxa object
        # NOTE(review): unlike load_own_data, no config_obj=conf is passed here —
        # confirm generate_ATT_from_files accepts that.
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=spInfoDict,
                                           ingroup_mrca=ingroup_mrca)
        # Prune sequnces below a certain length threshold
        # This is particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)
        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)
        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data")
            filteredScrape.unpublished = True
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            print("add unpubl otu json")
            filteredScrape.data.unpubl_otu_json = id_to_spn_addseq_json
            print(filteredScrape.data.unpubl_otu_json)
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        else:
            # run the analysis
            sys.stdout.write("BLASTing input sequences\n")
            # Point the scraper at a shared blast cache when one is supplied.
            if shared_blast_folder:
                filteredScrape.blast_subdir = shared_blast_folder
            else:
                shared_blast_folder = None
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            sys.stdout.write("Filter the sequences\n")
            if threshold is not None:
                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("Calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.data.write_otus("otu_info", schema="table")
            filteredScrape.write_otu_info(downtorank)
            filteredScrape.dump()
    # Keep iterating blast/filter/align cycles until no new sequences are found.
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        if shared_blast_folder:
            filteredScrape.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        # NOTE(review): hard-coded 0.75 here, while other call sites use the
        # configured threshold or no argument — confirm this is intentional.
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(downtorank)
        # print(some)
    # NOTE(review): write_otu_info also runs as the loop's final statement, so
    # this repeats it once more after the loop exits.
    filteredScrape.write_otu_info(downtorank)
    return filteredScrape
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.
    You need:
    seqaln = path to sequence alignment file
    mattype = the format name of you alignment
    trfn = path to file with the phylogeny to update
    schema_trf = format type of your phylogeny
    workdir = define where your analysis files shall be stored
    sp_info_jsonfi = a json file which has the otu_dict stored, which is generated by the OtuJsonDict function
                    (usually, just leave it like it is in the example scripts.).
    configfi = path to your config file
    ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                    can be obtained bu running: python scripts/get_ott.py ingroup_name
    shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                    give the path to the folder with the shared runs.
    """
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        # Resume a previous run from its pickled scrape checkpoint.
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           # fix: pass the config object, consistent
                                           # with load_own_data and the test wrappers
                                           config_obj=conf,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # Keep iterating blast/align cycles until no new sequences are found.
    while scraper.repeat == 1:
        # fix: point the scraper at the shared blast folder BEFORE blasting.  The
        # original assigned blast_subdir only after run_blast_wrapper had already
        # run, so on a checkpoint-resumed run the first blast of each cycle
        # ignored the shared folder.
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1