Example #1
import os
import pickle
import sys

from dendropy import DnaCharacterMatrix
from physcraper import generate_ATT_from_phylesystem, ConfigObj, IdDicts, PhyscraperScrape


def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi):
    if os.path.isfile("{}/scrape.p".format(workdir)): 
        sys.stdout.write("Readloading from pickled scrapefile")
        scraper = pickle.load(open("{}/scrape.p".format(workdir),'rb'))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc)
        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been de-concatenated,
        # as some are zero length, which causes problems.
        data_obj.prune_short()

        data_obj.write_files()
        data_obj.write_labelled()

        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        ids = IdDicts(conf, workdir=workdir)

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        scraper = PhyscraperScrape(data_obj, ids, conf)
        # run the analyses
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
Example #2
def test_0():
    if os.path.isfile("tests/data/precooked/otol_scraper.p"):
        conf = physcraper.ConfigObj(configfi, interactive=False)
        conf.unmapped = 'keep'
        data_obj = pickle.load(
            open("tests/data/precooked/otol_tiny_dataobj.p", 'rb'))
        data_obj.workdir = absworkdir
        ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
        ids.acc_ncbi_dict = pickle.load(
            open("tests/data/precooked/otol_tiny_gi_map.p", "rb"))
        scraper = pickle.load(open("tests/data/precooked/otol_scraper.p",
                                   "rb"))
        num_keep = len(scraper.data.aln.taxon_namespace)
    else:
        sys.stdout.write("\n\n No files present\n\n")
        conf = physcraper.ConfigObj(configfi)
        conf.unmapped = 'keep'
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        data_obj = physcraper.generate_ATT_from_phylesystem(
            aln=aln,
            workdir=workdir,
            study_id=study_id,
            tree_id=tree_id,
            phylesystem_loc=conf.phylesystem_loc)
        pickle.dump(data_obj,
                    open("tests/data/precooked/otol_tiny_dataobj.p", "wb"))
        ids = physcraper.IdDicts(conf, workdir=workdir)
        pickle.dump(ids.acc_ncbi_dict,
                    open("tests/data/precooked/otol_tiny_gi_map.p", "wb"))
        data_obj.write_files()
        scraper = physcraper.PhyscraperScrape(data_obj, ids)
        pickle.dump(scraper.config,
                    open("tests/data/precooked/otol_conf.p", "wb"))
        pickle.dump(scraper, open("tests/data/precooked/otol_scraper.p", "wb"))
        num_keep = len(scraper.data.aln.taxon_namespace)
Example #3
def test_generate_ATT_from_phylesystem():
    seqaln = "tests/data/input.fas"
    study_id = "pg_873"
    tree_id = "tree1679"
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree"
    configfi = "tests/data/remotencbi.config"

    sys.stdout.write("\nTesting 'generate_ATT_from_files (fromfile.py)'\n")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

    data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                        workdir=workdir,
                                                        config_obj=conf,
                                                        study_id=study_id,
                                                        tree_id=tree_id)

    assert data_obj is not None
Example #4
def test_opentree():
    # Use OpenTree phylesystem identifiers to get study and tree
    study_id = "pg_873"
    tree_id = "tree1679"
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree"
    configfi = "tests/data/remotencbi.config"

    sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
    conf = physcraper.ConfigObj(configfi, interactive=False)
    # print "1. {}".format(conf.email)

    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    data_obj = physcraper.generate_ATT_from_phylesystem(
        aln=aln,
        workdir=workdir,
        config_obj=conf,
        study_id=study_id,
        tree_id=tree_id,
        phylesystem_loc=conf.phylesystem_loc)
    assert isinstance(data_obj, AlignTreeTax)
Example #5
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype = "fasta"
workdir = "tests/output/opentree"
configfi = "tests/data/remotencbi.config"


sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
conf = physcraper.ConfigObj(configfi, interactive=False)
print "1. {}".format(conf.email)
      
    
aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                    workdir=workdir,
                                                    study_id=study_id,
                                                    tree_id=tree_id,
                                                    phylesystem_loc=conf.phylesystem_loc)



ids = physcraper.IdDicts(conf, workdir=workdir)


print "3. {}".format(ids.config.email)


data_obj.prune_short()
assert len(data_obj.aln) == 9
data_obj.write_files()
Example #6
aln = dataset.char_matrices[0]

# Write it out to file, so we have the 'before' alignment
aln.write(path="{}{}.aln".format(study_id, tree_id), schema="nexus")

# If we are using an alignment we already wrote to file earlier, we can load it like this:
#aln = dendropy.DnaCharacterMatrix.get(file=open("{}{}.aln".format(study_id, tree_id)), schema="nexus", taxon_namespace=tre.taxon_namespace)

tre.write(path="{}{}.tre".format(study_id, tree_id), schema="nexus")

# To preserve taxon labels and relationships,
# we will combine the alignment, tree, and taxon information into a single data object.
# By using the OpenTree Phylesystem API we can get the original taxon names as well as the taxon mappings.
data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                    workdir=workdir,
                                                    config_obj=conf,
                                                    study_id=study_id,
                                                    tree_id=tree_id)

#data_obj.write_files()
#json.dump(data_obj.otu_dict, open('{}/otu_dict.json'.format(workdir), 'wb'))

sys.stdout.write("{} taxa in alignement and tree\n".format(len(data_obj.aln)))

# We need to create a physcraper ids object to translate between NCBI and OpenTree identifiers.
ids = physcraper.IdDicts(conf, workdir=workdir)

# Create a 'scraper' object to get data from NCBI and align it
scraper = physcraper.PhyscraperScrape(data_obj, ids)

#scraper.read_blast_wrapper()
Example #7
from physcraper import (get_dataset_from_treebase, generate_ATT_from_phylesystem,
                        ConfigObj, IdDicts, PhyscraperScrape)
import pickle
import sys
import os

study_id = "pg_873"
tree_id = "tree1679"
configfi = "tests/data/remotencbi.config"

conf = ConfigObj(configfi)

dataset = get_dataset_from_treebase(study_id, phylesystem_loc='api')

aln = dataset.char_matrices[0]

data_obj = generate_ATT_from_phylesystem(aln=aln,
                                         workdir='tests/output/treebase',
                                         config_obj=conf,
                                         study_id=study_id,
                                         tree_id=tree_id)
Example #8
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype="fasta"



configfi = "tests/local.config"

conf = ConfigObj(configfi)

aln = dendropy.DnaCharacterMatrix.get(file=open(seqaln), schema=mattype)

data_obj = generate_ATT_from_phylesystem(aln,
                                         "tmp",
                                         study_id=study_id,
                                         tree_id=tree_id,
                                         phylesystem_loc=conf.phylesystem_loc)


data_obj.prune_short()
data_obj.write_files()
data_obj.write_labelled()


ids = IdDicts(conf, "tmp")

scraper = PhyscraperScrape(data_obj, ids, conf)
scraper.run_blast()
scraper.read_blast()
scraper.remove_identical_seqs()
Example #9
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """looks for a json file to continue run, or builds and runs
    new analysis for as long as new seqs are found

    This is the wrapper function to start a PhyScraper run with tree and alignment ids from Open Tree of Life.
    You need:
         seqaln = ID of alignment file
         mattype = the format name of you alignment
         trfn = Id of phylogeny to update
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life identifier of the clade of interest

         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    debug("Debugging mode is on")

    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
#        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # scraper.write_otu_info()

    return scraper
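
A minimal driver sketch for this wrapper, reusing the study, tree, and file identifiers from the examples above. The import path is an assumption, since the module that defines standard_run is not shown here.

# Hypothetical driver script: 'wrappers' is an assumed import path for
# the module defining standard_run above.
from wrappers import standard_run

scraper = standard_run(study_id="pg_873",
                       tree_id="tree1679",
                       seqaln="tests/data/minitest.fas",
                       mattype="fasta",
                       workdir="tests/output/opentree",
                       configfi="tests/data/remotencbi.config")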
Example #10
def filter_OTOL(study_id,
                tree_id,
                seqaln,
                workdir,
                configfi,
                threshold,
                selectby="blast",
                downtorank=None,
                blacklist=None,
                add_unpubl_seq=None,  # path to local seq
                id_to_spn_addseq_json=None,
                ingroup_mrca=None,
                shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output."""
    debug("Debugging mode is on")
    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        filteredScrape = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), 'rb'))
        filteredScrape.repeat = 1   
    else:   
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=True)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(seqaln,
                                                 workdir,
                                                 study_id,
                                                 tree_id,
                                                 phylesystem_loc='api',
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been de-concatenated,
        # as some are zero length, which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()

        ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        filteredScrape = FilterBlast(data_obj, ids)
        filteredScrape.add_setting_to_self(downtorank, threshold)

        filteredScrape.blacklist = blacklist
        if add_unpubl_seq is not None:
            filteredScrape.unpublished = True
        if filteredScrape.unpublished is True:  # use unpublished data
            sys.stdout.write("Blasting against local unpublished data\n")
            filteredScrape.write_unpubl_blastdb(add_unpubl_seq)
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.data.local_otu_json = id_to_spn_addseq_json
            filteredScrape.read_blast_wrapper()
            filteredScrape.remove_identical_seqs()
            filteredScrape.generate_streamed_alignment()
            filteredScrape.unpublished = False
        else:
            sys.stdout.write("BLASTing input sequences\n")
            filteredScrape.run_blast_wrapper(delay=14)
            filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
            filteredScrape.remove_identical_seqs()
            filteredScrape.dump()
            if threshold is not None:
                filteredScrape.sp_dict(downtorank)
                filteredScrape.make_sp_seq_dict()
                filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
                filteredScrape.replace_new_seq()
            sys.stdout.write("calculate the phylogeny\n")
            filteredScrape.generate_streamed_alignment()
            filteredScrape.dump()
    while filteredScrape.repeat == 1:
        filteredScrape.data.write_labelled(label="^ot:ottTaxonName", add_gb_id=True)
        filteredScrape.data.write_otus("otu_info", schema="table")
        sys.stdout.write("BLASTing input sequences\n")
        filteredScrape.run_blast_wrapper(delay=14)
        filteredScrape.read_blast_wrapper(blast_dir=shared_blast_folder)
        filteredScrape.remove_identical_seqs()
        sys.stdout.write("Filter the sequences\n")
        if threshold is not None:
            filteredScrape.sp_dict(downtorank)
            filteredScrape.make_sp_seq_dict()
            filteredScrape.how_many_sp_to_keep(threshold=threshold, selectby=selectby)
            filteredScrape.replace_new_seq()
        filteredScrape.data.prune_short(0.75)
        sys.stdout.write("calculate the phylogeny\n")
        filteredScrape.generate_streamed_alignment()
        filteredScrape.dump()
        filteredScrape.write_otu_info(downtorank)
    return filteredScrape
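
A minimal driver sketch for the filtering wrapper, again reusing identifiers from the examples above. The import path, the workdir, and the threshold value are assumptions for illustration; threshold is passed through to how_many_sp_to_keep, which caps how many sequences are retained per taxon.

# Hypothetical driver script: 'wrappers' is an assumed import path for
# the module defining filter_OTOL above; threshold=2 is an arbitrary
# illustrative value, and the workdir is an assumed output path.
from wrappers import filter_OTOL

filteredScrape = filter_OTOL(study_id="pg_873",
                             tree_id="tree1679",
                             seqaln="tests/data/minitest.fas",
                             workdir="tests/output/filter",
                             configfi="tests/data/remotencbi.config",
                             threshold=2,
                             selectby="blast")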