Example #1
import os
import pickle
import sys

# Assumed imports: these names come from the physcraper package; write_out_files
# is a helper defined alongside this wrapper in its module.
from physcraper import PhyscraperScrape, writeinfofiles


def PS_standard_run(data_obj, ids, shared_blast_folder):
    """
    This is the standard mode for a Physcraper run:
    update the alignment and tree as long as new sequences are found; no filtering.

    :param data_obj: ATT object
    :param ids: IdDicts object
    :param shared_blast_folder: path to folder for shared blast runs
    :return: the PhyscraperScrape object after the run
    """
    if os.path.isfile("{}/scrape_checkpoint.p".format(data_obj.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        scraper = pickle.load(
            open("{}/scrape_checkpoint.p".format(data_obj.workdir), 'rb'))
        scraper.repeat = 1  # force at least one more update cycle after reloading
    else:
        scraper = PhyscraperScrape(data_obj, ids)
        # run the analyses
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump("scrape_checkpoint.p")
    while scraper.repeat == 1:  # repeat stays 1 as long as the last cycle found new sequences
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump()
        write_out_files(scraper)
    writeinfofiles.get_additional_GB_info(scraper)
    return scraper
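
A minimal usage sketch, assuming an ATT object was pickled in an earlier run; the config file and checkpoint paths here are hypothetical:

import pickle

from physcraper import ConfigObj, IdDicts

# hypothetical paths; substitute your own config file and pickled ATT object
conf = ConfigObj("example.config", interactive=False)
data_obj = pickle.load(open("runs/my_study/att_checkpoint.p", "rb"))
ids = IdDicts(conf, workdir=data_obj.workdir)

scraper = PS_standard_run(data_obj, ids, shared_blast_folder=None)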
Example #2
import os
import pickle
import sys

from physcraper import ConfigObj, IdDicts, PhyscraperScrape


def test_filter_length():

    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    # Settings for the filtering step (the filter call itself is disabled below).
    threshold = 2
    selectby = "length"
    downtorank = "species"
    add_unpubl_seq = None
    blacklist = None

    id_to_spn_addseq_json = None
    ingroup_mrca = None
    shared_blast_folder = None

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # The BLAST step is skipped; precooked results are read in below instead.
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()
    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)

    #    if threshold is not None:
    #        filteredScrape.filter_seqs()

    length_filtered = len(filteredScrape.new_seqs)
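
The same precooked fixtures can be loaded interactively to inspect the scrape object outside the test; a sketch that uses only calls already appearing in the test above:

import os
import pickle

from physcraper import ConfigObj, IdDicts, PhyscraperScrape

conf = ConfigObj("tests/data/test.config", interactive=False)
data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", "rb"))
data_obj.workdir = os.path.abspath("tests/output/test_selectbylength")
ids = IdDicts(conf, workdir=data_obj.workdir)
ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

scrape = PhyscraperScrape(data_obj, ids)
scrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
scrape.remove_identical_seqs()
print(len(scrape.new_seqs_otu_id))  # candidate sequences after deduplication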
Example #3
import os
import pickle
import sys

from dendropy import DnaCharacterMatrix

# Assumed imports: these names are exposed by the physcraper package.
from physcraper import (ConfigObj, IdDicts, PhyscraperScrape, debug,
                        generate_ATT_from_phylesystem)


def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """looks for a json file to continue run, or builds and runs
    new analysis for as long as new seqs are found

    This is the wrapper function to start a PhyScraper run with tree and alignment ids from Open Tree of Life.
    You need:
         seqaln = ID of alignment file
         mattype = the format name of you alignment
         trfn = Id of phylogeny to update
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life identifier of the clade of interest

         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    debug("Debugging mode is on")

    # read the config file into a configuration object
    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold. This is particularly
        # important when using loci that have been de-concatenated, as some
        # sequences are 0 length, which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()

    return scraper
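
A hypothetical invocation; the study and tree ids below are placeholders, not a real Open Tree study:

scraper = standard_run(study_id="ot_001",    # placeholder Open Tree study id
                       tree_id="tree1",      # placeholder tree id within that study
                       seqaln="inputs/my_alignment.fas",
                       mattype="fasta",
                       workdir="runs/ot_001",
                       configfi="example.config",
                       ingroup_mrca=None,
                       shared_blast_folder=None)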
Example #4
import os
import pickle
import sys

# Assumed imports: these names are exposed by the physcraper package.
from physcraper import (ConfigObj, IdDicts, PhyscraperScrape, debug,
                        generate_ATT_from_files)


def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of you alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         sp_info_jsonfi = a json file which has the otu_dict stored, which is generated by the OtuJsonDict function
                            (usually, just leave it like it is in the example scripts.).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                        can be obtained bu running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """

    debug("Debugging mode is on")

    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        # set the shared blast folder before running blast, as in the other wrappers
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1
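
A hypothetical invocation with local files; all paths are placeholders, and the otu json is assumed to have been written beforehand (e.g. with the OtuJsonDict function mentioned in the docstring):

own_data_run(seqaln="inputs/my_alignment.fas",
             mattype="fasta",
             trfn="inputs/my_tree.tre",
             schema_trf="newick",
             workdir="runs/my_own_data",
             sp_info_jsonfi="inputs/otu_info.json",
             configfi="example.config",
             ingroup_mrca=None,
             shared_blast_folder=None)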