Example #1
def test_no_mrca():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    ingroup_mrca = None
    # setup the run
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    # print(ids.mrca_ott, ids.mrca_ncbi)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    filteredScrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    filteredScrape.threshold = 5
    assert filteredScrape.mrca_ncbi == 18794
    
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    assert len(filteredScrape.new_seqs_otu_id) in [23, 17]  # local vs remote searches return different numbers of seqs
Example #2
def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)

    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)

    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break

    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)

    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after
Example #3
def test_run_raxml():

    workdir = "tests/output/test_run_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    # scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    #	scraper.align_query_seqs()

    #	scraper.place_query_seqs()

    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
Example #4
def test_filter_length():

    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    threshold = 2
    selectby = "length"
    downtorank = "species"
    add_unpubl_seq = None
    blacklist = None

    id_to_spn_addseq_json = None
    ingroup_mrca = None
    shared_blast_folder = None

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()
    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)

    #    if threshold is not None:
    #        filteredScrape.filter_seqs()

    length_filtered = len(filteredScrape.new_seqs)
Example #5
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.threshold = threshold
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    filteredScrape.remove_identical_seqs()
    sp_d = filteredScrape.make_sp_dict(filteredScrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold
Example #6
def test_write_outputinfo():
    workdir = "tests/output/test_write_output_files"
    configfi = "tests/data/test.config"
    downtorank = None
    absworkdir = os.path.abspath(workdir)

    fn_otu = os.path.join(absworkdir, "otu_seq_info.csv")
    fn_sampling = os.path.join(absworkdir, "taxon_sampling.csv")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    # filteredScrape.sp_dict(downtorank)
    # filteredScrape.make_sp_seq_dict()

    filteredScrape.align_query_seqs()

    wrappers.write_out_files(filteredScrape, downtorank)

    with open(fn_otu) as fn:
        line = fn.readline()
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert type(line) == str
            assert len(line.split(",")) >= 2

    with open(fn_sampling) as fn:
        line = fn.readline()
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert type(line) == str
            assert len(line.split(",")) >= 2
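Example #7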
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir

    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    #scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    #print scraper.ncbi_mrca

    assert (len(scraper.new_seqs) == 0)
    assert (len(scraper.data.aln) == 5)
    assert len(scraper.new_seqs_otu_id) == 17
    # Now that we are pulling the full remote sequences, we don't have any identical sequences in the test.

    #TODO find an example where we do get identical sequences and need to discard them

    #    seqset = set()
    #    for otu in scraper.new_seqs_otu_id:
    #        seq = scraper.new_seqs_otu_id[otu]
    #        if seq in seqset:
    #            print otu
    #        seqset.add(seq)

    #check that every new sequence is unique in the new seqs set, and is not a substring of another sequence.
    ##    for otu in scraper.new_seqs_otu_id:
    #       qseq = scraper.new_seqs_otu_id[otu]
    #       count = 0
    #       for seq in seqset:
    #           if qseq in seq:
    #               count += 1
    #       assert count == 1

    ##    for taxon in scraper.data.tre.taxon_namespace:
    #       assert(taxon.label in scraper.data.otu_dict)
    #       status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
    #       assert(status in ('original', 'query'))

    aln_path1 = scraper.data.write_aln()
    aln_path = scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22
Example #8
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
Example #9
def test_internal_mpi():
    import pickle
    import sys
    import os
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    from mpi4py import MPI

    # set up until test
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()

    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()

    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
    # scraper.generate_streamed_alignment()
    if not os.path.exists("{}/previous_run".format(scraper.workdir)):
        os.mkdir("{}/previous_run".format(scraper.workdir))
    os.system(
        "mv {}/papara_alignment.extended  {}/previous_run/papara_alignment.extended"
        .format(scraper.workdir, scraper.workdir))

    cwd = os.getcwd()
    # os.chdir(scraper.workdir)

    ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
    nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
    print(nnodes, ntasks)
    env_var = int(nnodes) * int(ntasks)
    #env_var = os.environ.get('SLURM_JOB_CPUS_PER_NODE', 7)
    print(env_var)

    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(
        scraper.workdir))
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call([
            "mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2", "-m",
            "GTRCAT", "-s", "{}/previous_run/papara_alignment.extended".format(
                scraper.workdir), "-p", "1", "-f", "a", "-x", "1", "-#",
            "autoMRE", "-n", "all{}".format(scraper.date)
        ])
Example #10
def test_read_local_blast():
    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # print("prepare test")
    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > threshold:
            # print(taxonID)
            blast_seq = list(filteredScrape.sp_seq_d[taxonID].keys())[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_query(filteredScrape.workdir,
                                                taxonID,
                                                seq,
                                                fn=str(taxonID))
            # print(filteredScrape.sp_seq_d[taxonID].keys()[1:] )
            blast_db = [
                item for item in list(filteredScrape.sp_seq_d[taxonID].keys())[1:]
                if len(item.split(".")) >= 2
            ]
            # print(blast_db)
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_db(filteredScrape.workdir,
                                                 blast_key,
                                                 seq,
                                                 fn=str(taxonID))
            break

    # print(taxonID)

    blast_db = taxonID
    blast_seq = taxonID
    key = taxonID

    local_blast.run_filter_blast(filteredScrape.workdir, blast_seq, blast_db)
    local_blast.read_filter_blast(filteredScrape.workdir,
                                  filteredScrape.sp_seq_d[key], blast_db)

    blast_out = "{}/blast/output_{}_tobeblasted.xml".format(workdir, key)

    if os.path.exists(blast_out):
        with open(blast_out) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
Example #11
data_obj = generate_ATT_from_phylesystem(aln,
                                         "tmp",
                                         study_id=study_id,
                                         tree_id=tree_id,
                                         phylesystem_loc=conf.phylesystem_loc)


data_obj.prune_short()
data_obj.write_files()
data_obj.write_labelled()


ids = IdDicts(conf, "tmp")

scraper = PhyscraperScrape(data_obj, ids, conf)
scraper.run_blast()
scraper.read_blast()
scraper.remove_identical_seqs()
scraper.generate_streamed_alignment()

scraper.run_blast()
scraper.read_blast()
scraper.remove_identical_seqs()
scraper.generate_streamed_alignment()


#otu_json = "tests/minitest_otu.json"
#treefile = "tests/minitest.tre"
#info_obj2 = StudyInfo(seqaln,
#                     mattype,
Example #12
def PS_standard_run(data_obj, ids, shared_blast_folder):
    """
    This is the standard mode for a Physcraper run:
    update aln and tre as long as new seqs are found, no filtering.

    :param data_obj: ATT object
    :param ids: IdDict object
    :param shared_blast_folder: path to folder for shared blast runs
    :return: the updated PhyscraperScrape object (see the usage sketch after this function)
    """
    if os.path.isfile("{}/scrape_checkpoint.p".format(data_obj.workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: scrape\n")
        scraper = pickle.load(
            open("{}/scrape_checkpoint.p".format(data_obj.workdir), 'rb'))
        scraper.repeat = 1
    else:
        scraper = PhyscraperScrape(data_obj, ids)
        # run the analyses
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump("scrape_checkpoint.p")
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper()
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
        scraper.dump()
        write_out_files(scraper)
    writeinfofiles.get_additional_GB_info(scraper)
    return scraper
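
For reference, a minimal usage sketch of PS_standard_run as defined above. It reuses the precooked config and data object from the earlier test examples; the output folder name is a hypothetical placeholder, and the sketch assumes physcraper is importable in the current session.

import os
import pickle

from physcraper import ConfigObj, IdDicts

# Reuse the precooked test fixtures shown in the examples above.
workdir = "tests/output/ps_standard_run_demo"  # hypothetical output folder
conf = ConfigObj("tests/data/test.config", interactive=False)

data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", "rb"))
data_obj.workdir = os.path.abspath(workdir)
ids = IdDicts(conf, workdir=data_obj.workdir)

# No shared BLAST folder here; each run keeps its own blast_subdir.
scraper = PS_standard_run(data_obj, ids, shared_blast_folder=None)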
Example #13
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi):
    if os.path.isfile("{}/scrape.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile\n")
        scraper = pickle.load(open("{}/scrape.p".format(workdir), 'rb'))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc)

        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been
        # de-concatenated, as some are 0 length, which causes problems.
        data_obj.prune_short()

        data_obj.write_files()
        data_obj.write_labelled()

        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        ids = IdDicts(conf, workdir=workdir)

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        scraper = PhyscraperScrape(data_obj, ids, conf)
        # run the analyses
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
Example #14
def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1

    #############################

    id_seq = [
        "TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
    ]

    # print("start test")
    tmp_dict = dict(
        (taxon.label, filteredScrape.data.aln[taxon].symbols_as_string())
        for taxon in filteredScrape.data.aln)
    old_seqs = tmp_dict.keys()
    avg_seqlen = sum(filteredScrape.data.orig_seqlen) / len(
        filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen * filteredScrape.config.seq_len_perc
    count = 1

    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id][
                '^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id][
                '^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 101010101  # dummy taxon id; leading-zero int literals are invalid in Python 3
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id][
                '^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id][
                '^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id][
                '^physcraper:last_blasted'] = None
            filteredScrape.del_superseq = set()
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write(
        "TODO: add a check that newly added seqs are checked; they are, but there is no test\n"
    )
Example #15
def test_blacklist():

    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))

    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    noblackScrape = PhyscraperScrape(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(noblackScrape)

    # one run with blacklist

    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(filteredScrape)


    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
Example #16
ids = IdDicts(conf, workdir=workdir)

otu_json = OtuJsonDict(id_to_spn, ids)
with open(otu_jsonfi, "w") as outfile:
    json.dump(otu_json, outfile)

ottids = [otu_json[ite]['^ot:ottId'] for ite in otu_json]
mrca = opentree_helpers.get_mrca_ott(ottids)

data_obj = generate_ATT_from_files(seqaln=seqaln,
                                   mattype=mattype,
                                   workdir=workdir,
                                   config_obj=conf,
                                   treefile=trfn,
                                   schema_trf=schema_trf,
                                   otu_json=otu_jsonfi,
                                   ingroup_mrca=mrca)

data_obj.prune_short()
data_obj.dump(filename="tests/data/precooked/tiny_dataobj.p")

scraper = PhyscraperScrape(data_obj, ids)

scraper._blasted = 1
scraper.read_blast_wrapper(
    blast_dir="tests/data/precooked/fixed/tte_blast_files")

pickle.dump(ids.acc_ncbi_dict, open("tests/data/precooked/tiny_acc_map.p",
                                    "wb"))
# pickle.dump(scraper.acc_list_mrca, open("tests/data/precooked/acc_list_mrca.p", "wb"))
Example #17
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.config.blast_loc = 'remote'
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False

    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    #scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    a = len(scraper.new_seqs) == 40
    b = len(scraper.data.aln) == 5
    c = len(scraper.new_seqs_otu_id) == 0

    scraper.remove_identical_seqs()

    d = len(scraper.new_seqs) == 40
    e = len(scraper.data.aln) == 5
    f = len(scraper.new_seqs_otu_id) == 38
    g = 1
    for taxon in scraper.data.tre.taxon_namespace:
        h = taxon.label in scraper.data.otu_dict
        g = g * h
        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
        i = status in ('original', 'query')
        g = g * i

    # Second test checks that seq_len_perc affects the results
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p",
                                'rb'))  # reload because the data object is mutable
    data_obj.workdir = absworkdir
    scraper2 = PhyscraperScrape(data_obj, ids)
    scraper2.config.blast_loc = 'remote'
    scraper2.ids.otu_rank = {}

    scraper2.config.gifilename = False
    j = len(scraper2.data.aln) == 5
    # scraper2.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper2.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scraper2.config.seq_len_perc = 0.998  # Change seq len percentage from default of 75%

    k = len(scraper2.new_seqs) == 40
    l = len(scraper2.new_seqs_otu_id) == 0

    scraper2.remove_identical_seqs()
    # print(scraper2.data.otu_dict)
    # print(len(scraper.new_seqs_otu_id), 38)
    # print(len(scraper2.new_seqs_otu_id), 36)
    m = len(scraper2.new_seqs_otu_id) == 36
    count = 0
    assert a * b * c * d * e * f * g * h * i * j * k * l * m == True
Example #18
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 sp_info_jsonfi,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of you alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         sp_info_jsonfi = a json file which has the otu_dict stored, which is generated by the OtuJsonDict function
                            (usually, just leave it like it is in the example scripts.).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                        can be obtained bu running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """

    debug("Debugging mode is on")

    if os.path.isfile("{}/scrape_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading from pickled scrapefile: ATT\n")
        scraper = pickle.load(open("{}/scrape_checkpoint.p".format(workdir), "rb"))
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_files(seqaln=seqaln,
                                           mattype=mattype,
                                           workdir=workdir,
                                           treefile=trfn,
                                           schema_trf=schema_trf,
                                           otu_json=sp_info_jsonfi,
                                           ingroup_mrca=ingroup_mrca)

        # Prune sequences below a certain length threshold
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        scraper = PhyscraperScrape(data_obj, ids)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        # run the analyses
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast_wrapper(delay=14)
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    return 1
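
A minimal usage sketch of own_data_run as defined above, reusing the tiny test example paths from Example #1. The workdir and the OTU json path are hypothetical placeholders; the json can be produced with OtuJsonDict as in Example #16.

own_data_run(seqaln="tests/data/tiny_test_example/test.fas",
             mattype="fasta",
             trfn="tests/data/tiny_test_example/test.tre",
             schema_trf="newick",
             workdir="tests/output/own_data_run_demo",                        # hypothetical output folder
             sp_info_jsonfi="tests/output/own_data_run_demo/otu_dict.json",   # assumed OtuJsonDict output
             configfi="tests/data/test.config",
             ingroup_mrca=None,
             shared_blast_folder=None)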
Example #19
def test_sp_seq_d():

    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]

    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for key in sp_d:
        v = sp_d[key]
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(
                        ' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])
    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
Example #20
def test_add_local():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist


    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)

        # filteredScrape.make_otu_dict_entry_unpubl()
        filteredScrape.run_blast_wrapper()
        filteredScrape.read_blast_wrapper()
        filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True
Example #21
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """looks for a json file to continue run, or builds and runs
    new analysis for as long as new seqs are found

    This is the wrapper function to start a PhyScraper run with tree and alignment ids from Open Tree of Life.
    You need:
         seqaln = ID of alignment file
         mattype = the format name of you alignment
         trfn = Id of phylogeny to update
         workdir = define where your analysis files shall be stored
         configfi = path to your config file
         ingroup_mrca = define the mrca, by supplying the Open Tree of Life identifier of the clade of interest

         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    debug("Debugging mode is on")

    conf = ConfigObj(configfi, interactive=False)
    if os.path.isfile("{}/att_checkpoint.p".format(workdir)):
        sys.stdout.write("Reloading data object from pickle file\n")
        data_obj = pickle.load(open("{}/att_checkpoint.p".format(workdir), "rb"))
#        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi, interactive=False)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        # ids = IdDicts(conf, workdir="example")
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        ids = pickle.load(open(conf.id_pickle, "rb"))
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()
    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    scraper = PhyscraperScrape(data_obj, ids)
    # run the analyses
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        if shared_blast_folder:
            scraper.blast_subdir = shared_blast_folder
        else:
            shared_blast_folder = None
        scraper.run_blast_wrapper(delay=14)
        scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    # scraper.write_otu_info()

    return scraper
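
A minimal usage sketch of standard_run as defined above. The study id, tree id, and alignment path are hypothetical placeholders for a real Open Tree of Life study and a matching alignment.

scraper = standard_run(study_id="pg_873",                    # placeholder Open Tree study id
                       tree_id="tree1679",                   # placeholder tree id
                       seqaln="tests/data/minitest.fas",     # hypothetical alignment path
                       mattype="fasta",
                       workdir="tests/output/standard_run_demo",
                       configfi="tests/data/test.config",
                       ingroup_mrca=None,
                       shared_blast_folder=None)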