Example 1
def test_run_raxml():

    workdir = "tests/output/test_run_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    # scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    # scraper.align_query_seqs()
    # scraper.place_query_seqs()

    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
Example 2
def test_no_mrca():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    ingroup_mrca = None
    # setup the run
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    # print(ids.mrca_ott, ids.mrca_ncbi)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    filteredScrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    filteredScrape.threshold = 5
    assert filteredScrape.mrca_ncbi == 18794
    
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    assert len(filteredScrape.new_seqs_otu_id) in [23, 17]  # local vs remote searches return different numbers of seqs!
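Several of the examples that follow reference module-level fixtures (conf, workdir, configfi, absworkdir, downtorank, threshold) that the original test modules define once at the top of the file. A minimal sketch of that shared setup, assuming paths like those used above; the exact values here are assumptions, not taken from the examples:

import os
import pickle
from physcraper import ConfigObj, IdDicts, PhyscraperScrape

workdir = "tests/output/example_run"  # hypothetical output directory
configfi = "tests/data/test.config"
absworkdir = os.path.abspath(workdir)
conf = ConfigObj(configfi, interactive=False)
downtorank = None  # filter at the species level, as in the examples below
threshold = 2      # assumed cap on sequences kept per taxon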
Example 3
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.config.blast_loc = 'remote'
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False

    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    #scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    a = len(scraper.new_seqs) == 40
    b = len(scraper.data.aln) == 5
    c = len(scraper.new_seqs_otu_id) == 0

    scraper.remove_identical_seqs()

    d = len(scraper.new_seqs) == 40
    e = len(scraper.data.aln) == 5
    f = len(scraper.new_seqs_otu_id) == 38
    g = 1
    for taxon in scraper.data.tre.taxon_namespace:
        h = taxon.label in scraper.data.otu_dict
        g = g * h
        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
        i = status in ('original', 'query')
        g = g * i

    # Second test checks that the seq_len_perc setting affects the results
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p",
                                'rb'))  #reload bc data object is mutable
    data_obj.workdir = absworkdir
    scraper2 = PhyscraperScrape(data_obj, ids)
    scraper2.config.blast_loc = 'remote'
    scraper2.ids.otu_rank = {}

    scraper2.config.gifilename = False
    j = len(scraper2.data.aln) == 5
    # scraper2.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper2.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scraper2.config.seq_len_perc = 0.998  # Change seq len percentage from default of 75%

    k = len(scraper2.new_seqs) == 40
    l = len(scraper2.new_seqs_otu_id) == 0

    scraper2.remove_identical_seqs()
    # print(scraper2.data.otu_dict)
    # print(len(scraper.new_seqs_otu_id), 38)
    # print(len(scraper2.new_seqs_otu_id), 36)
    m = len(scraper2.new_seqs_otu_id) == 36
    assert all([a, b, c, d, e, f, g, h, i, j, k, l, m])
Example 4
def test_sp_seq_d():

    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]

    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for v in sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                if v2['^physcraper:status'].split(
                        ' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])
    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
Example 5
def test_read_local_blast():
    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # print("prepare test")
    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > threshold:
            # print(taxonID)
            blast_seq = list(filteredScrape.sp_seq_d[taxonID].keys())[0]  # list() for Py3, where dict views are not indexable
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_query(filteredScrape.workdir,
                                                taxonID,
                                                seq,
                                                fn=str(taxonID))
            # print(filteredScrape.sp_seq_d[taxonID].keys()[1:])
            blast_db = [
                item for item in list(filteredScrape.sp_seq_d[taxonID].keys())[1:]
                if len(item.split(".")) >= 2  # keep GenBank-style keys such as "KX494441.1"
            ]
            # print(blast_db)
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_db(filteredScrape.workdir,
                                                 blast_key,
                                                 seq,
                                                 fn=str(taxonID))
            break

    # print(taxonID)

    # reuse the last taxonID from the loop above as query, database, and dict key
    blast_db = taxonID
    blast_seq = taxonID
    key = taxonID

    local_blast.run_filter_blast(filteredScrape.workdir, blast_seq, blast_db)
    local_blast.read_filter_blast(filteredScrape.workdir,
                                  filteredScrape.sp_seq_d[key], blast_db)

    blast_out = "{}/blast/output_{}_tobeblasted.xml".format(workdir, key)

    if os.path.exists(blast_out):
        with open(blast_out) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
Example 6
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir

    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    #scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    # print(scraper.ncbi_mrca)

    assert len(scraper.new_seqs) == 0
    assert len(scraper.data.aln) == 5
    assert len(scraper.new_seqs_otu_id) == 17
    # Now that we are pulling the full remote sequences, we don't have any identical sequences in the test.

    #TODO find an example where we do get identical sequences and need to discard them

    #    seqset = set()
    #    for otu in scraper.new_seqs_otu_id:
    #        seq = scraper.new_seqs_otu_id[otu]
    #        if seq in seqset:
    #            print(otu)
    #        seqset.add(seq)

    # Check that every new sequence is unique in the new seqs set, and is not a substring of another sequence.
    #    for otu in scraper.new_seqs_otu_id:
    #        qseq = scraper.new_seqs_otu_id[otu]
    #        count = 0
    #        for seq in seqset:
    #            if qseq in seq:
    #                count += 1
    #        assert count == 1

    #    for taxon in scraper.data.tre.taxon_namespace:
    #        assert taxon.label in scraper.data.otu_dict
    #        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
    #        assert status in ('original', 'query')

    aln_path1 = scraper.data.write_aln()
    aln_path = scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22
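The commented-out uniqueness check above can be made runnable; a sketch, assuming (as the commented code implies) that scraper.new_seqs_otu_id maps each OTU label to its sequence string:

# Every new sequence should be unique and not a substring of another new sequence.
seqset = set(scraper.new_seqs_otu_id.values())
for otu, qseq in scraper.new_seqs_otu_id.items():
    # qseq matches itself exactly once; any further hit is a duplicate or a superstring
    count = sum(1 for seq in seqset if qseq in seq)
    assert count == 1, "{} duplicates or is contained in another sequence".format(otu)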
Example 7
def test_write_outputinfo():
    workdir = "tests/output/test_write_output_files"
    configfi = "tests/data/test.config"
    downtorank = None
    absworkdir = os.path.abspath(workdir)

    fn_otu = os.path.join(absworkdir, "otu_seq_info.csv")
    fn_sampling = os.path.join(absworkdir, "taxon_sampling.csv")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    # filteredScrape.sp_dict(downtorank)
    # filteredScrape.make_sp_seq_dict()

    filteredScrape.align_query_seqs()

    wrappers.write_out_files(filteredScrape, downtorank)

    with open(fn_otu) as fn:
        line = fn.readline()
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert isinstance(line, str)
            assert len(line.split(",")) >= 2

    with open(fn_sampling) as fn:
        line = fn.readline()
        cnt = 1
        while cnt <= 5:
            line = fn.readline()
            cnt += 1
            assert isinstance(line, str)
            assert len(line.split(",")) >= 2
Example 8
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
Example 9
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.threshold = threshold
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    filteredScrape.remove_identical_seqs()
    sp_d = filteredScrape.make_sp_dict(filteredScrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold
Example 10
ids = IdDicts(conf, workdir=workdir)

otu_json = OtuJsonDict(id_to_spn, ids)
with open(otu_jsonfi, "w") as outfile:
    json.dump(otu_json, outfile)

ottids = [otu_json[ite]['^ot:ottId'] for ite in otu_json]
mrca = opentree_helpers.get_mrca_ott(ottids)

data_obj = generate_ATT_from_files(seqaln=seqaln,
                                   mattype=mattype,
                                   workdir=workdir,
                                   config_obj=conf,
                                   treefile=trfn,
                                   schema_trf=schema_trf,
                                   otu_json=otu_jsonfi,
                                   ingroup_mrca=mrca)

data_obj.prune_short()
data_obj.dump(filename="tests/data/precooked/tiny_dataobj.p")

scraper = PhyscraperScrape(data_obj, ids)

scraper._blasted = 1
scraper.read_blast_wrapper(
    blast_dir="tests/data/precooked/fixed/tte_blast_files")

pickle.dump(ids.acc_ncbi_dict, open("tests/data/precooked/tiny_acc_map.p",
                                    "wb"))
# pickle.dump(scraper.acc_list_mrca, open("tests/data/precooked/acc_list_mrca.p", "wb"))
Example 11
def test_internal_mpi():
    import pickle
    import sys
    import os
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    from mpi4py import MPI

    # set up until test
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()

    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()

    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
    # scraper.generate_streamed_alignment()
    if not os.path.exists("{}/previous_run".format(scraper.workdir)):
        os.mkdir("{}/previous_run".format(scraper.workdir))
    os.system(
        "mv {}/papara_alignment.extended  {}/previous_run/papara_alignment.extended"
        .format(scraper.workdir, scraper.workdir))

    cwd = os.getcwd()
    # os.chdir(scraper.workdir)

    ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
    nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
    print(nnodes, ntasks)
    env_var = int(nnodes) * int(ntasks)  # assumes the test runs inside a SLURM job
    # env_var = os.environ.get('SLURM_JOB_CPUS_PER_NODE', 7)
    print(env_var)

    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(
        scraper.workdir))
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call([
            "mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2", "-m",
            "GTRCAT", "-s", "{}/previous_run/papara_alignment.extended".format(
                scraper.workdir), "-p", "1", "-f", "a", "-x", "1", "-#",
            "autoMRE", "-n", "all{}".format(scraper.date)
        ])
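The int(nnodes) * int(ntasks) line above raises TypeError when the test runs outside a SLURM allocation, because both environment variables are then unset. A defensive variant; the fallback counts are illustrative assumptions, in the spirit of the commented-out default of 7:

# Fall back to a small local run when the SLURM variables are absent.
ntasks = int(os.environ.get('SLURM_NTASKS_PER_NODE', 2))
nnodes = int(os.environ.get('SLURM_JOB_NUM_NODES', 1))
env_var = nnodes * ntasks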
Example 12
def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1

    #############################

    id_seq = [
        "TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
        "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
    ]

    # print("start test")
    tmp_dict = dict(
        (taxon.label, filteredScrape.data.aln[taxon].symbols_as_string())
        for taxon in filteredScrape.data.aln)
    old_seqs = list(tmp_dict.keys())  # snapshot, since tmp_dict is modified below
    avg_seqlen = sum(filteredScrape.data.orig_seqlen) / len(
        filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen * filteredScrape.config.seq_len_perc
    count = 1

    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id][
                '^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id][
                '^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 101010101  # placeholder id; the original 0101010101 is a Py2 octal literal and a SyntaxError in Py3
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id][
                '^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id][
                '^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id][
                '^physcraper:last_blasted'] = None
            filteredScrape.del_superseq = set()
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1  # seqs 1 and 2 are identical once gaps are removed, and 3 and 4 are identical superseqs of them, so one new sequence survives
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write(
        "TODO: add a check that the newly added seqs are vetted (they are, but there is no test)\n"
    )
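The length filter above admits a candidate only when its ungapped, unambiguous length exceeds avg_seqlen * seq_len_perc. A worked example with assumed numbers (a 600 bp average and the default seq_len_perc of 0.75 mentioned earlier):

avg_seqlen = 600  # assumed average length of the original sequences
seq_len_cutoff = avg_seqlen * 0.75  # 450.0
candidate = "A" * 500  # 500 bp, no gaps or Ns
assert len(candidate.replace("-", "").replace("N", "")) > seq_len_cutoff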
Example 13
def test_blacklist():

    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))

    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    noblackScrape = PhyscraperScrape(data_obj, ids)
    noblackScrape._blasted = 1
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(noblackScrape)

    # one run with blacklist

    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if os.path.isfile(full_file_name):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    new_test_generate_streamed_aln(filteredScrape)


    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
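gi_l_2 is collected above but never checked; a natural complement, assuming the blacklisted accession was among the BLAST hits of the unfiltered run, would be:

for item in blacklist:
    assert item in gi_l_2  # present when no blacklist is applied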