Example #1
def test_run_raxml():

    workdir = "tests/output/test_run_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
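    # _blasted = 1 flags the BLAST step as already done, so read_blast_wrapper below
    # reads the precooked results instead of re-running BLAST.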
    scraper._blasted = 1

    # run needed functions
    # scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    # scraper.align_query_seqs()
    # scraper.place_query_seqs()

    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
Example #2
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.config.blast_loc = 'remote'
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False

    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    #scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    a = len(scraper.new_seqs) == 40
    b = len(scraper.data.aln) == 5
    c = len(scraper.new_seqs_otu_id) == 0

    scraper.remove_identical_seqs()

    d = len(scraper.new_seqs) == 40
    e = len(scraper.data.aln) == 5
    f = len(scraper.new_seqs_otu_id) == 38
    g = 1
    for taxon in scraper.data.tre.taxon_namespace:
        h = taxon.label in scraper.data.otu_dict
        g = g * h
        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
        i = status in ('original', 'query')
        g = g * i

    # Second test: check that the seq length percentage cutoff affects the results
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p",
                                'rb'))  #reload bc data object is mutable
    data_obj.workdir = absworkdir
    scraper2 = PhyscraperScrape(data_obj, ids)
    scraper2.config.blast_loc = 'remote'
    scraper2.ids.otu_rank = {}

    scraper2.config.gifilename = False
    j = len(scraper2.data.aln) == 5
    # scraper2.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper2.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    scraper2.config.seq_len_perc = 0.998  # raise the length cutoff from the 75% default to 99.8%

    k = len(scraper2.new_seqs) == 40
    l = len(scraper2.new_seqs_otu_id) == 0

    scraper2.remove_identical_seqs()
    # print(scraper2.data.otu_dict)
    # print(len(scraper.new_seqs_otu_id), 38)
    # print(len(scraper2.new_seqs_otu_id), 36)
    m = len(scraper2.new_seqs_otu_id) == 36
    assert all([a, b, c, d, e, f, g, h, i, j, k, l, m])
Example #3
def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)

    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)

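    # Remove a single taxon (the first one in the namespace), so all the before/after sizes must differ.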
    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break

    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)

    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after
Example #4
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) <= treshold:
            filteredScrape.add_all(key)
    treshold_undermin = 0
    for key in filteredScrape.sp_d:
        for key2 in filteredScrape.sp_d[key]:
            if len(filteredScrape.sp_d[key]) <= treshold:
                if '^physcraper:status' in key2:
                    if key2['^physcraper:status'].split(
                            ' ')[0] not in filteredScrape.seq_filter:
                        if key2['^physcraper:last_blasted'] == '1800/01/01':
                            treshold_undermin += 1
    add_all_thresholdmin = filteredScrape.filtered_seq
    assert treshold_undermin == len(add_all_thresholdmin)
Example #5
def test_sp_seq_d():

    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]

    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for key in sp_d:
        v = sp_d[key]
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(
                        ' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])
    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
Example #6
def test_add_local():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    sys.stdout.write("\ntest addLocal\n")

    # Prune sequences below a certain length threshold
    data_obj.prune_short()
    data_obj.write_files()
    data_obj.write_labelled(label='^ot:ottTaxonName', add_gb_id=True)
    data_obj.write_otus("otu_info", schema='table')
    data_obj.dump()

    sys.stdout.write("setting up id dictionaries\n")
    sys.stdout.flush()

    ids = IdDicts(conf, workdir=workdir)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist

    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)

        # filteredScrape.make_otu_dict_entry_unpubl()
        filteredScrape.run_blast_wrapper()
        filteredScrape.read_blast_wrapper()
        filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key][
                    '^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True
Example #7
def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1

    #############################

    id_seq = ["TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
                ]

    # print("start test")
    tmp_dict = dict((taxon.label, filteredScrape.data.aln[taxon].symbols_as_string()) for taxon in filteredScrape.data.aln)
    old_seqs = tmp_dict.keys()
    avg_seqlen = sum(filteredScrape.data.orig_seqlen)/len(filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen*filteredScrape.config.seq_len_perc
    count = 1

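    # The four candidate sequences above are near-duplicates (a gapped copy, its ungapped
    # twin, and two identical longer variants); seq_dict_build should keep exactly one,
    # hence expected_add = 1 below.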
    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id]['^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 101010101  # dummy taxon id; the original leading-zero literal is a syntax error in Python 3
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id]['^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id]['^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id]['^physcraper:last_blasted'] = "1800/01/01"
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write("TODO: add a check that newly added seqs are screened (they are, but there is no test)\n")
Example #8
def test_remove_identical_seqs():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir

    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # print("start")
    scraper = PhyscraperScrape(data_obj, ids)
    scraper.ids.otu_rank = {}
    scraper.config.gifilename = False
    scraper._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    #scraper.gi_list_mrca = pickle.load(open("tests/data/precooked/gi_list_mrca.p", 'rb'))
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    # print(scraper.ncbi_mrca)

    assert (len(scraper.new_seqs) == 0)
    assert (len(scraper.data.aln) == 5)
    assert len(scraper.new_seqs_otu_id) == 17
    # Now that we are pulling the full remote sequences, we don't have any identical sequences in the test.

    #TODO find an example where we do get identical sequences and need to discard them

    #    seqset = set()
    #    for otu in scraper.new_seqs_otu_id:
    #        seq = scraper.new_seqs_otu_id[otu]
    #        if seq in seqset:
    #            print(otu)
    #        seqset.add(seq)

    # Check that every new sequence is unique in the new seqs set and is not a substring of another sequence.
    #    for otu in scraper.new_seqs_otu_id:
    #        qseq = scraper.new_seqs_otu_id[otu]
    #        count = 0
    #        for seq in seqset:
    #            if qseq in seq:
    #                count += 1
    #        assert count == 1

    #    for taxon in scraper.data.tre.taxon_namespace:
    #        assert taxon.label in scraper.data.otu_dict
    #        status = scraper.data.otu_dict[taxon.label].get(u'^physcraper:status')
    #        assert status in ('original', 'query')

    aln_path1 = scraper.data.write_aln()
    aln_path = scraper.write_all_unaligned('test.fas')
    scraper.align_query_seqs()
    assert len(scraper.data.aln) == 22
Example #9
def test_id_dicts():
    conf = ConfigObj(configfi, interactive=True)
    ids = IdDicts(conf, workdir=workdir)
    selection = random.sample(list(ids.ott_to_ncbi.keys()), 10)  # list() so sampling works on Python 3 dict views
    for ott_id in selection:
        ncbi_id = ids.ott_to_ncbi[ott_id]
        assert ids.ncbi_to_ott[ncbi_id] == ott_id
Example #10
def test_no_mrca():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    ingroup_mrca = None
    # setup the run
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    # print(ids.mrca_ott, ids.mrca_ncbi)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    filteredScrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    filteredScrape.threshold = 5
    assert filteredScrape.mrca_ncbi == 18794
    
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    assert len(filteredScrape.new_seqs_otu_id) in [23, 17]  # Blurghhh, local vs remote searches get different numbers of seqs!
Example #11
def test_species_translation():
    spn = "Mephitis mephitis"
    info = get_ott_taxon_info(spn)
    assert info, "get_ott_taxon_info found no match for {}".format(spn)
    ottid, ottname, ncbi_id = info
    a = ottid == 231602

    tree_of_life.mrca(ott_ids=[ottid], wrap_response=False)

    ott_ids = [770315, 158484]
    ott_mrca = get_mrca_ott(ott_ids)
    b = ott_mrca == 312031

    workdir = "tests/output/tmp"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    """Tests if your own input files will generate a data object of class AlignTreeTax
	"""

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    otu_json = OtuJsonDict(id_to_spn, ids)

    c = otu_json == expected_json
    assert a * b * c == 1
Example #12
def test_owndata():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "tests/data/localblast.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    """Tests if your own input files will generate a data object of class AlignTreeTax
	"""

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=trfn,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    assert isinstance(data_obj, AlignTreeTax)
Example #13
def test():

    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_own_local"
    configfi = "tests/data/test.config"
    # configfi = "tests/data/aws.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    wrappers.own_data_run(seqaln, mattype, trfn, schema_trf, workdir,
                          otu_jsonfi, configfi)
Example #14
def load_ids_obj(conf, workdir):
    """
    Generates the IdDict class object.

    :param conf: Config Object of physcraper class
    :param workdir: working directory
    :return:
    """
    if os.path.isfile("{}/id_pickle.p".format(workdir)):
        sys.stdout.write("Reloading id dicts from {}\n".format(workdir))
        ids = pickle.load(open("{}/id_pickle.p".format(workdir), "rb"))
    else:
        sys.stdout.write("setting up ID dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, "{}/id_pickle.p".format(workdir))
        ids.dump("{}/id_pickle.p".format(workdir))
    return ids
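
# A minimal usage sketch (hypothetical paths; assumes the test config used
# throughout these examples). The first call builds the IdDicts object and
# dumps id_pickle.p into workdir; later calls reload it from that pickle.
if __name__ == "__main__":
    conf = ConfigObj("tests/data/test.config", interactive=False)
    if not os.path.exists("tests/output/ids_cache"):
        os.makedirs("tests/output/ids_cache")
    ids = load_ids_obj(conf, "tests/output/ids_cache")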
Example #15
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
Example #16
def test_write_blast():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            blast_seq = list(filteredScrape.sp_seq_d[taxonID].keys())[0]  # list() so indexing works on Python 3
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_files(workdir, taxonID, seq)
            blast_db = list(filteredScrape.sp_seq_d[taxonID].keys())[1:]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_files(workdir,
                                                    blast_key,
                                                    seq,
                                                    db=True,
                                                    fn=str(taxonID))
            break

    blast_file_blast = "{}/blast/{}_tobeblasted".format(workdir, taxonID)
    # print(blast_file_blast)
    blast_file_db = "{}/blast/{}_db".format(workdir, taxonID)
    # print(blast_file_db, blast_file_blast)
    if os.path.exists(blast_file_blast):
        with open(blast_file_blast) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
    if os.path.exists(blast_file_db):
        with open(blast_file_db) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
Example #17
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    gi_data_otu_dict_added = []
    for v in filteredScrape.data.otu_dict.values():
        if '^ncbi:gi' in v:
            if (v['^physcraper:status'].split(' ')[0]
                    not in filteredScrape.seq_filter):
                gi_data_otu_dict_added.append(v['^ncbi:gi'])
    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^ncbi:gi' in v2:
                gi_sp_d.append(v2['^ncbi:gi'])
    user_data_otu_dict = []
    for v in filteredScrape.data.otu_dict.values():
        if '^user:TaxonName' in v:
            user_data_otu_dict.append(v['^user:TaxonName'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^user:TaxonName' in v2:
                user_sp_d.append(v2['^user:TaxonName'])
    assert sorted(gi_data_otu_dict_added) == sorted(gi_sp_d)
    assert sorted(user_data_otu_dict) == sorted(user_sp_d)
Example #18
def test_filter_length():

    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    threshold = 2
    selectby = "length"
    downtorank = "species"
    add_unpubl_seq = None
    blacklist = None

    id_to_spn_addseq_json = None
    ingroup_mrca = None
    shared_blast_folder = None

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()
    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)

    #    if threshold is not None:
    #        filteredScrape.filter_seqs()

    length_filtered = len(filteredScrape.new_seqs)
Example #19
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.threshold = threshold
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    filteredScrape.remove_identical_seqs()
    sp_d = filteredScrape.make_sp_dict(filteredScrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold
Example #20
def test_add_local():
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist


    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)

        # filteredScrape.make_otu_dict_entry_unpubl()
        filteredScrape.run_blast_wrapper()
        filteredScrape.read_blast_wrapper()
        filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key]['^ncbi:title'] == "unpublished":
                test = True
                break
    assert test == True
Example #21
def test():
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    # that's the main function
    wrappers.own_data_run(seqaln, mattype, trfn, schema_trf, workdir,
                          otu_jsonfi, configfi)
Example #22
def test_trim():
  # ------------------------
  seqaln = "tests/data/tiny_test_example/test_extralongseq.fas"
  mattype = "fasta"
  treefile = "tests/data/tiny_test_example/test.tre"
  schema_trf = "newick"
  workdir = "tests/output/test_trim"
  configfi = "tests/data/test.config"
  id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
  otu_jsonfi = "{}/otu_dict.json".format(workdir)

  if not os.path.exists("{}".format(workdir)):
      os.makedirs("{}".format(workdir))

  conf = ConfigObj(configfi, interactive=False)
  ids = IdDicts(conf, workdir=workdir)

  if os.path.exists(otu_jsonfi):
      print("load json")
      otu_json = json.load(open(otu_jsonfi))
  else:
      otu_json = OtuJsonDict(id_to_spn, ids)
      json.dump(otu_json, open(otu_jsonfi, "w"))


  data_obj = generate_ATT_from_files(seqaln=seqaln,
                                     mattype=mattype,
                                     workdir=workdir,
                                     config_obj=conf,
                                     treefile=treefile,
                                     schema_trf=schema_trf,
                                     otu_json=otu_jsonfi,
                                     ingroup_mrca=None)

  for tax, seq in data_obj.aln.items():
      len_start = len(seq)
  data_obj.trim()
  for tax, seq in data_obj.aln.items():
      len_end = len(seq)

  assert len_start != len_end
Example #23
def test():
    # define here your files
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/tiny_filter_own2"
    configfi = "tests/data/remote.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    # change to your filtering criteria
    threshold = 2
    selectby = "blast"
    downtorank = "species"
    ingroup_mrca = 723076

    # setup the run
    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    # select a wrapper function, depending on what you want to do, see short tutorial:
    wrappers.filter_data_run(seqaln,
                             mattype,
                             trfn,
                             schema_trf,
                             workdir,
                             threshold,
                             otu_jsonfi,
                             configfi,
                             selectby=selectby,
                             downtorank=downtorank,
                             ingroup_mrca=ingroup_mrca)
Example #24
def test():

    if not os.path.exists("{}".format(workdir)):
        os.makedirs("{}".format(workdir))

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

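    # With the default settings trim() leaves this alignment untouched (lengths match);
    # raising trim_perc to 0.5 below actually trims it.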
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)

    data_obj.trim()

    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start == len_end

    for tax, seq in data_obj.aln.items():
        len_start = len(seq)

    data_obj.config.trim_perc = 0.5
    data_obj.trim()

    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start > len_end
Example #25
def test_compare_json():
    expected_json = {
        'otuSdoronicum': {
            '^ncbi:taxon': u'462523',
            '^ot:ottTaxonName': u'Senecio doronicum',
            '^ncbi:TaxonName': 'Senecio doronicum',
            '^physcraper:TaxonName': 'Senecio doronicum',
            '^physcraper:status': 'original',
            '^ot:ottId': 318436,
            '^user:TaxonName': 'Senecio_doronicum',
            '^ot:originalLabel': 'S_doronicum',
            '^physcraper:last_blasted': None
        },
        'otuSlagascanus': {
            '^ncbi:taxon': u'1268580',
            '^ot:ottTaxonName': u'Senecio lagascanus',
            '^ncbi:TaxonName': 'Senecio lagascanus',
            '^physcraper:TaxonName': 'Senecio lagascanus',
            '^physcraper:status': 'original',
            '^ot:ottId': 640718,
            '^user:TaxonName': 'Senecio_lagascanus',
            '^ot:originalLabel': 'S_lagascanus',
            '^physcraper:last_blasted': None
        },
        'otu2029doronicum': {
            '^ncbi:taxon': u'462523',
            '^ot:ottTaxonName': u'Senecio doronicum',
            '^ncbi:TaxonName': 'Senecio doronicum',
            '^physcraper:TaxonName': 'Senecio doronicum',
            '^physcraper:status': 'original',
            '^ot:ottId': 318436,
            '^user:TaxonName': 'Senecio_doronicum',
            '^ot:originalLabel': '2029_doronicum',
            '^physcraper:last_blasted': None
        },
        'otuSlopezii': {
            '^ncbi:taxon': u'1268581',
            '^ot:ottTaxonName': u'Senecio lopezii',
            '^ncbi:TaxonName': 'Senecio lopezii',
            '^physcraper:TaxonName': 'Senecio lopezii',
            '^physcraper:status': 'original',
            '^ot:ottId': 688688,
            '^user:TaxonName': 'Senecio_lopezii',
            '^ot:originalLabel': 'S_lopezii',
            '^physcraper:last_blasted': None
        },
        'otuSscopolii': {
            '^ncbi:taxon': u'1268589',
            '^ot:ottTaxonName': u'Senecio scopolii',
            '^ncbi:TaxonName': 'Senecio scopolii',
            '^physcraper:TaxonName': 'Senecio scopolii',
            '^physcraper:status': 'original',
            '^ot:ottId': 688671,
            '^user:TaxonName': 'Senecio_scopolii',
            '^ot:originalLabel': 'S_scopolii',
            '^physcraper:last_blasted': None
        }
    }

    workdir = "tests/output/tmp"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    otu_json = OtuJsonDict(id_to_spn, ids)

    print(otu_json)
    assert otu_json == expected_json
Example #26
def test_internal_mpi():
    import pickle
    import sys
    import os
    import subprocess
    from physcraper import ConfigObj, PhyscraperScrape, IdDicts
    from mpi4py import MPI

    # set up until test
    workdir = "tests/output/test_mpi_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    scraper.read_blast_wrapper(blast_dir=blast_dir)
    scraper.remove_identical_seqs()

    scraper.data.write_papara_files()
    scraper.align_query_seqs()
    scraper.place_query_seqs()
    scraper.est_full_tree()

    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
    # scraper.generate_streamed_alignment()
    if not os.path.exists("{}/previous_run".format(scraper.workdir)):
        os.mkdir("{}/previous_run".format(scraper.workdir))
    os.system(
        "mv {}/papara_alignment.extended  {}/previous_run/papara_alignment.extended"
        .format(scraper.workdir, scraper.workdir))

    cwd = os.getcwd()
    # os.chdir(scraper.workdir)

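    # Total MPI rank count = nodes x tasks-per-node, read from the SLURM environment
    # (this test assumes it runs under SLURM).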
    ntasks = os.environ.get('SLURM_NTASKS_PER_NODE')
    nnodes = os.environ.get("SLURM_JOB_NUM_NODES")
    print(nnodes, ntasks)
    env_var = int(nnodes) * int(ntasks)
    #env_var = os.environ.get('SLURM_JOB_CPUS_PER_NODE', 7)
    print(env_var)

    assert os.path.exists("{}/previous_run/papara_alignment.extended".format(
        scraper.workdir))
    with cd(scraper.workdir):
        print("run with mpi")
        subprocess.call([
            "mpiexec", "-n", "{}".format(env_var), "raxmlHPC-MPI-AVX2", "-m",
            "GTRCAT", "-s", "{}/previous_run/papara_alignment.extended".format(
                scraper.workdir), "-p", "1", "-f", "a", "-x", "1", "-#",
            "autoMRE", "-n", "all{}".format(scraper.date)
        ])
Example #27
def test_loop_for_write_blast_files():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.add_setting_to_self(downtorank, threshold)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # This replicates the first part of the "how many seqs to keep" logic: if the
    # threshold is bigger than the number of seqs for a species, all of them are added.
    # print("run loop which we want to test")
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) > threshold:
            count_dict = filteredScrape.count_num_seq(key)
            if key in filteredScrape.sp_seq_d.keys():
                seq_present = count_dict["seq_present"]
                query_count = count_dict["query_count"]
                if seq_present >= 1 and seq_present < threshold and count_dict["new_taxon"] is False and query_count != 0:
                    if query_count + seq_present > threshold:
                        taxonfn = filteredScrape.loop_for_write_blast_files(key)
                                    
# MAKE TEST FOR loop_for_write_blast_files


    for key in filteredScrape.sp_d:
        count = 0
        count_int = 0
        count_gi_file = 0
        count_str_file = 0
        db = False
        blasted = False
        if len(filteredScrape.sp_d[key]) > threshold:
            for sp_keys in filteredScrape.sp_seq_d[key].keys():
                # str covers Python 3 (Python 2 `unicode` keys were meant to count
                # the same way); the original's separate `if` double-counted str keys
                if isinstance(sp_keys, str):
                    count += 1
                else:
                    count_int += 1
            folder = '{}/blast/'.format(filteredScrape.workdir)
            for the_file in os.listdir(folder):
                spn = the_file.split("_")[0]
                file_type = the_file.split("_")[1]
                if spn == key and file_type == "db":
                    db = True
                    f = open('{}/blast/{}'.format(filteredScrape.workdir, the_file))
                    for line in iter(f):
                        if line[0] == ">":
                            count_gi_file += 1
                if spn == key and file_type == "tobeblasted":
                    blasted = True
                    count_str_file += 1
            if blasted:
                if count + count_int != threshold:
                    assert count_str_file == count
            if db:
                if count + count_int != threshold:
                    assert count_gi_file == count_int
Example #28
trfn = "tests/data/tiny_comb_ets/tiny_comb_ets.tre"
schema_trf = "newick"
id_to_spn = r"tests/data/tiny_comb_ets/nicespl.csv"

workdir = "tiny_comb_ets"
configfi = "tests/data/localblast.config"
otu_jsonfi = "{}/otu_dict.json".format(workdir)
threshold = 2
selectby = "blast"
downtorank = None

if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi)
ids = IdDicts(conf, workdir=workdir)

if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
                         schema_trf,
                         workdir,
                         threshold,
                         otu_jsonfi,
                         configfi,
                         selectby=selectby,
                         downtorank=downtorank)
Example #29
def test_blacklist():

    workdir = "tests/output/test_blacklist"
    configfi = "tests/data/test.config"

    # make one run without blacklist
    debug("run without blacklist")
    blacklist = None
    noblack = os.path.join(workdir, "noblacklist")
    absworkdir = os.path.abspath(noblack)
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))

    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    noblackScrape = FilterBlast(data_obj, ids)
    noblackScrape._blasted = 1
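    # Copy the precooked BLAST output into current_blast_run/ so read_blast_wrapper
    # below can read it without hitting NCBI.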
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        # print(dest)
        full_file_name = os.path.join(src, file_name)
        if (os.path.isfile(full_file_name)):
            shutil.copy(full_file_name, dest)
    noblackScrape.read_blast_wrapper()
    noblackScrape.remove_identical_seqs()
    noblackScrape.generate_streamed_alignment()

    # one run with blacklist
    debug("run with blacklist")

    blacklist = ['JX895340.1']
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist
    filteredScrape._blasted = 1
    if not os.path.exists(os.path.join(absworkdir, "current_blast_run/")):
        os.makedirs(os.path.join(absworkdir, "current_blast_run/"))
    src = "tests/data/precooked/fixed/tte_blast_files"
    src_files = os.listdir(src)
    for file_name in src_files:
        dest = os.path.join(absworkdir, "current_blast_run/")
        full_file_name = os.path.join(src, file_name)
        if (os.path.isfile(full_file_name)):
            shutil.copy(full_file_name, dest)
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper()
    filteredScrape.remove_identical_seqs()
    filteredScrape.generate_streamed_alignment()

    print("RUN TESTS!")
    gi_l = []
    gi_l_2 = []
    for tax in filteredScrape.data.tre.taxon_namespace:
        gi_id = filteredScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l.append(gi_id)
    print(gi_l)
    for tax in noblackScrape.data.tre.taxon_namespace:
        # print(filteredScrape.data.otu_dict[tax.label])
        gi_id = noblackScrape.data.otu_dict[tax.label].get("^ncbi:accession")
        gi_l_2.append(gi_id)
    print(gi_l_2)
    for item in blacklist:
        assert item not in gi_l
        print("RUN TESTS2!")
        assert item in gi_l_2

        #     # print("seq was not added in blacklist run")
        #     print("inbetween step works")
# test if it removes blacklist gi from already added aln:
    print("run with later blacklist")

    # else:
    #     print("blacklist gi was added in previous run")
    # print("now we want to remove it.")
    len_before = (len(noblackScrape.data.tre.taxon_namespace))
    noblackScrape.blacklist = blacklist
    noblackScrape.generate_streamed_alignment()
    assert len_before - 1 == len(noblackScrape.data.tre.taxon_namespace)
Example #30
configfi = "tests/data/localblast.config"

otu_jsonfi = "{}/otu_dict.json".format(workdir)

# change to your filtering criteria
threshold = 2
selectby = "blast"
downtorank = "species"
shared_blast_folder = "/home/blubb/Documents/gitdata/physcraper/shared_runs/"
ingroup_mrca = 723076
# setup the run
if not os.path.exists("{}".format(workdir)):
    os.makedirs("{}".format(workdir))

conf = ConfigObj(configfi, interactive=False)
ids = IdDicts(conf, workdir=workdir, mrca=ingroup_mrca)

if os.path.exists(otu_jsonfi):
    print("load json")
    otu_json = json.load(open(otu_jsonfi))
else:
    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

# select a wrapper function, depending on what you want to do, see short tutorial:
wrappers.filter_data_run(seqaln,
                         mattype,
                         trfn,
                         schema_trf,
                         workdir,
                         threshold,