Example #1
def test_owndata():
    """Tests if your own input files will generate a data object of class AlignTreeTax."""
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "tests/data/localblast.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=trfn,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    assert isinstance(data_obj, AlignTreeTax)
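Note: most of the snippets below reference module-level names (workdir, configfi, absworkdir, id_to_spn, otu_jsonfi, treshold, downtorank, threshold) that their test modules define at import time. A minimal sketch of such a preamble, assuming these names are importable from the physcraper package (as Example #2 does for ConfigObj) and with values inferred from the snippets rather than taken from any one test:

import json
import os
import pickle
import random
import sys

# Assumed imports: the snippets use these names without showing where they come from.
from physcraper import (AlignTreeTax, ConfigObj, FilterBlast, IdDicts,
                        OtuJsonDict, PhyscraperScrape,
                        generate_ATT_from_files, wrappers)

# Assumed values, inferred from the examples below.
workdir = "tests/output/owndata"
absworkdir = os.path.abspath(workdir)
configfi = "tests/data/test.config"
id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
otu_jsonfi = "{}/otu_dict.json".format(workdir)
threshold = treshold = 2  # several snippets spell it 'treshold'
downtorank = None         # or "species", depending on the test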
Example #2
def test_config():
    from physcraper import ConfigObj
    configfi = "tests/data/test.config"
    conf = ConfigObj(configfi, interactive=False)
    print(conf.__dict__.keys())

    if conf.blast_loc != "remote":
        expected_keys = [
            'seq_len_perc', 'num_threads', 'phylesystem_loc',
            'ncbi_parser_names_fn', 'ncbi_parser_nodes_fn', 'maxlen',
            'id_pickle', 'hitlist_size', 'gb_id_filename', 'delay', 'unmapped',
            'trim_perc', 'url_base', 'ott_ncbi', 'blast_loc', 'email',
            'e_value_thresh', 'blastdb'
        ]
    else:
        expected_keys = [
            'seq_len_perc', 'num_threads', 'phylesystem_loc', 'maxlen',
            'hitlist_size', 'gb_id_filename', 'delay', 'unmapped', 'trim_perc',
            'url_base', 'ott_ncbi', 'blast_loc', 'id_pickle', 'email',
            'e_value_thresh'
        ]

    assert len(conf.email.split("@")) == 2
    #    assert conf.url_base == None
    assert set(conf.__dict__.keys()) == set(expected_keys)
Example #3
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    for key in filteredScrape.sp_d:
        if len(filteredScrape.sp_d[key]) <= treshold:
            filteredScrape.add_all(key)
    treshold_undermin = 0
    for key in filteredScrape.sp_d:
        for key2 in filteredScrape.sp_d[key]:
            if len(filteredScrape.sp_d[key]) <= treshold:
                if '^physcraper:status' in key2:
                    if key2['^physcraper:status'].split(
                            ' ')[0] not in filteredScrape.seq_filter:
                        if key2['^physcraper:last_blasted'] == '1800/01/01':
                            treshold_undermin += 1
    add_all_thresholdmin = filteredScrape.filtered_seq
    assert treshold_undermin == len(add_all_thresholdmin)
Example #4
def test_config():
    from physcraper import ConfigObj
    configfi = "tests/data/localblast.config"
    conf = ConfigObj(configfi, interactive=False)
    assert conf.email == '*****@*****.**'
    assert conf.url_base is None
    assert set(conf.__dict__.keys()) == set(expected_keys)
Example #5
def test():

    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_own_local"
    configfi = "tests/data/test.config"
    # configfi = "tests/data/aws.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    wrappers.own_data_run(seqaln, mattype, trfn, schema_trf, workdir,
                          otu_jsonfi, configfi)
Example #6
def test_no_mrca():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/test_mrcalist_local"
    configfi = "tests/data/test.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    ingroup_mrca = None
    # setup the run
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    # print(ids.mrca_ott, ids.mrca_ncbi)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    filteredScrape = PhyscraperScrape(data_obj, ids, ingroup_mrca)
    filteredScrape.threshold = 5
    assert filteredScrape.mrca_ncbi == 18794
    
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape._blasted = 1
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    assert len(filteredScrape.new_seqs_otu_id) in [23, 17]  # local vs. remote searches return a different number of seqs
Example #7
def test_remove_taxa_aln_tre():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)

    len_aln_before = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_before = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_before = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_before = len(filteredScrape.data.tre.taxon_namespace)

    for tax in filteredScrape.data.aln.taxon_namespace:
        filteredScrape.data.remove_taxa_aln_tre(tax.label)
        break

    len_aln_after = len(filteredScrape.data.aln.as_string('phylip'))
    len_tre_after = len(filteredScrape.data.tre.as_string(schema="newick"))
    namespace_after = len(filteredScrape.data.aln.taxon_namespace)
    namespace_tre_after = len(filteredScrape.data.tre.taxon_namespace)

    assert len_aln_before != len_aln_after
    assert len_tre_before != len_tre_after
    assert namespace_before != namespace_after
    assert namespace_tre_before != namespace_tre_after
Example #8
def test_species_translation():
    spn = "Mephitis mephitis"
    info = get_ott_taxon_info(spn)
    assert info
    ottid, ottname, ncbi_id = info
    a = ottid == 231602

    tree_of_life.mrca(ott_ids=[ottid], wrap_response=False)

    ott_ids = [770315, 158484]
    ott_mrca = get_mrca_ott(ott_ids)
    b = ott_mrca == 312031

    workdir = "tests/output/tmp"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)
    """Tests if your own input files will generate a data object of class AlignTreeTax
	"""

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    otu_json = OtuJsonDict(id_to_spn, ids)

    c = otu_json == expected_json
    assert a and b and c
Example #9
def test_id_dicts():
    conf = ConfigObj(configfi, interactive=True)
    ids = IdDicts(conf, workdir=workdir)
    selection = random.sample(list(ids.ott_to_ncbi.keys()), 10)
    for ott_id in selection:
        ncbi_id = ids.ott_to_ncbi[ott_id]
        assert ids.ncbi_to_ott[ncbi_id] == ott_id
Example #10
def test_run_raxml():

    workdir = "tests/output/test_run_raxml"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)

    #load data
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    scraper = PhyscraperScrape(data_obj, ids)
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    scraper._blasted = 1

    # run needed functions
    # scraper.run_blast_wrapper()
    scraper.read_blast_wrapper(blast_dir=blast_dir)

    # scraper.align_query_seqs()

    # scraper.place_query_seqs()

    scraper.est_full_tree()
    # scraper.generate_streamed_alignment()
    assert os.path.exists("{}/RAxML_bestTree.{}".format(
        scraper.workdir, scraper.date))
Example #11
def test_sp_seq_d():

    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]

    gi_sp_d = []
    sp_d = filteredScrape.make_sp_dict()
    for key in sp_d:
        v = sp_d[key]
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2:
                not_added = ['deleted', 'subsequence,', 'not']
                if v2['^physcraper:status'].split(' ')[0] not in not_added:
                    if '^ncbi:gi' in v2:
                        gi_sp_d.append(v2['^ncbi:accession'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            v2 = filteredScrape.data.otu_dict[v2]
            if '^physcraper:status' in v2 or u'^physcraper:status' in v2:
                if v2['^physcraper:status'].split(
                        ' ')[0] not in filteredScrape.seq_filter:
                    if v2['^physcraper:last_blasted'] != '1800/01/01':
                        if '^user:TaxonName' in v2:
                            user_sp_d.append(v2['^user:TaxonName'])
                        elif '^ot:ottTaxonName' in v2:
                            user_sp_d.append(v2['^ot:ottTaxonName'])
    filteredScrape.make_sp_seq_dict()
    gi_sp_seq_d = []
    ott_sp_seq_d = []
    for v in filteredScrape.sp_seq_d.values():
        for k in v.keys():
            # print(k)
            if len(k.split('.')) >= 2:
                # if type(k) == int:
                gi_sp_seq_d.append(k)
            else:
                # if type(k) == str or type(k) == unicode:
                ott_sp_seq_d.append(k)
    # print(len(ott_sp_seq_d), len(user_sp_d), len(gi_sp_seq_d), len(gi_sp_d))
    assert len(ott_sp_seq_d) == len(user_sp_d)
    assert len(gi_sp_seq_d) == len(gi_sp_d)
Example #12
def test_add_local():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    if not os.path.exists(workdir):
        os.makedirs(workdir)

    if os.path.exists(otu_jsonfi_local):
        otu_json_local = json.load(open(otu_jsonfi_local))
    else:
        otu_json_local = OtuJsonDict(id_to_spn_addseq, ids)
        json.dump(otu_json_local, open(otu_jsonfi_local, "w"))

    sys.stdout.write("\ntest addLocal\n")

    # Prune sequences below a certain length threshold
    data_obj.prune_short()
    data_obj.write_files()
    data_obj.write_labelled(label='^ot:ottTaxonName', add_gb_id=True)
    data_obj.write_otus("otu_info", schema='table')
    data_obj.dump()

    sys.stdout.write("setting up id dictionaries\n")
    sys.stdout.flush()

    ids = IdDicts(conf, workdir=workdir)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.blacklist = blacklist

    if add_local_seq is not None:
        filteredScrape.unpublished = True
    if filteredScrape.unpublished is True:  # use unpublished data
        # filteredScrape.unpublished = True
        filteredScrape.data.unpubl_otu_json = otu_json_local
        filteredScrape.write_unpubl_blastdb(add_local_seq)

        # filteredScrape.make_otu_dict_entry_unpubl()
        filteredScrape.run_blast_wrapper()
        filteredScrape.read_blast_wrapper()
        filteredScrape.remove_identical_seqs()

    test = False
    for key in filteredScrape.data.otu_dict.keys():
        if '^ncbi:title' in filteredScrape.data.otu_dict[key].keys():
            if filteredScrape.data.otu_dict[key][
                    '^ncbi:title'] == "unpublished":
                test = True
                break
    assert test
Example #13
def sync_ncbi(configfi):
    conf = ConfigObj(configfi)
    subprocess.call(
        [
            "rsync",
            "-av",
            "ftp.ncbi.nih.gov::pub/taxonomy/gi_taxid_nucl.dmp.gz",
            "{}/gi_taxid_nucl.dmp.gz".format(conf.ncbi_dmp),
        ]
    )
    subprocess.call(["gunzip", "{}/gi_taxid_nucl.dmp.gz".format(conf.ncbi_dmp)])
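A usage sketch for this helper with the test configuration used elsewhere in these examples; note that it performs a real rsync download from NCBI, so the call below is illustrative rather than part of any test:

sync_ncbi("tests/data/test.config")  # fetches and unpacks gi_taxid_nucl.dmp.gz under conf.ncbi_dmp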
Example #14
def test_remove_id_seq():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1

    #############################

    id_seq = ["TCGAAACCTGCATAGCAGAACGACCT-GTGAACATGTAAAAACAATTGGG-TGTTCTAAGTATCGGGCTCTTGTTCGATTTCTA-GGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGT-CTAAGGACGTCACGTCGACG-CAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGC--TT-GTTCCATGCATT--GCCGTT--CGCGGTGATTGCATTGAAACTTGCTTCTTTATAA-TTCATAAACGACTCTCGG-CAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCC-GAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCC-CCATCAC---ACCTCTT-GACGGGGATGTTTGAATGGGGA-CGGAGATTGGTCTCCCGTTCCT---AAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCT--------------TATCGAGTTGTGTG--TTCCAAGAAGTAA-GGAATATCTCTTTAACGACCC-TAAAGTGTTGTCTCATG-ACGATGCTTCGACTGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC",
                "TCGAAACCTGCATAGCAGAACGACCTGTGAACATGTAAAAACAATTGGGTGTTCTAAGTATCGGGCTCTTGTTCGATTTCTAGGATGCCATGTTGACGTGCGTCTTTGGCAAGCCCCTTGGGTGTCTAAGGACGTCACGTCGACGCAACAACAAACCCCCGGCACGGCATGTGCCAAGGAAATATAAACTTAAGAAGGGCTTGTTCCATGCATTGCCGTTCGCGGTGATTGCATTGAAACTTGCTTCTTTATAATTCATAAACGACTCTCGGCAACGGATATCTCGGCTCACGCATCGATGAAGAACGTAGCAAAATGCGATACTTGGTGTGAATTGCAGAATCCCGTGAACCATCGAGTTTTTGAACGCAAGTTGCGCCCGAAGCCTTTTGGTTGAGGGCACGTCTGCCTGGGCGTCACATATCGCGTCGCCCCCATCACACCTCTTGACGGGGATGTTTGAATGGGGACGGAGATTGGTCTCCCGTTCCTAAGGTGCGGTTGCCTGAATTTTGAGTCCTCTTCGACGGACGCACGATTAGTGGTGGTTGACAAGACCTTCTTATCGAGTTGTGTGTTCCAAGAAGTAAGGAATATCTCTTTAACGACCCTAAAGTGTTGTCTCATGACGATGCTTCGACTGCGCGCGCGC"
                ]

    # print("start test")
    tmp_dict = dict((taxon.label, filteredScrape.data.aln[taxon].symbols_as_string()) for taxon in filteredScrape.data.aln)
    old_seqs = tmp_dict.keys()
    avg_seqlen = sum(filteredScrape.data.orig_seqlen)/len(filteredScrape.data.orig_seqlen)
    assert filteredScrape.config.seq_len_perc <= 1
    seq_len_cutoff = avg_seqlen*filteredScrape.config.seq_len_perc
    count = 1

    for item in id_seq:
        if len(item.replace("-", "").replace("N", "")) > seq_len_cutoff:
            ott = "OTT_{}".format(count)
            count += 1
            otu_id = ott
            filteredScrape.data.otu_dict[otu_id] = {}
            filteredScrape.data.otu_dict[otu_id]['^ncbi:gi'] = 1061375300
            filteredScrape.data.otu_dict[otu_id]['^ncbi:accession'] = "KX494441"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:title'] = "some random title"
            filteredScrape.data.otu_dict[otu_id]['^ncbi:taxon'] = 101010101  # placeholder taxon id
            filteredScrape.data.otu_dict[otu_id]['^ot:ottId'] = ott
            filteredScrape.data.otu_dict[otu_id]['^physcraper:status'] = "query"
            filteredScrape.data.otu_dict[otu_id]['^ot:ottTaxonName'] = "Senecio vulgaris"
            filteredScrape.data.otu_dict[otu_id]['^physcraper:last_blasted'] = "1800/01/01"
            filteredScrape.seq_dict_build(item, otu_id, tmp_dict)
    for tax in old_seqs:
        try:
            del tmp_dict[tax]
        except KeyError:
            pass
    filteredScrape.new_seqs_otu_id = tmp_dict
    expected_add = 1
    assert expected_add == len(filteredScrape.new_seqs_otu_id)
    sys.stdout.write("todo: add check that newly added seqs are checked. they are, but there is no test\n")
Example #15
def test_load_otol_data():
    study_id = "pg_873"
    tree_id = "tree1679"
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree_unmappedtaxa"
    absworkdir = os.path.abspath(workdir)

    configfi = "tests/data/test.config"
    ingroup_mrca = None
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    data_obj = wrappers.load_otol_data(conf, ingroup_mrca, mattype, seqaln,
                                       study_id, tree_id, workdir)
    assert data_obj
Example #16
def test():
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    # that's the main function
    wrappers.own_data_run(seqaln, mattype, trfn, schema_trf, workdir,
                          otu_jsonfi, configfi)
Example #17
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
Example #18
def test_write_blast():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            blast_seq = list(filteredScrape.sp_seq_d[taxonID].keys())[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_files(workdir, taxonID, seq)
            blast_db = list(filteredScrape.sp_seq_d[taxonID].keys())[1:]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_files(workdir,
                                                    blast_key,
                                                    seq,
                                                    db=True,
                                                    fn=str(taxonID))
            break

    blast_file_blast = "{}/blast/{}_tobeblasted".format(workdir, taxonID)
    # print(blast_file_blast)
    blast_file_db = "{}/blast/{}_db".format(workdir, taxonID)
    # print(blast_file_db, blast_file_blast)
    if os.path.exists(blast_file_blast):
        with open(blast_file_blast) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
    if os.path.exists(blast_file_db):
        with open(blast_file_db) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
Example #19
def test_trim():
    seqaln = "tests/data/tiny_test_example/test_extralongseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    workdir = "tests/output/test_trim"
    configfi = "tests/data/test.config"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    # record an aligned sequence length before and after trimming
    # (the loops keep the last taxon's length)
    for tax, seq in data_obj.aln.items():
        len_start = len(seq)
    data_obj.trim()
    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start != len_end
Example #20
def test():
    # define here your files
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/tiny_filter_own2"
    configfi = "tests/data/remote.config"
    otu_jsonfi = "{}/otu_dict.json".format(workdir)

    # change to your filtering criteria
    threshold = 2
    selectby = "blast"
    downtorank = "species"
    ingroup_mrca = 723076

    # setup the run
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi)
    ids = IdDicts(conf, workdir=workdir)

    if os.path.exists(otu_jsonfi):
        print("load json")
        otu_json = json.load(open(otu_jsonfi))
    else:
        otu_json = OtuJsonDict(id_to_spn, ids)
        json.dump(otu_json, open(otu_jsonfi, "w"))

    # select a wrapper function, depending on what you want to do, see short tutorial:
    wrappers.filter_data_run(seqaln,
                             mattype,
                             trfn,
                             schema_trf,
                             workdir,
                             threshold,
                             otu_jsonfi,
                             configfi,
                             selectby=selectby,
                             downtorank=downtorank,
                             ingroup_mrca=ingroup_mrca)
Example #21
def filter_data_run(seqaln,
                    mattype,
                    trfn,
                    schema_trf,
                    workdir,
                    threshold,
                    id_to_spn,
                    configfi,
                    selectby="blast",
                    downtorank=None,
                    blacklist=None,
                    add_unpubl_seq=None,
                    id_to_spn_addseq_json=None,
                    ingroup_mrca=None,
                    shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs 
    new analysis for as long as new seqs are found. 
    This uses the FilterBlast subclass to be able to filter the blast output.
    """
    license_print()
    debug("Debugging mode is on")
    print(workdir)
    print(os.path.exists(workdir))
    if not os.path.exists(workdir):
        print("make wd")
        os.makedirs(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)

    make_otujsondict(id_to_spn, workdir, ids)
    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)

    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir,
                             ingroup_mrca)
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj,
                                   downtorank, id_to_spn_addseq_json, ids,
                                   selectby, shared_blast_folder, threshold,
                                   ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
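Example #20 above shows this wrapper invoked end-to-end on the tiny test data; the remaining keyword arguments (blacklist, add_unpubl_seq, id_to_spn_addseq_json, shared_blast_folder) all default to None and can be left out for a standard filtered run.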
Example #22
def test_filter_length():

    workdir = "tests/output/test_selectbylength"
    absworkdir = os.path.abspath(workdir)
    conf = ConfigObj("tests/data/test.config", interactive=False)
    threshold = 2
    selectby = "length"
    downtorank = "species"
    add_unpubl_seq = None
    blacklist = None

    id_to_spn_addseq_json = None
    ingroup_mrca = None
    shared_blast_folder = None

    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape.blacklist = blacklist

    sys.stdout.write("BLASTing input sequences\n")
    if shared_blast_folder:
        filteredScrape.blast_subdir = shared_blast_folder
    else:
        shared_blast_folder = None
    # filteredScrape.run_blast_wrapper()
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.remove_identical_seqs()
    filteredScrape.dump()
    sys.stdout.write("Filter the sequences\n")
    length_unfiltered = len(filteredScrape.new_seqs_otu_id)

    #    if threshold is not None:
    #        filteredScrape.filter_seqs()

    length_filtered = len(filteredScrape.new_seqs)
Example #23
def add_unpubl_to_backbone(seqaln,
                           mattype,
                           trfn,
                           schema_trf,
                           workdir,
                           sp_info_jsonfi,
                           configfi,
                           add_unpubl_seq,
                           id_to_spn_addseq_json,
                           selectby=None,
                           downtorank=None,
                           threshold=None,
                           blacklist=None,
                           ingroup_mrca=None,
                           shared_blast_folder=None):
    """
    This uses the FilterBlast subclass to be able to filter the blast output.
    It adds unpublished data to an input tree (evalue should be higher than usual).
    Backbone will not be updated
    """
    license_print()

    # read the config file into a configuration object
    conf = ConfigObj(configfi)

    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir,
                             ingroup_mrca)
    ids = load_ids_obj(conf, workdir)
    filteredScrape = PS_filter_run(add_unpubl_seq,
                                   blacklist,
                                   data_obj,
                                   downtorank,
                                   id_to_spn_addseq_json,
                                   ids,
                                   selectby,
                                   shared_blast_folder,
                                   threshold,
                                   ingroup_mrca,
                                   backbone=True)
    save_copy_code(workdir)
    return filteredScrape
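None of the tests above call this wrapper directly, so here is a minimal usage sketch built from the tiny test inputs; the two local-data paths are hypothetical placeholders, not files shipped with the test data:

# A sketch only: add_unpubl_seq and id_to_spn_addseq_json are assumed paths.
seqaln = "tests/data/tiny_test_example/test.fas"
mattype = "fasta"
trfn = "tests/data/tiny_test_example/test.tre"
schema_trf = "newick"
workdir = "tests/output/backbone_unpubl"
sp_info_jsonfi = "{}/otu_dict.json".format(workdir)
configfi = "tests/data/test.config"
add_unpubl_seq = "tests/data/local_seqs"            # hypothetical folder of unpublished FASTA sequences
id_to_spn_addseq_json = "tests/data/local_spn.csv"  # hypothetical tip-label-to-species mapping

filteredScrape = add_unpubl_to_backbone(seqaln, mattype, trfn, schema_trf,
                                        workdir, sp_info_jsonfi, configfi,
                                        add_unpubl_seq, id_to_spn_addseq_json)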
Example #24
def filter_OTOL(
        study_id,
        tree_id,
        seqaln,
        mattype,
        workdir,
        configfi,
        threshold,
        selectby="blast",
        downtorank=None,
        blacklist=None,
        add_unpubl_seq=None,  # path to local seq
        id_to_spn_addseq_json=None,
        ingroup_mrca=None,
        shared_blast_folder=None):
    """looks for pickeled file to continue run, or builds and runs
    new analysis for as long as new seqs are found.

    This uses the FilterBlast subclass to be able to filter the blast output using data from OToL."""
    license_print()

    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    # read the config file into a configuration object
    conf = ConfigObj(configfi)
    # Generate a linked Alignment-Tree-Taxa object
    data_obj = load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id,
                              tree_id, workdir)
    ids = load_ids_obj(conf, workdir)

    # make json file for unpublished database
    if add_unpubl_seq is not None:
        make_otujsondict(id_to_spn_addseq_json, workdir, ids, local=True)

    # Now combine the data, the ids, and the configuration into a single physcraper scrape object
    filteredScrape = PS_filter_run(add_unpubl_seq, blacklist, data_obj,
                                   downtorank, id_to_spn_addseq_json, ids,
                                   selectby, shared_blast_folder, threshold,
                                   ingroup_mrca)
    save_copy_code(workdir)
    return filteredScrape
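A usage sketch for this wrapper, reusing the OpenTree study, tree, and alignment from Example #15; the output directory, threshold, and filtering options are illustrative choices, not values from a shipped test:

# A sketch only: workdir and the filtering options are illustrative.
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype = "fasta"
workdir = "tests/output/filter_otol"
configfi = "tests/data/test.config"

filteredScrape = filter_OTOL(study_id, tree_id, seqaln, mattype, workdir,
                             configfi, threshold=2, selectby="blast",
                             downtorank="species")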
Example #25
def test_sp_d():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = FilterBlast(data_obj, ids)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    # filteredScrape.acc_list_mrca = pickle.load(open("tests/data/precooked/acc_list_mrca.p", 'rb'))
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    gi_data_otu_dict_added = []
    for v in filteredScrape.data.otu_dict.values():
        if '^ncbi:gi' in v:
            if (v['^physcraper:status'].split(' ')[0]
                    not in filteredScrape.seq_filter):
                gi_data_otu_dict_added.append(v['^ncbi:gi'])
    gi_sp_d = []
    for key in filteredScrape.sp_d:
        v = filteredScrape.sp_d[key]
        for v2 in v:
            if '^ncbi:gi' in v2:
                gi_sp_d.append(v2['^ncbi:gi'])
    user_data_otu_dict = []
    for v in filteredScrape.data.otu_dict.values():
        if '^user:TaxonName' in v:
            user_data_otu_dict.append(v['^user:TaxonName'])
    user_sp_d = []
    for v in filteredScrape.sp_d.values():
        for v2 in v:
            if '^user:TaxonName' in v2:
                user_sp_d.append(v2['^user:TaxonName'])
    assert sorted(gi_data_otu_dict_added) == sorted(gi_sp_d)
    assert sorted(user_data_otu_dict) == sorted(user_sp_d)
Example #26
def test_add_all():
    conf = ConfigObj(configfi, interactive=False)
    data_obj = pickle.load(open("tests/data/precooked/tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir
    ids = IdDicts(conf, workdir=data_obj.workdir)
    ids.acc_ncbi_dict = pickle.load(
        open("tests/data/precooked/tiny_acc_map.p", "rb"))

    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    filteredScrape.threshold = threshold
    filteredScrape.read_blast_wrapper(
        blast_dir="tests/data/precooked/fixed/tte_blast_files")
    filteredScrape.seq_filter = [
        'deleted', 'subsequence,', 'not', "removed", "deleted,"
    ]
    filteredScrape.remove_identical_seqs()
    sp_d = filteredScrape.make_sp_dict(filteredScrape.new_seqs_otu_id)
    assert len(sp_d) == 5
    for taxon in sp_d:
        assert len(sp_d[taxon]) <= threshold
Example #27
def test_load_own_data():
    seqaln = "tests/data/tiny_test_example/test.fas"
    mattype = "fasta"
    trfn = "tests/data/tiny_test_example/test.tre"
    schema_trf = "newick"
    id_to_spn = r"tests/data/tiny_test_example/test_nicespl.csv"
    workdir = "tests/output/impls_mrcalist_local"
    configfi = "tests/data/test.config"
    ingroup_mrca = None

    if not os.path.exists(workdir):
        os.mkdir(workdir)

    conf = ConfigObj(configfi)

    ids = wrappers.load_ids_obj(conf, workdir)
    wrappers.make_otujsondict(id_to_spn, workdir, ids)

    data_obj = wrappers.load_own_data(conf, seqaln, mattype, trfn, schema_trf,
                                      workdir, ingroup_mrca)
    assert data_obj
Example #28
def test():

    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = ConfigObj(configfi, interactive=False)
    ids = IdDicts(conf, workdir=workdir)

    otu_json = OtuJsonDict(id_to_spn, ids)
    json.dump(otu_json, open(otu_jsonfi, "w"))

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    for tax, seq in data_obj.aln.items():
        len_start = len(seq)

    data_obj.trim()

    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start == len_end

    for tax, seq in data_obj.aln.items():
        len_start = len(seq)

    data_obj.config.trim_perc = 0.5
    data_obj.trim()

    for tax, seq in data_obj.aln.items():
        len_end = len(seq)

    assert len_start > len_end
Example #29
def own_data_run(seqaln,
                 mattype,
                 trfn,
                 schema_trf,
                 workdir,
                 id_to_spn,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """This is the wrapper function to start a PhyScraper standard run with your own data.
    You need:
         seqaln = path to sequence alignment file
         mattype = the format name of you alignment
         trfn = path to file with the phylogeny to update
         schema_trf = format type of your phylogeny
         workdir = define where your analysis files shall be stored
         sp_info_jsonfi = a json file which has the otu_dict stored, which is generated by the OtuJsonDict function
                            (usually, just leave it like it is in the example scripts.).
         configfi = path to your config file
         ingroup_mrca = not necessary, if you want to limit your run to a certain clade, give the OpenTree ID here,
                        can be obtained bu running: python scripts/get_ott.py ingroup_name
         shared_blast_folder = not necessary, if you want to share blast searches across runs (see documentation),
                                give the path to the folder with the shared runs.
    """
    license_print()
    debug("Debugging mode is on")
    if not os.path.exists(workdir):
        os.mkdir(workdir)
    conf = ConfigObj(configfi)
    ids = load_ids_obj(conf, workdir)

    make_otujsondict(id_to_spn, workdir, ids)
    data_obj = load_own_data(conf, seqaln, mattype, trfn, schema_trf, workdir,
                             ingroup_mrca)
    # Mapping identifiers between original data and NCBI requires an identifier dict object
    # scraper = PhyscraperScrape(data_obj, ids)
    scraper = PS_standard_run(data_obj, ids, shared_blast_folder)
    save_copy_code(workdir)
    return 1
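Examples #5 and #16 above show this wrapper called end-to-end on the tiny test data, so they serve as working invocations of own_data_run.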
Example #30
def test_generate_ATT_from_file():

    seqaln = "tests/data/input.fas"
    mattype = "fasta"
    workdir = "tests/fromfile"
    treefile = "tests/data/input.tre"
    otu_jsonfi = "tests/data/otu_dict.json"
    schema_trf = "newick"
    configfi = "tests/data/test.config"

    sys.stdout.write("\nTesting 'generate_ATT_from_files (fromfile.py)'\n")

    conf = ConfigObj(configfi, interactive=False)

    data_obj = generate_ATT_from_files(seqaln=seqaln,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=treefile,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi)

    assert data_obj