Ejemplo n.º 1
0
def test_0():
    """Load a precooked OToL scraper pickle if present; otherwise rebuild
    everything from phylesystem and cache the pickles for later runs.

    Either branch finishes with ``num_keep`` holding the taxon count of the
    scraper's alignment (consumed by the companion unmapped='remove' test).
    """
    if os.path.isfile("tests/data/precooked/otol_scraper.p"):
        conf = physcraper.ConfigObj(configfi, interactive=False)
        conf.unmapped = 'keep'  # keep unmapped taxa rather than dropping them
        with open("tests/data/precooked/otol_tiny_dataobj.p", 'rb') as fh:
            data_obj = pickle.load(fh)
        data_obj.workdir = absworkdir
        ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
        with open("tests/data/precooked/otol_tiny_gi_map.p", "rb") as fh:
            ids.acc_ncbi_dict = pickle.load(fh)
        with open("tests/data/precooked/otol_scraper.p", "rb") as fh:
            scraper = pickle.load(fh)
        num_keep = len(scraper.data.aln.taxon_namespace)
    else:
        sys.stdout.write("\n\n No files present\n\n")
        conf = physcraper.ConfigObj(configfi)
        conf.unmapped = 'keep'
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        data_obj = physcraper.generate_ATT_from_phylesystem(
            aln=aln,
            workdir=workdir,
            study_id=study_id,
            tree_id=tree_id,
            phylesystem_loc=conf.phylesystem_loc)
        # Cache the freshly built objects so later runs take the fast path.
        with open("tests/data/precooked/otol_tiny_dataobj.p", "wb") as fh:
            pickle.dump(data_obj, fh)
        ids = physcraper.IdDicts(conf, workdir=workdir)
        with open("tests/data/precooked/otol_tiny_gi_map.p", "wb") as fh:
            pickle.dump(ids.acc_ncbi_dict, fh)
        data_obj.write_files()
        scraper = physcraper.PhyscraperScrape(data_obj, ids)
        with open("tests/data/precooked/otol_conf.p", "wb") as fh:
            pickle.dump(scraper.config, fh)
        with open("tests/data/precooked/otol_scraper.p", "wb") as fh:
            pickle.dump(scraper, fh)
        num_keep = len(scraper.data.aln.taxon_namespace)
Ejemplo n.º 2
0
def test_prune_short():
    """prune_short(0.9) should drop taxa with too-short sequences and
    therefore shrink the tree's taxon namespace."""
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = physcraper.ConfigObj(configfi, interactive=False)
    ids = physcraper.IdDicts(conf, workdir=workdir)

    # Reuse a cached OTU json when present; otherwise build and cache it.
    if os.path.exists(otu_jsonfi):
        with open(otu_jsonfi) as fh:
            otu_json = json.load(fh)
    else:
        otu_json = physcraper.OtuJsonDict(id_to_spn, ids)
        with open(otu_jsonfi, "w") as fh:
            json.dump(otu_json, fh)

    data_obj = physcraper.generate_ATT_from_files(seqaln=seqaln,
                                                  mattype=mattype,
                                                  workdir=workdir,
                                                  treefile=treefile,
                                                  schema_trf=schema_trf,
                                                  otu_json=otu_jsonfi,
                                                  ingroup_mrca=None)

    len_before = len(data_obj.tre.taxon_namespace)
    data_obj.prune_short(0.9)
    len_after = len(data_obj.tre.taxon_namespace)
    assert len_before > len_after
Ejemplo n.º 3
0
def test_run_filter_blast():
    """run_filter_blast should produce an XML output file for the
    precooked 'otuSlagascanus' query/database pair."""
    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)

    filteredScrape = physcraper.FilterBlast(data_obj, ids)

    blast_db = "otuSlagascanus"
    blast_seq = "otuSlagascanus"

    if not os.path.exists("{}/blast".format(filteredScrape.data.workdir)):
        os.makedirs("{}/blast/".format(filteredScrape.data.workdir))

    # Copy the precooked blast fixtures into the workdir.  Stdlib copies
    # replace the original unchecked, non-portable `os.system('cp -r ...')`.
    import glob
    import shutil
    src_glob = '{}/tests/data/precooked/fixed/select-blast/*'.format(os.getcwd())
    dest = "{}/blast/".format(filteredScrape.data.workdir)
    for src in glob.glob(src_glob):
        if os.path.isdir(src):
            # NOTE(review): copytree requires the target not to exist;
            # assumes a fresh workdir per test run -- confirm.
            shutil.copytree(src, os.path.join(dest, os.path.basename(src)))
        else:
            shutil.copy(src, dest)

    local_blast.run_filter_blast(filteredScrape.data.workdir, blast_seq,
                                 blast_db)
    blast_out = "{}/blast/output_otuSlagascanus_tobeblasted.xml".format(
        workdir)

    # The original merely opened the file (leaking the handle and checking
    # nothing); keep the "can open" behavior but close the handle.
    if os.path.exists(blast_out):
        with open(blast_out):
            pass
Ejemplo n.º 4
0
def test_prune_short():
    """With config seq_len_perc = 0.9, prune_short() should remove taxa
    whose sequences are too short, shrinking the tree's taxon namespace."""
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    conf = physcraper.ConfigObj(configfi, interactive=False)
    # remote blast saves time over loading names/nodes dumps; they aren't
    # used by this test
    conf.blast_loc = 'remote'

    ids = physcraper.IdDicts(conf, workdir=workdir)

    # Reuse a cached OTU json when present; otherwise build and cache it.
    if os.path.exists(otu_jsonfi):
        with open(otu_jsonfi) as fh:
            otu_json = json.load(fh)
    else:
        otu_json = physcraper.OtuJsonDict(id_to_spn, ids)
        with open(otu_jsonfi, "w") as fh:
            json.dump(otu_json, fh)

    data_obj = physcraper.generate_ATT_from_files(seqaln=seqaln,
                                                  mattype=mattype,
                                                  workdir=workdir,
                                                  config_obj=conf,
                                                  treefile=treefile,
                                                  schema_trf=schema_trf,
                                                  otu_json=otu_jsonfi,
                                                  ingroup_mrca=None)

    data_obj.config.seq_len_perc = 0.9
    len_before = len(data_obj.tre.taxon_namespace)
    data_obj.prune_short()
    len_after = len(data_obj.tre.taxon_namespace)
    assert len_before > len_after
Ejemplo n.º 5
0
def test_read_local_blast():
    """Write/run/read of a local filter blast round-trips: the XML output
    file should exist and start with a non-empty line."""
    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # Prepare query/db files from the first taxon with enough sequences.
    # NOTE: 'treshold' (sic) is a module-level global from the test setup.
    for taxonID in filteredScrape.sp_d:
        if len(filteredScrape.sp_seq_d[taxonID]) > treshold:
            # list() so key indexing also works on Python 3, where dict
            # views are not subscriptable
            seq_ids = list(filteredScrape.sp_seq_d[taxonID].keys())
            blast_seq = seq_ids[0]
            seq = filteredScrape.sp_seq_d[taxonID][blast_seq]
            local_blast.write_filterblast_query(filteredScrape.workdir,
                                                taxonID,
                                                seq,
                                                fn=str(taxonID))
            # database entries need accession-style ids ("XX12345.1")
            blast_db = [item for item in seq_ids[1:]
                        if len(item.split(".")) >= 2]
            for blast_key in blast_db:
                seq = filteredScrape.sp_seq_d[taxonID][blast_key]
                local_blast.write_filterblast_db(filteredScrape.workdir,
                                                 blast_key,
                                                 seq,
                                                 fn=str(taxonID))
            break

    # taxonID deliberately leaks from the loop: it names the files above.
    blast_db = taxonID
    blast_seq = taxonID
    key = taxonID

    local_blast.run_filter_blast(filteredScrape.workdir, blast_seq, blast_db)
    local_blast.read_filter_blast(filteredScrape.workdir,
                                  filteredScrape.sp_seq_d[key], blast_db)

    blast_out = "{}/blast/output_{}_tobeblasted.xml".format(workdir, key)

    if os.path.exists(blast_out):
        with open(blast_out) as f:
            first_line = f.readline()
            assert len(first_line.strip()) != 0
Ejemplo n.º 6
0
def test_select_seq_by_local_blast():
    """how_many_sp_to_keep should retain exactly the number of sequences
    predicted by the per-taxon counting logic below."""
    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)

    filteredScrape = physcraper.FilterBlast(data_obj, ids)
    filteredScrape.add_setting_to_self(downtorank, threshold)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # Predict how many sequences should be kept: new taxa contribute all
    # their queries when below the threshold, otherwise the threshold;
    # taxa already present are only topped up to the threshold.
    count = 0
    for tax_id in filteredScrape.sp_d:
        count_dict = filteredScrape.count_num_seq(tax_id)
        if count_dict["new_taxon"]:
            if count_dict["query_count"] < threshold:
                count += count_dict["query_count"]
            if count_dict["query_count"] > threshold:
                count += threshold
        if count_dict["new_taxon"] is False:
            if count_dict["query_count"] >= 1:
                if count_dict["seq_present"] < threshold:
                    count += threshold - count_dict["seq_present"]
                if count_dict["seq_present"] > threshold:
                    count += 0
    filteredScrape.how_many_sp_to_keep(threshold, selectby)

    assert count == len(filteredScrape.filtered_seq) and count > 0
  
# #added before
# #[429489224, 429489233, 429489188]
# {'^ncbi:taxon': 1268591, '^ncbi:title': 'Senecio scopolii subsp. scopolii clone JC4715-6 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_scopolii_subsp._scopolii', '^physcraper:status': 'query', '^ot:ottId': 114544, '^ncbi:accession': 'JX895389.1', '^ncbi:gi': 429489224, '^physcraper:last_blasted': '1800/01/01'}
# {'^ncbi:taxon': 1268591, '^ncbi:title': 'Senecio scopolii subsp. scopolii clone JC4715-15 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_scopolii_subsp._scopolii', '^physcraper:status': 'query', '^ot:ottId': 114544, '^ncbi:accession': 'JX895398.1', '^ncbi:gi': 429489233, '^physcraper:last_blasted': '1800/01/01'}
# {'^ncbi:taxon': 1268580, '^ncbi:title': 'Senecio lagascanus clone JC5600-6 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_lagascanus', '^physcraper:status': 'query', '^ot:ottId': 640718, '^ncbi:accession': 'JX895353.1', '^ncbi:gi': 429489188, '^physcraper:last_blasted': '1800/01/01'}


# [u'JX895398.1', u'JX895353.1', u'JX895392.1', 'JX895513.1', 'JX895264.1']

# ## now only one scopolii
# 1268590: [{'^ncbi:taxon': 1268590, '^ncbi:title': 'Senecio scopolii subsp. floccosus 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_scopolii_subsp._floccosus', '^physcraper:status': 'query', '^ot:ottId': 114541, '^ncbi:accession': 'JX895513.1', '^ncbi:gi': 429489348, '^physcraper:last_blasted': '1800/01/01'}],
# 1268581: {'^ncbi:taxon': 1268581, '^ncbi:title': 'Senecio lopezii clone JC3604-12 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_lopezii', '^physcraper:status': 'query', '^ot:ottId': 688688, '^ncbi:accession': 'JX895264.1', '^ncbi:gi': 429489099, '^physcraper:last_blasted': '1800/01/01'}
Ejemplo n.º 7
0
def _check_csv_lines(path, num_lines=5, min_fields=2):
    """Assert the first ``num_lines`` lines after the header of the CSV at
    *path* are strings with at least ``min_fields`` comma-separated fields."""
    with open(path) as fn:
        fn.readline()  # skip the header line
        for _ in range(num_lines):
            line = fn.readline()
            assert type(line) == str
            # BUG FIX: the original asserted `line.split(",") >= 2`,
            # comparing a *list* to an int -- always True on Python 2 and a
            # TypeError on Python 3.  Compare the field count instead.
            if line:  # skip blank EOF reads so short files don't trip here
                assert len(line.split(",")) >= min_fields


def test_write_outputinfo():
    """write_out_files should emit otu_seq_info.csv and taxon_sampling.csv,
    each with at least two comma-separated columns per line."""
    workdir = "tests/output/test_write_output_files"
    configfi = "tests/data/test.config"
    downtorank = None
    absworkdir = os.path.abspath(workdir)

    fn_otu = os.path.join(absworkdir, "otu_seq_info.csv")
    fn_sampling = os.path.join(absworkdir, "taxon_sampling.csv")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)
    filteredScrape = PhyscraperScrape(data_obj, ids)
    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()

    filteredScrape.align_query_seqs()

    wrappers.write_out_files(filteredScrape, downtorank)

    # Both output CSVs get the same sanity check (was duplicated inline).
    _check_csv_lines(fn_otu)
    _check_csv_lines(fn_sampling)
Ejemplo n.º 8
0
def test_unmapped():
    """Check that unmapped='remove' yields fewer alignment taxa than the
    unmapped='keep' run, and that every kept taxon has an OTT id.

    NOTE(review): ``scraper`` and ``num_keep`` are not defined in this
    function -- they appear to be module-level leftovers from the
    unmapped='keep' test (``test_0``), so this test depends on test
    ordering.  Confirm before relying on it.
    """
    conf = physcraper.ConfigObj(configfi, interactive=False)
    # drop taxa that cannot be mapped, in contrast to the 'keep' run
    conf.unmapped = 'remove'

    data_obj = pickle.load(
        open("tests/data/precooked/otol_tiny_dataobj.p", 'rb'))
    data_obj.workdir = absworkdir

    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)

    scraper2 = physcraper.PhyscraperScrape(data_obj, ids)
    num_remove = len(scraper2.data.aln.taxon_namespace)
    dict_id = 0
    # Count taxa from the 'keep' run that carry an OTT id mapping.
    for tax in scraper.data.aln.taxon_namespace:
        if '^ot:ottId' in scraper.data.otu_dict[tax.label]:
            dict_id = dict_id + 1
    # print(num_remove, num_keep, dict_id)
    # 'remove' must have dropped at least one taxon relative to 'keep',
    # and every kept taxon must be OTT-mapped.
    assert num_remove <= num_keep - 1
    assert num_keep == dict_id
Ejemplo n.º 9
0
# Script-style smoke test: one round of an OpenTree-based scrape.
# NOTE(review): this fragment uses Python 2 `print` statements and the
# trailing `try:` block is truncated here (no except/finally visible),
# so it is not runnable as shown -- the handler presumably follows in
# the original file.
sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
conf = physcraper.ConfigObj(configfi, interactive=False)
print "1. {}".format(conf.email)


# Build the alignment/tree object from a phylesystem study.
aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                    workdir=workdir,
                                                    study_id = study_id,
                                                    tree_id = tree_id,
                                                    phylesystem_loc = conf.phylesystem_loc)



ids =  physcraper.IdDicts(conf, workdir=workdir)


print "3. {}".format(ids.config.email)


# prune, write, and run one full scrape cycle
data_obj.prune_short()
assert len(data_obj.aln) == 9
data_obj.write_files()
try:
    scraper = physcraper.PhyscraperScrape(data_obj, ids)
    scraper.run_blast_wrapper()
    scraper.read_blast_wrapper()
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()
    sys.stdout.write("\nTest opentree_scrape.py (round 1) passed\n")
Ejemplo n.º 10
0
def test_calculate_mean_sd():
    """calculate_mean_sd must match a hand-computed mean and standard
    deviation of the hsp bit scores parsed from a local blast run."""
    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)

    filteredScrape = physcraper.FilterBlast(data_obj, ids)

    # test begins
    fn = 'Senecio_scopolii_subsp._scopolii'
    general_wd = os.getcwd()
    if not os.path.exists(os.path.join(filteredScrape.workdir, "blast")):
        os.makedirs(os.path.join(filteredScrape.workdir, "blast"))

    fn_path = os.path.abspath(
        './tests/data/precooked/fixed/local-blast/{}'.format(fn))
    print(fn_path)
    # NOTE(review): assumes filteredScrape.workdir is absolute (it is set
    # from absworkdir above) so these joins survive the chdir -- confirm.
    output_blast = os.path.join(filteredScrape.workdir,
                                "blast/output_{}.xml".format(fn))
    os.chdir(os.path.join(filteredScrape.workdir, "blast"))
    try:
        # blast runs from inside the blast dir; restore cwd even on error
        local_blast.run_filter_blast(filteredScrape.workdir,
                                     fn_path,
                                     fn_path,
                                     output=output_blast)
    finally:
        os.chdir(general_wd)

    # Collect hsp bit scores from the blast XML (handle closed by `with`;
    # the original leaked it).
    hsp_scores = {}
    add_hsp = 0
    with open(output_blast) as xml_file:
        for record in NCBIXML.parse(xml_file):
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    gi = int(alignment.title.split(" ")[1])
                    hsp_scores[gi] = {
                        "hsp.bits": hsp.bits,
                        "hsp.score": hsp.score,
                        "alignment.length": alignment.length,
                        "hsp.expect": hsp.expect
                    }
                    add_hsp = add_hsp + float(hsp.bits)

    # Recompute mean and (population) standard deviation by hand and
    # compare against the library's values.
    mean_sed = local_blast.calculate_mean_sd(hsp_scores)
    sum_hsp = len(hsp_scores)
    mean = add_hsp / sum_hsp
    sd_all = 0
    for item in hsp_scores:
        val = hsp_scores[item]["hsp.bits"]
        sd_all += (val - mean) * (val - mean)
    sd_val = sqrt(sd_all / sum_hsp)

    assert round(sd_val, 4) == round(mean_sed['sd'], 4)
    assert round(mean, 4) == round(mean_sed['mean'], 4)
def test_select_seq_by_local_blast():
    """Predicted keep-count must match how_many_sp_to_keep, including the
    downward correction when local blast scores fewer hits than needed."""
    conf = physcraper.ConfigObj(configfi, interactive=False)
    with open("tests/data/precooked/tiny_dataobj.p", 'rb') as fh:
        data_obj = pickle.load(fh)
    data_obj.workdir = absworkdir
    ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
    with open("tests/data/precooked/tiny_acc_map.p", "rb") as fh:
        ids.acc_ncbi_dict = pickle.load(fh)

    filteredScrape = FilterBlast(data_obj, ids)
    filteredScrape.add_setting_to_self(downtorank, threshold)

    filteredScrape._blasted = 1
    blast_dir = "tests/data/precooked/fixed/tte_blast_files"
    filteredScrape.read_blast_wrapper(blast_dir=blast_dir)
    filteredScrape.remove_identical_seqs()
    filteredScrape.sp_dict(downtorank)
    filteredScrape.make_sp_seq_dict()

    # First pass: how many sequences *should* be added per taxon.  If the
    # threshold exceeds the number of sequences for a species, all are
    # added; existing taxa are only topped up to the threshold.
    count = 0
    for tax_id in filteredScrape.sp_d:
        count_dict = filteredScrape.count_num_seq(tax_id)
        if count_dict["new_taxon"]:
            if count_dict["query_count"] < threshold:
                count += count_dict["query_count"]
            if count_dict["query_count"] > threshold:
                count += threshold
        if count_dict["new_taxon"] is False:
            if count_dict["query_count"] >= 1:
                if count_dict["seq_present"] < threshold:
                    count += threshold - count_dict["seq_present"]
                if count_dict["seq_present"] > threshold:
                    count += 0

    # Second pass (mirrors select_seq_by_local_blast): when the local
    # blast yields fewer scored sequences than still needed, fewer are
    # actually added -- lower the expected count by the shortfall.
    for tax_id in filteredScrape.sp_d:
        count_dict = filteredScrape.count_num_seq(tax_id)
        seq_present = count_dict["seq_present"]
        query_count = count_dict["query_count"]

        seq_d = filteredScrape.sp_seq_d[tax_id]
        fn = tax_id
        count2 = seq_present
        if seq_present < threshold and query_count > 1:
            # list() so key indexing also works on Python 3, where dict
            # views are not subscriptable (original used `.keys()[0]`)
            seq_ids = list(filteredScrape.sp_seq_d[tax_id].keys())
            blast_seq_id = seq_ids[0]
            seq = filteredScrape.sp_seq_d[tax_id][blast_seq_id]
            local_blast.write_filterblast_query(filteredScrape.workdir,
                                                blast_seq_id, seq,
                                                fn=tax_id)  # blast query
            for blast_key in seq_ids[1:]:
                seq = filteredScrape.sp_seq_d[tax_id][blast_key]
                local_blast.write_filterblast_db(filteredScrape.workdir,
                                                 blast_key, seq, fn=tax_id)
            # make local blast of sequences
            local_blast.run_filter_blast(filteredScrape.workdir,
                                         tax_id, tax_id)
            seq_blast_score = local_blast.read_filter_blast(
                filteredScrape.workdir, seq_d, fn)

            if len(seq_blast_score) < (threshold - count2):
                # fewer hits available than needed: all get used, so the
                # expectation drops by the shortfall
                thres_minus = (threshold - count2) - len(seq_blast_score)
                count = count - thres_minus

    filteredScrape.how_many_sp_to_keep(selectby)
    assert count == len(filteredScrape.filtered_seq) and count > 0
  
# #added before
# #[429489224, 429489233, 429489188]
# {'^ncbi:taxon': 1268591, '^ncbi:title': 'Senecio scopolii subsp. scopolii clone JC4715-6 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_scopolii_subsp._scopolii', '^physcraper:status': 'query', '^ot:ottId': 114544, '^ncbi:accession': 'JX895389.1', '^ncbi:gi': 429489224, '^physcraper:last_blasted': '1800/01/01'}
# {'^ncbi:taxon': 1268591, '^ncbi:title': 'Senecio scopolii subsp. scopolii clone JC4715-15 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_scopolii_subsp._scopolii', '^physcraper:status': 'query', '^ot:ottId': 114544, '^ncbi:accession': 'JX895398.1', '^ncbi:gi': 429489233, '^physcraper:last_blasted': '1800/01/01'}
# {'^ncbi:taxon': 1268580, '^ncbi:title': 'Senecio lagascanus clone JC5600-6 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_lagascanus', '^physcraper:status': 'query', '^ot:ottId': 640718, '^ncbi:accession': 'JX895353.1', '^ncbi:gi': 429489188, '^physcraper:last_blasted': '1800/01/01'}


# [u'JX895398.1', u'JX895353.1', u'JX895392.1', 'JX895513.1', 'JX895264.1']

# ## now only one scopolii
# 1268590: [{'^ncbi:taxon': 1268590, '^ncbi:title': 'Senecio scopolii subsp. floccosus 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_scopolii_subsp._floccosus', '^physcraper:status': 'query', '^ot:ottId': 114541, '^ncbi:accession': 'JX895513.1', '^ncbi:gi': 429489348, '^physcraper:last_blasted': '1800/01/01'}],
# 1268581: {'^ncbi:taxon': 1268581, '^ncbi:title': 'Senecio lopezii clone JC3604-12 18S ribosomal RNA gene, partial sequence; internal transcribed spacer 1, 5.8S ribosomal RNA gene, and internal transcribed spacer 2, complete sequence; and 28S ribosomal RNA gene, partial sequence', '^ot:ottTaxonName': 'Senecio_lopezii', '^physcraper:status': 'query', '^ot:ottId': 688688, '^ncbi:accession': 'JX895264.1', '^ncbi:gi': 429489099, '^physcraper:last_blasted': '1800/01/01'}
Ejemplo n.º 12
0

# Script: write the starting tree, build the ATT object from phylesystem,
# and run one est_full_tree round in the 'treebase' workdir.
tre.write(path="before2.tre", schema="nexus")

data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                         workdir=workdir,
                                         config_obj=conf,
                                         study_id=study_id,
                                         tree_id=tree_id)

data_obj.write_files()
# BUG FIX: json.dump writes text, so the file must be opened in text mode
# ('w', not 'wb' -- a TypeError on Python 3); also close the handle.
with open('{}/otu_dict.json'.format(workdir), 'w') as fh:
    json.dump(data_obj.otu_dict, fh)

sys.stdout.write("{} taxa in alignement and tree\n".format(len(data_obj.aln)))

ids = physcraper.IdDicts(conf, workdir='treebase')

scraper = physcraper.PhyscraperScrape(data_obj, ids)
#scraper.read_blast_wrapper()
scraper.est_full_tree()
'''scraper.run_blast_wrapper()
scraper.read_blast_wrapper()
scraper.remove_identical_seqs()
scraper.write_all_unaligned(filename="combo.fas")

json.dump(data_obj.otu_dict, open('treebase/otu_dict2.json', 'wb'))


scraper.generate_streamed_alignment()