Exemple #1
0
 def make_alns_dict(self):
     """Build one dendropy alignment per gene from ``self.comb_seq``.

     Every gene in ``self.comb_seq`` must hold the same number of entries;
     this is checked with an assert. Each alignment is stored in
     ``self.aln_all`` and written as FASTA into ``self.workdir``. All
     alignments after the first reuse the taxon namespace of the first.
     """
     physcraper.debug("make_alns_dict")
     # Sanity check: all genes must contain the same number of sequences.
     expected_len = None
     for gene in self.comb_seq.keys():
         gene_len = len(self.comb_seq[gene].keys())
         if expected_len is None:
             expected_len = gene_len
         assert expected_len == gene_len
     # Keys of self.aln_all start at 1; only the first file is numbered 0.
     shared_aln = None
     for position, gene in enumerate(self.comb_seq.keys(), start=1):
         if position == 1:
             shared_aln = DnaCharacterMatrix.from_dict(self.comb_seq[gene])
             self.aln_all[position] = shared_aln
             shared_aln.write(path="{}/aln_0.fas".format(self.workdir),
                              schema="fasta")
             continue
         gene_aln = DnaCharacterMatrix.from_dict(
             self.comb_seq[gene], taxon_namespace=shared_aln.taxon_namespace)
         self.aln_all[position] = gene_aln
         gene_aln.write(path="{}/aln_{}.fas".format(self.workdir, position),
                        schema="fasta")
def create_sub_files(
    alignment_file,
    dates_file,
    subtree_file,
    subtree_dates_file,
    subfasta_file,
    new_dates_file,
):
    """Write date-suffixed copies of a tree, an alignment and a dates file.

    Every taxon name returned by ``read_dates(dates_file)`` is renamed to
    ``<taxon>_<date>`` in three outputs:

    * ``subtree_dates_file``: the text of ``subtree_file`` with ``None`` and
      ``NODE_<digits>`` tokens removed and taxon names suffixed;
    * ``subfasta_file``: the sequences of those taxa taken from
      ``alignment_file`` (FASTA), written under the new names;
    * ``new_dates_file``: a first line with the number of taxa, followed by
      one ``<new name>\\t<date>`` line per taxon.

    :param alignment_file: path to the input FASTA alignment
    :param dates_file: path handed to ``read_dates``
    :param subtree_file: path to the input tree text
    :param subtree_dates_file: output path for the relabelled tree
    :param subfasta_file: output path for the relabelled alignment
    :param new_dates_file: output path for the relabelled dates file
    """
    # presumably maps taxon name -> date string (the values are concatenated
    # with str below) -- verify against read_dates
    dates_dic = read_dates(dates_file)

    # clean up comments and add dates to end of taxon names
    with open(subtree_file, "r") as fp:
        content = fp.read().replace("None", "")
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    content = re.sub(r"NODE_\d+", "", content)
    # NOTE(review): plain substring replacement; a taxon name that is a
    # substring of another taxon name would be suffixed inside it as well.
    for taxon, date in dates_dic.items():
        content = content.replace(taxon, taxon + "_" + date)

    with open(subtree_dates_file, "w") as fp:
        fp.write(content)

    # add dates to end of sequence names
    sub_aln_dic = {}
    dna = DnaCharacterMatrix.get(path=alignment_file, schema="fasta")

    for taxon, date in dates_dic.items():
        t = dna.taxon_namespace.get_taxon(label=taxon)
        new_taxon_name = taxon + "_" + date
        sub_aln_dic[new_taxon_name] = str(dna[t])
    sub_dna = DnaCharacterMatrix.from_dict(sub_aln_dic)
    sub_dna.write(path=subfasta_file, schema="fasta")

    with open(new_dates_file, "w") as fp:
        fp.write(str(len(dates_dic)))
        for taxon, date in dates_dic.items():
            fp.write("\n" + taxon + "_" + date + "\t" + date)
Exemple #3
0
def generate_ATT_from_files(seqaln,
                            mattype,
                            workdir,
                            treefile,
                            otu_json,
                            ingroup_mrca=None):
    """Build an ATT object without phylesystem.

    If no ingroup mrca ott_id is provided, will use all taxa in tree to calc mrca.

    :param seqaln: path to the sequence alignment
    :param mattype: dendropy schema name of the alignment file
    :param workdir: working directory handed to ``AlignTreeTax``
    :param treefile: path to a newick tree
    :param otu_json: path to a json file keyed by tip label
    :param ingroup_mrca: optional OTT id of the ingroup mrca
    :return: ``AlignTreeTax`` object
    """
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_")  # Forcing all spaces to underscore UGH
    # The tree is read once, into the alignment's taxon namespace.
    tre = Tree.get(path=treefile,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)
    with open(otu_json) as data_file:
        otu_dict = json.load(data_file)
    # Every sequence label must have an entry in the otu dictionary.
    for tax in aln:
        assert tax.label in otu_dict
    otu_newick = tre.as_string(schema="newick")
    if ingroup_mrca:
        ott_mrca = int(ingroup_mrca)
    else:
        # .get must be *called*; subscripting the bound method raised TypeError.
        ott_ids = [otu_dict[otu].get('^ot:ottId') for otu in otu_dict]
        ott_mrca = get_mrca_ott(ott_ids)
    return AlignTreeTax(otu_newick, otu_dict, aln, ingroup_mrca=ott_mrca, workdir=workdir)
def getNucleotides(jsonname, fstname):
    """Load a FASTA matrix and relabel its taxa from a json accession table.

    Reads ``<jsonname>.json`` and ``../work/<fstname>.fst``. A taxon whose
    label, cut at the first ".", equals an accession key of the json table
    gets that entry's "name" (spaces replaced by underscores) as new label.
    Returns the relabelled matrix.
    """
    with open(jsonname + ".json") as handle:
        taxa = load(handle)
    fasta_path = "../work/" + fstname + ".fst"
    nucleotides = DnaCharacterMatrix.get_from_path(fasta_path, schema="fasta")
    for taxon in nucleotides.taxon_set:
        for accession, metadata in taxa.items():
            if accession != taxon.label.split(".")[0]:
                continue
            taxon.label = metadata["name"].replace(" ", "_")
    return nucleotides
def prepare_phylotorch(
    subfasta_file,
    subtree_file,
    dates_file,
    json_template_file,
    json_file,
    iterations,
    bito,
):
    """Fill a json template with taxa, dates, tree and sequences.

    The text of ``json_template_file`` is copied to ``json_file`` with the
    placeholders ``TAXA_TEMPLATE``, ``ITERATION_TEMPLATE``,
    ``ROOT_SHIFT_TEMPLATE``, ``DIM_TEMPLATE``, ``SEQUENCES_TEMPLATE`` and
    ``TREE_TEMPLATE`` substituted.

    :param subfasta_file: path to the FASTA alignment
    :param subtree_file: path to the tree file
    :param dates_file: path handed to ``read_dates``
    :param json_template_file: path to the template text
    :param json_file: output path
    :param iterations: value substituted for ``ITERATION_TEMPLATE``; converted
        with ``str`` so numbers are accepted as well as strings
    :param bito: ``"true"`` (any case, or the bool ``True``) to reference the
        alignment and tree by file path instead of embedding their content
    """
    # presumably maps taxon name -> date (values are passed to float) --
    # verify against read_dates
    dates = read_dates(dates_file)
    datess = [float(value) for value in dates.values()]
    root_shift = max(datess) - min(datess)
    taxa = [
        {
            "id": "{}".format(taxon),
            "type": "Taxon",
            "attributes": {
                "date": float(date)
            },
        }
        for taxon, date in dates.items()
    ]

    with open(json_template_file, "r") as fp:
        content = fp.read()

    content = (
        content.replace("TAXA_TEMPLATE", json.dumps(taxa))
        # str.replace only accepts strings; convert so an int works too.
        .replace("ITERATION_TEMPLATE", str(iterations))
        .replace("ROOT_SHIFT_TEMPLATE", str(root_shift))
        .replace("DIM_TEMPLATE", str(len(datess) - 2))
    )
    if str(bito).lower() == "true":
        # Point the template at the files and switch the matching keys to "file".
        content = (
            content.replace("SEQUENCES_TEMPLATE", '"' + subfasta_file + '"')
            .replace("TREE_TEMPLATE", '"' + subtree_file + '"')
            .replace('"newick"', '"file"')
            .replace('"sequences"', '"file"')
        )
    else:
        # Embed the newick string and the sequences themselves.
        with open(subtree_file, "r") as fp:
            newick = fp.read().strip()
        alignment = DnaCharacterMatrix.get(path=subfasta_file, schema="fasta")
        sequences = []
        for name in alignment:
            sequences.append({
                "taxon": str(name).strip("'"),
                "sequence": str(alignment[name])
            })

        content = (
            content.replace("SEQUENCES_TEMPLATE", json.dumps(sequences))
            .replace("TREE_TEMPLATE", '"' + newick + '"')
        )

    with open(json_file, "w") as fp:
        fp.write(content)
Exemple #6
0
def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi):
    """Run physcraper rounds until the scraper stops asking for a repeat.

    If ``<workdir>/scrape.p`` exists the scraper is unpickled from it;
    otherwise a new scraper is built from the phylesystem study/tree and the
    alignment, and a first blast/read/dedupe/align round is run. Rounds are
    then repeated while ``scraper.repeat == 1``.

    :param study_id: phylesystem study id
    :param tree_id: phylesystem tree id
    :param seqaln: path to the sequence alignment
    :param mattype: dendropy schema name of the alignment
    :param workdir: working directory
    :param configfi: path to the configuration file
    """
    checkpoint = "{}/scrape.p".format(workdir)
    if os.path.isfile(checkpoint):
        sys.stdout.write("Readloading from pickled scrapefile")
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(checkpoint, 'rb') as handle:
            scraper = pickle.load(handle)
        scraper.repeat = 1
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        # read the config file into a configuration object
        conf = ConfigObj(configfi)
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc)

        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()

        data_obj.write_files()
        data_obj.write_labelled()

        # Mapping identifiers between OpenTree and NCBI requires an identifier dict object
        # NOTE(review): workdir is hard-coded to "example" rather than the
        # workdir argument -- confirm this is intended.
        ids = IdDicts(conf, workdir="example")

        # Now combine the data, the ids, and the configuration into a single physcraper scrape object
        scraper = PhyscraperScrape(data_obj, ids, conf)
        # run the analyses
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
    while scraper.repeat == 1:
        scraper.run_blast()
        scraper.read_blast()
        scraper.remove_identical_seqs()
        scraper.generate_streamed_alignment()
Exemple #7
0
def generate_ATT_from_files(seqaln,
                            mattype,
                            workdir,
                            config_obj,
                            treefile,
                            otu_json,
                            schema_trf,
                            ingroup_mrca=None):
    """Build an ATT object without phylesystem, use your own files instead.

    Spaces vs underscores kept being an issue, so all spaces are coerced to underscores when data are read in.
    The working directory is created if it does not exist.

    Note: has test -> test_owndata.py

    :param seqaln: path to sequence alignment
    :param mattype: string containing format of sequence alignment
    :param workdir: path to working directory
    :param config_obj: config class including the settings
    :param treefile: path to phylogeny
    :param otu_json: path to json file containing the translation of tip names to taxon names, generated with OtuJsonDict()
    :param schema_trf: string defining the format of the input phylogeny
    :param ingroup_mrca: optional - OToL ID of the mrca of the clade of interest. If no ingroup mrca ott_id is provided, will use all taxa in tree to calc mrca.

    :return: object of class ATT
    """
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
    assert aln.taxon_namespace
    for tax in aln.taxon_namespace:
        tax.label = tax.label.replace(" ", "_")  # Forcing all spaces to underscore
    tre = Tree.get(path=treefile,
                   schema=schema_trf,
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)
    assert tre.taxon_namespace is aln.taxon_namespace, "tre and aln have not the same namespace."
    otu_newick = tre.as_string(schema=schema_trf)
    # Context manager so the json file handle is closed deterministically.
    with open(otu_json, "r") as otu_file:
        otu_dict = json.load(otu_file)
    if ingroup_mrca:
        mrca_ott = int(ingroup_mrca)
    else:
        # Unique, non-empty OTT ids of all otus; entries without an id are dropped.
        ott_ids = {otu_dict[otu].get(u'^ot:ottId') for otu in otu_dict}
        ott_ids = {ott_id for ott_id in ott_ids if ott_id}
        mrca_ott = get_mrca_ott(ott_ids)
    return AlignTreeTax(otu_newick, otu_dict, aln, ingroup_mrca=mrca_ott, workdir=workdir,
                        config_obj=config_obj, schema=schema_trf)
Exemple #8
0
def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing (and thereby flushing) the file."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def test_0():
    """Load the precooked scraper fixtures, or build and pickle them.

    When ``tests/data/precooked/otol_scraper.p`` exists, the data object,
    the id map and the scraper are unpickled. Otherwise they are generated
    from phylesystem and written to ``tests/data/precooked``.
    """
    if os.path.isfile("tests/data/precooked/otol_scraper.p"):
        conf = physcraper.ConfigObj(configfi, interactive=False)
        conf.unmapped = 'keep'
        data_obj = _load_pickle("tests/data/precooked/otol_tiny_dataobj.p")
        data_obj.workdir = absworkdir
        ids = physcraper.IdDicts(conf, workdir=data_obj.workdir)
        ids.acc_ncbi_dict = _load_pickle("tests/data/precooked/otol_tiny_gi_map.p")
        scraper = _load_pickle("tests/data/precooked/otol_scraper.p")
        num_keep = len(scraper.data.aln.taxon_namespace)
    else:
        sys.stdout.write("\n\n No files present\n\n")
        conf = physcraper.ConfigObj(configfi)
        conf.unmapped = 'keep'
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        data_obj = physcraper.generate_ATT_from_phylesystem(
            aln=aln,
            workdir=workdir,
            study_id=study_id,
            tree_id=tree_id,
            phylesystem_loc=conf.phylesystem_loc)
        _dump_pickle(data_obj, "tests/data/precooked/otol_tiny_dataobj.p")
        ids = physcraper.IdDicts(conf, workdir=workdir)
        _dump_pickle(ids.acc_ncbi_dict, "tests/data/precooked/otol_tiny_gi_map.p")
        data_obj.write_files()
        scraper = physcraper.PhyscraperScrape(data_obj, ids)
        _dump_pickle(scraper.config, "tests/data/precooked/otol_conf.p")
        _dump_pickle(scraper, "tests/data/precooked/otol_scraper.p")
        num_keep = len(scraper.data.aln.taxon_namespace)
Exemple #9
0
def read_tree_and_alignment(tree, alignment, dated=True, heterochornous=True):
    """Read a tree and an alignment that share one taxon namespace.

    The tree is read with ``read_tree``. The alignment is parsed as FASTA
    when its first line starts with ``>`` and as NEXUS otherwise. If the
    number of sequences differs from the size of the shared namespace, a
    message is written to stderr and the process exits with status 2.

    :param tree: passed through to ``read_tree``
    :param alignment: path to the alignment file
    :param dated: passed through to ``read_tree``
    :param heterochornous: passed through to ``read_tree``
    :return: tuple ``(tree, dna)``
    """
    tree = read_tree(tree, dated, heterochornous)

    # alignment
    seqs_args = dict(schema='nexus', preserve_underscores=True)
    with open(alignment) as fp:
        # Default of '' keeps an empty file from raising StopIteration here.
        if next(fp, '').startswith('>'):
            seqs_args = dict(schema='fasta')
    dna = DnaCharacterMatrix.get(path=alignment,
                                 taxon_namespace=tree.taxon_namespace,
                                 **seqs_args)
    sequence_count = len(dna)
    if sequence_count != len(dna.taxon_namespace):
        sys.stderr.write('taxon names in trees and alignment are different')
        # sys.exit rather than the site-provided interactive helper exit().
        sys.exit(2)
    return tree, dna
Exemple #10
0
def load_otol_data(conf, ingroup_mrca, mattype, seqaln, study_id, tree_id,
                   workdir):
    """
    Generates ATT object from OToL data.

    If ``<workdir>/att_checkpoint.p`` exists the object is unpickled from it
    instead of being regenerated.

    :param conf: conf object from physcraper
    :param ingroup_mrca: mrca of ingroup as OTT ID
    :param mattype: alignment matrix type
    :param seqaln: alignment file name
    :param study_id: OToL study ID
    :param tree_id: OToL tree ID
    :param workdir: working directory
    :return: ATT object
    """
    checkpoint = "{}/att_checkpoint.p".format(workdir)
    if os.path.isfile(checkpoint):
        sys.stdout.write("Reloading data object from pickle file\n")
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(checkpoint, "rb") as handle:
            data_obj = pickle.load(handle)
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()

        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate an linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(
            aln=aln,
            workdir=workdir,
            config_obj=conf,
            study_id=study_id,
            tree_id=tree_id,
            phylesystem_loc=conf.phylesystem_loc,
            ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold
        # This is particularly important when using loci that have been de-concatenated,
        # as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()
    assert isinstance(data_obj, AlignTreeTax)
    return data_obj
Exemple #11
0
 def concatenate_alns(self):
     """Join every alignment in ``self.aln_all`` into a single alignment.

     Each input alignment is written to ``<workdir>/aln<N>.fas`` (N counting
     from 1), the result to ``<workdir>/concat.fas``, and the result is
     stored in ``self.concatenated_aln``. All alignments must share the
     taxon namespace of the first one (asserted).
     """
     physcraper.debug("concat alns")
     for number, gene in enumerate(self.aln_all, start=1):
         current = self.aln_all[gene]
         if number == 1:
             # The first alignment seeds the running concatenation.
             combined = current
             combined.write(path="{}/aln1.fas".format(self.workdir),
                            schema="fasta")
             continue
         current.write(path="{}/aln{}.fas".format(self.workdir, number),
                       schema="fasta")
         assert combined.taxon_namespace == current.taxon_namespace
         combined = DnaCharacterMatrix.concatenate([combined, current])
     combined.write(path="{}/concat.fas".format(self.workdir), schema="fasta")
     self.concatenated_aln = combined
Exemple #12
0
def test_generate_ATT_from_phylesystem():
    """Check that an ATT object can be generated from a phylesystem study."""
    study_id = "pg_873"
    tree_id = "tree1679"
    seqaln = "tests/data/minitest.fas"
    mattype = "fasta"
    workdir = "tests/output/opentree"
    configfi = "tests/data/remotencbi.config"

    sys.stdout.write("\nTesting 'generate_ATT_from_phylesystem'\n")

    conf = physcraper.ConfigObj(configfi, interactive=False)
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

    data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                        workdir=workdir,
                                                        config_obj=conf,
                                                        study_id=study_id,
                                                        tree_id=tree_id)

    # The former bare comparison `data_obj == True` asserted nothing.
    assert data_obj is not None
Exemple #13
0
 def write_labelled(self, label='^ot:ottTaxonName', treepath="labelled.tre", alnpath="labelled.fas"):
     """Write copies of the tree and alignment with human readable tip labels.

     For each tip the new label is the ``label`` entry of its otu_dict
     record; if that is missing or empty, "^ot:originalLabel" is used, then
     "ncbi <^ncbi:taxon>". A tip with none of these keeps its label. A label
     that was already used gets the tip's current label appended.
     Works on re-parsed copies of the tree and alignment, so it is
     NOT MEMORY EFFICIENT AT ALL.
     """
     assert label in ['^ot:ottTaxonName', "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
     # Round-trip through strings to get independent copies sharing one namespace.
     tmp_tre = Tree.get(data=self.tre.as_string(schema="newick"),
                        schema="newick",
                        preserve_underscores=True)
     tmp_aln = DnaCharacterMatrix.get(data=self.aln.as_string(schema="fasta"),
                                      schema="fasta",
                                      taxon_namespace=tmp_tre.taxon_namespace)
     used_labels = set()
     for taxon in tmp_tre.taxon_namespace:
         otu_info = self.otu_dict[taxon.label]
         chosen = otu_info.get(label)
         if not chosen:
             chosen = otu_info.get("^ot:originalLabel")
         if not chosen:
             ncbi_id = otu_info.get("^ncbi:taxon")
             if not ncbi_id:
                 # Nothing to relabel with: leave this tip untouched.
                 continue
             chosen = " ".join(["ncbi", str(ncbi_id)])
         if chosen in used_labels:
             chosen = " ".join([chosen, taxon.label])
         used_labels.add(chosen)
         taxon.label = chosen
     tree_out = "{}/{}".format(self.workdir, treepath)
     tmp_tre.write(path=tree_out,
                   schema="newick",
                   unquoted_underscores=True,
                   suppress_edge_lengths=False)
     aln_out = "{}/{}".format(self.workdir, alnpath)
     tmp_aln.write(path=aln_out,
                   schema="fasta")
Exemple #14
0
def test_opentree():
    """Generate an ATT object from OpenTree phylesystem identifiers."""
    sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
    conf = physcraper.ConfigObj("tests/data/remotencbi.config",
                                interactive=False)

    aln = DnaCharacterMatrix.get(path="tests/data/minitest.fas",
                                 schema="fasta")
    # Study and tree are addressed by their phylesystem identifiers.
    data_obj = physcraper.generate_ATT_from_phylesystem(
        aln=aln,
        workdir="tests/output/opentree",
        config_obj=conf,
        study_id="pg_873",
        tree_id="tree1679",
        phylesystem_loc=conf.phylesystem_loc)
    assert isinstance(data_obj, AlignTreeTax)
Exemple #15
0
 def align_query_seqs(self, papara_runname="extended"):
     """Run papara on the tree, the alignment and the new query sequences.

     Existing ``papara*`` files in the working directory are renamed with a
     ``_tmp`` suffix, papara is executed inside ``self.workdir``, and the
     resulting ``papara_alignment.<papara_runname>`` replaces
     ``self.data.aln``.

     :param papara_runname: run name passed to papara with ``-n``
     """
     if not self._query_seqs_written:
         self.write_query_seqs()
     for filename in glob.glob('{}/papara*'.format(self.workdir)):
         # basename instead of split("/")[1]: also correct when workdir is
         # nested, absolute, or ends with a slash.
         os.rename(filename, "{}/{}_tmp".format(self.workdir, os.path.basename(filename)))
     sys.stdout.write("aligning query sequences \n")
     self.data.write_papara_files()
     # Run papara with workdir as its cwd rather than chdir-ing this process:
     # chdir(workdir) followed by chdir('..') only returned to the starting
     # directory when workdir was a single relative path component.
     subprocess.call(["papara",
                      "-t", "random_resolve.tre",
                      "-s", "aln_ott.phy",
                      "-q", self.newseqs_file,
                      "-n", papara_runname],
                     cwd=self.workdir)
     sys.stdout.write("Papara done")
     papara_aln = "{}/papara_alignment.{}".format(self.workdir, papara_runname)
     assert os.path.exists(papara_aln)
     self.data.aln = DnaCharacterMatrix.get(path=papara_aln, schema="phylip")
     self.data.aln.taxon_namespace.is_mutable = False  # This should enforce name matching throughout...
     with open(self.logfile, "a") as log:
         log.write("Following papara alignement, aln has {} seqs \n".format(len(self.data.aln)))
     self.data.reconcile()
     self._query_seqs_aligned = 1
ncbi_to_ott = {}
fi =open(ott_ncbi)

#pickle meeeee
for lin in fi:
    lii= lin.split(",")
    ncbi_to_ott[int(lii[1])]=int(lii[0])

gi_ncbi_map = {}
if os.path.isfile("id_map.txt"):
    fi = open("id_map.txt")
    for lin in fi:
        gi_ncbi_map[int(lin.split(",")[0])]=lin.split(",")[1]


orig_seq = DnaCharacterMatrix.get(path="accs",schema="fasta")

#prune out identical sequences

mapped_taxon_ids=open("id_map.txt","a")
stops = []
for taxon, seq in orig_seq.items():
    gi = int(taxon.label.split('|')[1])
    if gi in gi_ncbi_map.keys():
        try:
            taxon.label = ncbi_to_ott[int(gi_ncbi_map[gi])]
        except:
            taxon.label = "ncbi_id_{}".format(int(gi_ncbi_map[gi]))
    else:
        try:
            ncbi_id = int(subprocess.check_output(["bash", get_ncbi_taxonomy, "{}".format(gi), "{}".format(ncbi_dmp)]).split('\t')[1])
Exemple #17
0
def run(arg):
    """Assemble the Stan data dictionary from the inputs and run the model.

    Reads the tree (newick or nexus, detected from the first line), and
    optionally a sequence alignment (``arg.input``) and a tab-separated
    metadata file (``arg.metadata``). The Stan model in ``arg.script`` is
    compiled, or loaded from a pickled ``.pkl`` next to it, and then run with
    ``arg.algorithm`` ('LBFGS', 'VB' or sampling otherwise).

    The process exits with status 2 when the alignment and tree taxa differ
    and with status 3 when the tree has a negative branch length.

    :param arg: parsed command-line namespace
    """
    taxa = dendropy.TaxonNamespace()

    # Detect the tree format from the first line of the file.
    tree_format = 'newick'
    with open(arg.tree.name) as fp:
        if next(fp).upper().startswith('#NEXUS'):
            tree_format = 'nexus'

    tree = Tree.get(
        file=arg.tree,
        schema=tree_format,
        tree_offset=0,
        taxon_namespace=taxa,
        preserve_underscores=True,
        rooting='force-rooted',
    )

    tree.resolve_polytomies(update_bipartitions=True)

    utils.setup_indexes(tree)

    oldest = utils.setup_dates(tree, arg.dates, arg.heterochronous)

    peeling = utils.get_peeling_order(tree)
    sequence_count = len(tree.taxon_namespace)
    data = {'peel': peeling, 'S': sequence_count}

    if arg.input:
        # FASTA if the first line starts with '>', NEXUS otherwise.
        seqs_args = dict(schema='nexus', preserve_underscores=True)
        with open(arg.input.name) as fp:
            if next(fp).startswith('>'):
                seqs_args = dict(schema='fasta')

        dna = DnaCharacterMatrix.get(file=arg.input,
                                     taxon_namespace=taxa,
                                     **seqs_args)
        alignment_length = dna.sequence_size
        sequence_count = len(dna)
        # The alignment was read into the tree's namespace; a size mismatch
        # means some names did not match.
        if sequence_count != len(dna.taxon_namespace):
            sys.stderr.write(
                'taxon names in trees and alignment are different')
            sys.exit(2)

        print('Number of sequences: {} length {} '.format(
            sequence_count, alignment_length))
        print('Model: ' + arg.model)

        tipdata, weights = utils.get_dna_leaves_partials_compressed(dna)
        alignment_length = len(weights)

        data.update({
            'tipdata': tipdata,
            'L': alignment_length,
            'weights': weights
        })

    if arg.metadata:
        # Parse metadata file: first column is the taxon label, the column
        # named arg.metadata_key holds the state of that taxon.
        geodata = {}
        geopattern = []
        with open(arg.metadata) as fp:
            header = next(fp).strip().split('\t')
            index_country = header.index(arg.metadata_key)

            for line in fp:
                line = line.strip()
                # split() never returns an empty list, so blank lines have to
                # be skipped explicitly.
                if not line:
                    continue
                row = line.split('\t')
                geodata[row[0]] = row[index_country]

        # Number the states in order of first appearance among the taxa.
        country_to_index = {}
        index_to_country = []
        for taxon in tree.taxon_namespace:
            country = geodata[taxon.label]
            if country not in country_to_index:
                country_to_index[country] = len(country_to_index)
                index_to_country.append(country)

        print('"' + '","'.join(index_to_country) + '"')

        state_count = len(country_to_index)

        # One-hot state pattern per taxon.
        for taxon in tree.taxon_namespace:
            pattern = [0] * state_count
            country = geodata[taxon.label]
            pattern[country_to_index[country]] = 1
            geopattern.append(pattern)

        blens = [None] * (sequence_count * 2 - 1)
        for node in tree.postorder_node_iter():
            blens[node.index - 1] = node.edge.length
            if node.edge.length < 0:
                sys.exit(3)

        children = tree.seed_node.child_nodes()
        blens[children[0].index] += blens[children[1].index]
        blens = blens[:-2]  # discard root branch and one of its child

        data['STATES'] = state_count
        data['blens'] = blens
        data['frequencies_alpha_geo'] = [1] * state_count
        data['rates_alpha_geo'] = [1] * int(state_count *
                                            (state_count - 1) / 2)
        data['geodata'] = geopattern

    if arg.clock is not None:
        data['map'] = utils.get_preorder(tree)
        if not arg.estimate_rate:
            data['rate'] = arg.rate if arg.rate else 1.0
        if arg.heterochronous:
            data['lowers'] = utils.get_lowers(tree)
            data['lower_root'] = max(oldest, arg.lower_root)
        else:
            data['lower_root'] = arg.lower_root
    else:
        # Without a clock, order the first two entries of the last peeling step.
        last = peeling[-1]
        if last[0] > last[1]:
            peeling[-1] = [last[1], last[0], last[2]]

    if arg.categories > 1:
        data['C'] = arg.categories
        if arg.invariant:
            data['C'] += 1

    if arg.clock is not None:
        if arg.coalescent == 'skygrid':
            data['G'] = arg.grid - 1
            data['grid'] = np.linspace(0, arg.cutoff, arg.grid)[1:]
        elif arg.coalescent == 'skyride':
            # number of coalescent intervals
            data['I'] = sequence_count - 1

    if arg.model == 'GTR':
        data['frequencies_alpha'] = [1, 1, 1, 1]
        data['rates_alpha'] = [1, 1, 1, 1, 1, 1]
    elif arg.model == 'HKY':
        data['frequencies_alpha'] = [1, 1, 1, 1]

    # Samples output file
    sample_path = arg.output
    tree_path = sample_path + '.trees'

    # The compiled model is cached next to the script as a .pkl file.
    binary = arg.script.replace('.stan', '.pkl')
    if binary == arg.script:
        binary = arg.script + '.pkl'
    if not os.path.lexists(binary) or arg.compile:
        sm = pystan.StanModel(file=arg.script)
        with open(binary, 'wb') as f:
            pickle.dump(sm, f)
    else:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(binary, 'rb') as f:
            sm = pickle.load(f)

    stan_args = {
        'data': data,
        'iter': arg.iter,
        'sample_file': sample_path,
        'algorithm': arg.algorithm,
    }
    if hasattr(arg, 'seed'):
        stan_args['seed'] = arg.seed

    if arg.init is not None:
        # Each non-empty line has the form "name: v1,v2,...".
        inits = {}
        for line in arg.init:
            line = line.strip()
            # This used to test `row`, a name that is undefined here unless a
            # metadata file happened to be parsed earlier.
            if len(line) > 0:
                line = line.split(':')
                inits[line[0].strip()] = list(map(float, line[1].split(',')))
        stan_args['init'] = inits
    elif arg.heights_init or arg.rate is not None:
        inits = {}
        if arg.heights_init:
            ratios, root_height = utils.ratios_root_height_from_branch_lengths(
                tree)
            inits['props'] = ratios.tolist()
            inits['height'] = root_height.item() - data['lower_root']
            inits['rate'] = arg.rate
        elif arg.rate is not None:
            inits['rate'] = arg.rate
        stan_args['init'] = inits

    if arg.algorithm == 'LBFGS':
        fit = sm.optimizing(**stan_args)
        print(fit)
    elif arg.algorithm == 'VB':
        stan_args['algorithm'] = arg.variational
        stan_args['output_samples'] = arg.samples
        if arg.eta:
            stan_args['eta'] = arg.eta
            stan_args['adapt_engaged'] = False

        fit = sm.vb(tol_rel_obj=arg.tol_rel_obj,
                    elbo_samples=arg.elbo_samples,
                    grad_samples=arg.grad_samples,
                    diagnostic_file=sample_path + ".diag",
                    **stan_args)

        # parse the log file
        utils.convert_samples_to_nexus(tree, sample_path, tree_path, arg.rate)
        utils.parse_log(sample_path, 0.05)
    else:
        fit = sm.sampling(chains=arg.chains, thin=arg.thin, **stan_args)

        # chain=1 pystan uses sample_file
        if arg.chains == 1:
            if sample_path.endswith('.csv'):
                tree_path = sample_path.replace('.csv', '.trees')
            utils.convert_samples_to_nexus(tree, sample_path, tree_path,
                                           arg.rate)
            utils.parse_log(sample_path, 0.05)
        # chain>1 pystan appends _{chain}.csv to sample_file
        else:
            for chain in range(arg.chains):
                sample_path_chain = sample_path + '_{}.csv'.format(chain)
                tree_path_chain = sample_path + '_{}.trees'.format(chain)
                utils.convert_samples_to_nexus(tree, sample_path_chain,
                                               tree_path_chain, arg.rate)
                utils.parse_log(sample_path_chain, 0.05)
Exemple #18
0
    def write_labelled(self, label, filename = "labelled", direc='workdir', norepeats=True, add_gb_id=False):
        """output tree and alignment with human readable labels
        Jumps through a bunch of hoops to make labels unique.

        NOT MEMORY EFFICIENT AT ALL

        Has different options available for different desired outputs

        :param label: which information shall be displayed in labelled files: possible options:
                    '^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
                    "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"
        :param filename: optional: file name stub; ".tre" and ".fas" are appended for
                    the phylogeny and the alignment
        :param direc: optional: output directory; the default 'workdir' means self.workdir
        :param norepeats: optional: if there shall be no duplicate names in the labelled output files
        :param add_gb_id: optional, to supplement tiplabel with corresponding GenBank sequence identifier
        :return: writes out labelled phylogeny and alignment to file
        """
        if direc == 'workdir':
            direc = self.workdir
        treepath = "{}/{}".format(direc, "{}.tre".format(filename))
        alnpath = "{}/{}".format(direc, '{}.fas'.format(filename))
        debug(treepath)
        assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
                         "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
        # Round-trip through strings to get copies that share one namespace.
        tmp_newick = self.tre.as_string(schema="newick")
        tmp_tre = Tree.get(data=tmp_newick,
                           schema="newick",
                           preserve_underscores=True)
        tmp_fasta = self.aln.as_string(schema="fasta")
        tmp_aln = DnaCharacterMatrix.get(data=tmp_fasta,
                                         schema="fasta",
                                         taxon_namespace=tmp_tre.taxon_namespace)
        new_names = set()
        for taxon in tmp_tre.taxon_namespace:
            otu_info = self.otu_dict[taxon.label]
            new_label = otu_info.get(label, None)
            if new_label is None:
                # Fall back to the original label, then to ncbi id / taxon name.
                if otu_info.get("^ot:originalLabel"):
                    new_label = "orig_{}".format(otu_info["^ot:originalLabel"])
                else:
                    new_label = "ncbi_{}_ottname_{}".format(otu_info.get("^ncbi:taxon", "unk"),
                                                            otu_info.get('^physcraper:TaxonName', "unk"))
            new_label = str(new_label).replace(' ', '_')
            if add_gb_id:
                gb_id = otu_info.get('^ncbi:accession')
                if gb_id is None:
                    gb_id = otu_info.get("^ot:originalLabel")
                new_label = "_".join([new_label, str(gb_id)])
                if norepeats and new_label in new_names:
                    # Append the first free counter (2, 3, ...). The counter used
                    # to be reset on every taxon, so a third identical label
                    # collided with the second one.
                    sp_counter = 2
                    candidate = "_".join([new_label, str(sp_counter)])
                    while candidate in new_names:
                        sp_counter += 1
                        candidate = "_".join([new_label, str(sp_counter)])
                    new_label = candidate
            elif norepeats and new_label in new_names:
                new_label = "_".join([new_label, taxon.label])
            taxon.label = new_label
            new_names.add(new_label)
        tmp_tre.write(path=treepath,
                      schema="newick",
                      unquoted_underscores=True,
                      suppress_edge_lengths=False)
        tmp_aln.write(path=alnpath,
                      schema="fasta")
Exemple #19
0
import sys
from dendropy import DnaCharacterMatrix
# Usage: <script> <alignment.nex> <output stub> <start> <stop>
nexus_path = sys.argv[1]
out_prefix = sys.argv[2]
col_from = int(sys.argv[3])
col_to = int(sys.argv[4])

full_matrix = DnaCharacterMatrix.get(path=nexus_path, schema="nexus")

# Keep only the slice [col_from:col_to] of every sequence, keyed by the
# taxon label.
trimmed = {
    taxon.label: seq.values()[col_from:col_to]
    for taxon, seq in full_matrix.items()
}

# Write the trimmed alignment next to the requested stub as FASTA.
out_path = "{}.fas".format(out_prefix)
DnaCharacterMatrix.from_dict(trimmed).write(path=out_path, schema="fasta")
Exemple #20
0
import argparse
from dendropy.calculate import popgenstat
from dendropy import DnaCharacterMatrix

if __name__ == "__main__":
    # Command-line entry point: read an aligned FASTA file and write the
    # nucleotide diversity of its sequences to a text file.
    parser = argparse.ArgumentParser(
        description="find nucleotide diversity of a population",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Both options are mandatory; marking them as required makes argparse
    # print a usage error instead of failing later on a None path.
    parser.add_argument(
        "--alignment",
        required=True,
        help="an aligned FASTA file to create a DNA character matrix from")
    parser.add_argument(
        "--output",
        required=True,
        help="outputting a txt file with the nucleotide_diversity value")

    args = parser.parse_args()

    d = DnaCharacterMatrix.get(path=args.alignment, schema="fasta")

    # Compute the statistic before opening the output file so that a failure
    # here does not leave an empty output file behind.
    diversity = popgenstat.nucleotide_diversity(d, ignore_uncertain=True)

    with open(args.output, 'w') as f:
        f.write(str(diversity))
Exemple #21
0
import sys
from dendropy import DnaCharacterMatrix
# Positional arguments: NEXUS input, output stub, start and stop index.
source_file = sys.argv[1]
target_stub = sys.argv[2]
begin = int(sys.argv[3])
end = int(sys.argv[4])

source_matrix = DnaCharacterMatrix.get(path=source_file, schema="nexus")

# Cut every sequence down to [begin:end] and index the pieces by taxon label.
window = dict(
    (taxon.label, seq.values()[begin:end])
    for taxon, seq in source_matrix.items()
)

# Emit the windowed alignment as "<stub>.fas".
DnaCharacterMatrix.from_dict(window).write(
    path="{}.fas".format(target_stub), schema="fasta")
Exemple #22
0
from dendropy import Tree, DnaCharacterMatrix
import sys

# Module-level dict of sequences keyed by label (see seq_dict_build).
d = {}

# Alignment read from a hard-coded FASTA file name, resolved relative to the
# current working directory.
query_seq = DnaCharacterMatrix.get(path="ascomycota.fasta", schema="fasta")


def seq_dict_build(seq, label, seq_dict):
    """Add ``seq`` to ``seq_dict`` unless a stored sequence already contains it.

    Sequences are compared after removing gap characters ("-"):

    * if ``seq`` is a substring of a longer stored sequence, nothing is added;
    * if a stored sequence of equal or shorter length is a substring of
      ``seq``, that entry is removed and ``seq`` is stored under ``label``;
    * otherwise ``seq`` is simply stored under ``label``.

    Only the first stored sequence that matches either rule is acted upon.

    Args:
        seq: object exposing ``symbols_as_string()``.
        label: key under which ``seq`` is stored.
        seq_dict: dict mapping labels to sequence objects; modified in place.

    Returns:
        None.
    """
    new_seq = seq.symbols_as_string().replace("-", "")
    # Iterate over a snapshot of the keys: the dict is modified in the loop.
    for tax in list(seq_dict):
        inc_seq = seq_dict[tax].symbols_as_string().replace("-", "")
        if len(inc_seq) > len(new_seq):
            if new_seq in inc_seq:
                sys.stdout.write(
                    "seq {} is subsequence of {}, not added\n".format(
                        label, tax))
                return
        elif inc_seq in new_seq:
            # Work on the dict that was passed in rather than on a global, so
            # the function behaves the same for any caller-supplied dict.
            del seq_dict[tax]
            seq_dict[label] = seq
            sys.stdout.write(
                "seq {} is supersequence of {}, {} added and {} removed\n".
                format(label, tax, label, tax))
            return
    print(".")
    seq_dict[label] = seq
    return

Exemple #23
0
#! /usr/bin/env python
from dendropy import DnaCharacterMatrix, Tree
import sys

# Command-line arguments: alignment path, alignment schema, tree path,
# tree schema, and a fifth value kept in `nam`.
mat = sys.argv[1]
mattype = sys.argv[2]
tre = sys.argv[3]
tretype = sys.argv[4]
nam = sys.argv[5]

# NOTE(review): these hard-coded values replace the first four command-line
# arguments read above -- presumably leftover test settings; confirm before
# relying on the CLI values.
mat = 'example.aln'
mattype = 'fasta'
tre = 'tree.tre'
tretype = 'newick'

d = DnaCharacterMatrix.get(path=mat, schema=mattype)
# make the taxon_namespace immutable, so the tree does not add
#   new labels...
d.taxon_namespace.is_mutable = False
tree = Tree.get(path=tre,
                schema=tretype,
                preserve_underscores=True,
                taxon_namespace=d.taxon_namespace)

# get all of the taxa associated with tips of the tree, and make sure that
#   they include all of the members of the data's taxon_namespace...
treed_taxa = [i.taxon for i in tree.leaf_nodes()]
if len(treed_taxa) != len(d.taxon_namespace):
    # Collect the labels present in the alignment but absent from the tree
    # tips and format them into an error message.
    missing = [i.label for i in d.taxon_namespace if i not in treed_taxa]
    emf = 'Some of the taxa are not in the tree. Missing "{}"\n'
    em = emf.format('", "'.join(missing))
Exemple #24
0
def _assert_deleted_in_reconciliation(data_obj, original_label):
    """Assert that OTUs with the given original label were marked as deleted.

    Every entry of ``data_obj.otu_dict`` whose ``^ot:originalLabel`` equals
    ``original_label`` must have the status "deleted in reconciliation".
    If no entry carries that label, nothing is asserted.
    """
    for otu in data_obj.otu_dict:
        otu_info = data_obj.otu_dict[otu]
        if otu_info[u'^ot:originalLabel'] == original_label:
            assert otu_info[
                '^physcraper:status'] == "deleted in reconciliation"


def _taxa_in_only_one_source(seqaln, mattype, treefile):
    """Return the taxa found in the alignment or in the tree, but not both.

    The alignment is read first, spaces in its taxon labels are replaced by
    underscores, and the tree is then read into the same taxon namespace.
    """
    aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)

    assert aln.taxon_namespace
    for tax in aln.taxon_namespace:
        # Forcing all spaces to underscore UGH
        tax.label = tax.label.replace(" ", "_")

    tre = Tree.get(path=treefile,
                   schema="newick",
                   preserve_underscores=True,
                   taxon_namespace=aln.taxon_namespace)

    # The tree must share (not merely equal) the alignment's namespace.
    assert aln.taxon_namespace == tre.taxon_namespace
    assert aln.taxon_namespace is tre.taxon_namespace

    treed_taxa = {leaf.taxon for leaf in tre.leaf_nodes()}
    aln_tax = {tax for tax, _ in aln.items()}
    return treed_taxa ^ aln_tax


def test_reconcile():
    """Check reconciliation of alignment and tree when one side lacks a taxon.

    Three data objects are built (sequence missing, tip missing, both
    missing) and the affected OTUs must be flagged as deleted.  In between,
    the same mismatch is verified directly on the DendroPy objects.
    """
    seqaln = "tests/data/tiny_test_example/test.fas"
    seqalnmiss = "tests/data/tiny_test_example/test_missingseq.fas"
    mattype = "fasta"
    treefile = "tests/data/tiny_test_example/test.tre"
    treefilemiss = "tests/data/tiny_test_example/test_missingtip.tre"
    schema_trf = "newick"
    workdir = "tests/output/owndata"
    configfi = "example.config"
    # Fixed path; it is not derived from workdir.
    otu_jsonfi = "tests/data/tmp/owndata/otu_dict.json"

    conf = ConfigObj(configfi, interactive=False)

    def build_data_obj(alignment, tree):
        # All cases share every setting except the two input files.
        return generate_ATT_from_files(seqaln=alignment,
                                       mattype=mattype,
                                       workdir=workdir,
                                       config_obj=conf,
                                       treefile=tree,
                                       schema_trf=schema_trf,
                                       otu_json=otu_jsonfi,
                                       ingroup_mrca=None)

    # Sequence missing from the alignment.
    data_obj = build_data_obj(seqalnmiss, treefile)
    _assert_deleted_in_reconciliation(data_obj, '2029_doronicum')

    # Tip missing from the tree.
    data_obj = build_data_obj(seqaln, treefilemiss)
    _assert_deleted_in_reconciliation(data_obj, 'S_scopolii')

    # The same mismatches, checked directly on the raw alignment and tree.
    prune = _taxa_in_only_one_source(seqalnmiss, mattype, treefile)
    assert len(prune) == 1
    assert list(prune)[0].label == '2029_doronicum'

    prune = _taxa_in_only_one_source(seqaln, mattype, treefilemiss)
    assert len(prune) == 1
    assert list(prune)[0].label == 'S_scopolii'

    # Both the sequence and the tip are missing.
    data_obj = build_data_obj(seqalnmiss, treefilemiss)
    _assert_deleted_in_reconciliation(data_obj, '2029_doronicum')
    _assert_deleted_in_reconciliation(data_obj, 'S_scopolii')
Exemple #25
0
from dendropy import Tree, DnaCharacterMatrix
import sys


# Module-level dict of sequences keyed by label (see seq_dict_build).
d = {}

# Alignment read from a hard-coded FASTA file name, resolved relative to the
# current working directory.
query_seq = DnaCharacterMatrix.get(path="ascomycota.fasta",schema="fasta")

def seq_dict_build(seq, label, seq_dict):
    """Store ``seq`` in ``seq_dict`` unless an existing entry already covers it.

    Gap characters ("-") are stripped before sequences are compared.  When
    ``seq`` is contained in a longer stored sequence it is rejected; when a
    stored sequence of equal or shorter length is contained in ``seq`` that
    entry is replaced by ``seq``; otherwise ``seq`` is added under ``label``.
    Only the first stored sequence matching either rule is acted upon.

    Args:
        seq: object exposing ``symbols_as_string()``.
        label: key under which ``seq`` is stored.
        seq_dict: dict mapping labels to sequence objects; modified in place.

    Returns:
        None.
    """
    new_seq = seq.symbols_as_string().replace("-", "")
    # Iterate over a snapshot of the keys: the dict is modified in the loop.
    for tax in list(seq_dict):
        inc_seq = seq_dict[tax].symbols_as_string().replace("-", "")
        if len(inc_seq) > len(new_seq):
            if new_seq in inc_seq:
                sys.stdout.write("seq {} is subsequence of {}, not added\n".format(label, tax))
                return
        elif inc_seq in new_seq:
            # Update the dict that was passed in, not a module-level global.
            del seq_dict[tax]
            seq_dict[label] = seq
            sys.stdout.write("seq {} is supersequence of {}, {} added and {} removed\n".format(label, tax, label, tax))
            return
    print(".")
    seq_dict[label] = seq
    return


for taxon, seq in query_seq.items():
    if len(seq.values()) > 800:
        seq_dict_build(seq, taxon.label, d)
    else:
 def _reconcile_names(self):
     d = DnaCharacterMatrix.get(path=self.seqaln, schema=self.mattype)
     d.taxon_namespace.is_mutable = True
     "so here I need to be getting the original names off of the "
Exemple #27
0
# Prepare the tree through project helpers from `utils` (their exact effects
# are defined there, not in this fragment).
utils.setup_indexes(tree)

oldest = utils.setup_dates(tree, _dates, _heterochronous)

peeling = utils.get_peeling_order(tree)
# Initial sequence count comes from the tree's taxon namespace; the variable
# is reassigned from the alignment below when `_input` is set.
sequence_count = len(tree.taxon_namespace)
data = {'peel': peeling, 'S': sequence_count}

if _input:
    # Default to NEXUS; switch to FASTA when the first line starts with '>'.
    # NOTE(review): next(fp) raises StopIteration on an empty file.
    seqs_args = dict(schema='nexus', preserve_underscores=True)
    with open(_input) as fp:
        if next(fp).startswith('>'):
            seqs_args = dict(schema='fasta')

    dna = DnaCharacterMatrix.get(path=_input,
                                 taxon_namespace=taxa,
                                 **seqs_args)
    alignment_length = dna.sequence_size
    sequence_count = len(dna)
    # Abort when the number of sequences differs from the number of taxa in
    # the namespace the alignment was read into.
    # NOTE(review): the message has no trailing newline, and `exit` is the
    # interactive helper installed by `site`; sys.exit is the usual choice.
    if sequence_count != len(dna.taxon_namespace):
        sys.stderr.write('taxon names in trees and alignment are different')
        exit(2)

    print('Number of sequences: {} length {} '.format(sequence_count,
                                                      alignment_length))
    print('Model: ' + _model)

    tipdata, weights = utils.get_dna_leaves_partials_compressed(dna)
    # From here on alignment_length is the number of entries in `weights`
    # rather than the raw sequence size printed above.
    alignment_length = len(weights)

    data.update({
Exemple #28
0
#!/usr/bin/env python
from dendropy import DnaCharacterMatrix, Tree
import sys

# Command-line arguments: alignment path, alignment schema, tree path,
# tree schema, and a fifth value kept in `nam`.
mat = sys.argv[1]
mattype = sys.argv[2]
tre = sys.argv[3]
tretype = sys.argv[4]
nam = sys.argv[5]

# NOTE(review): these hard-coded values replace the first four command-line
# arguments read above -- presumably leftover test settings; confirm before
# relying on the CLI values.
mat = 'example.aln'
mattype = 'fasta'
tre = 'tree.tre'
tretype = 'newick'

d = DnaCharacterMatrix.get(path=mat,
                           schema=mattype)
# make the taxon_namespace immutable, so the tree does not add
#   new labels...
d.taxon_namespace.is_mutable = False
tree = Tree.get(path=tre,
                schema=tretype,
                preserve_underscores=True,
                taxon_namespace=d.taxon_namespace)

# get all of the taxa associated with tips of the tree, and make sure that
#   they include all of the members of the data's taxon_namespace...
treed_taxa = [i.taxon for i in tree.leaf_nodes()]
if len(treed_taxa) != len(d.taxon_namespace):
    # Labels that are in the alignment's namespace but not on any tree tip,
    # formatted into an error message.
    missing = [i.label for i in d.taxon_namespace if i not in treed_taxa]
    emf = 'Some of the taxa are not in the tree. Missing "{}"\n'
    em = emf.format('", "'.join(missing))
Exemple #29
0
                    required=False,
                    type=int,
                    help="""Parameters for Stan script""")
arg = parser.parse_args()

# Directory containing this script (realpath resolves symlinks first).
my_path = os.path.split(os.path.realpath(__file__))[0]

# Taxon namespace handed to the tree reader below.
taxa = dendropy.TaxonNamespace()

# NOTE(review): `file=` expects an open file-like object, so arg.tree and
# arg.input are presumably opened by argparse -- confirm against the parser.
trees = dendropy.TreeList.get(file=arg.tree,
                              schema="newick",
                              preserve_underscores=True,
                              tree_offset=0,
                              taxon_namespace=taxa)

# NOTE(review): the alignment is read without taxon_namespace=taxa, so it
# presumably gets a namespace separate from the trees' -- confirm intended.
dna = DnaCharacterMatrix.get(file=arg.input, schema="fasta")
alignment_length = dna.sequence_size
sequence_count = len(dna)

print('Number of sequences: {} length {} '.format(sequence_count,
                                                  alignment_length))
print('Model: ' + arg.model)

tipdata, weights = phylo.get_dna_leaves_partials_compressed(dna)
# alignment_length now holds the number of entries in `weights` instead of
# the raw sequence size printed above.
alignment_length = len(weights)

for t in trees:
    t.encode_bipartitions(collapse_unrooted_basal_bifurcation=False)

# Initial values for code that follows this fragment.
count = 1
bip = {}
Exemple #30
0
def _run_blast_round(scraper, shared_blast_folder):
    """Run one blast / read / de-duplicate / align cycle on ``scraper``."""
    if shared_blast_folder:
        scraper.blast_subdir = shared_blast_folder
    else:
        # Any falsy value (e.g. an empty string) is passed on as None.
        shared_blast_folder = None
    scraper.run_blast_wrapper(delay=14)
    scraper.read_blast_wrapper(blast_dir=shared_blast_folder)
    scraper.remove_identical_seqs()
    scraper.generate_streamed_alignment()


def standard_run(study_id,
                 tree_id,
                 seqaln,
                 mattype,
                 workdir,
                 configfi,
                 ingroup_mrca=None,
                 shared_blast_folder=None):
    """Run PhyScraper on a tree from Open Tree of Life until no new seqs appear.

    Continues from ``<workdir>/att_checkpoint.p`` when that pickle exists;
    otherwise builds a new data object from the alignment file and the
    phylesystem study/tree.  Blast rounds are repeated for as long as
    ``scraper.repeat == 1``.

    Args:
        study_id: Open Tree of Life study identifier.
        tree_id: identifier of the tree within the study.
        seqaln: path to the alignment file.
        mattype: format name of the alignment (DendroPy schema).
        workdir: directory where the analysis files are stored.
        configfi: path to the config file.
        ingroup_mrca: optional Open Tree of Life identifier of the clade of
            interest.
        shared_blast_folder: optional path to a folder with blast searches
            shared across runs (see documentation).

    Returns:
        The PhyscraperScrape object after the final round.
    """
    debug("Debugging mode is on")

    # read the config file into a configuration object
    conf = ConfigObj(configfi, interactive=False)
    checkpoint = "{}/att_checkpoint.p".format(workdir)
    if os.path.isfile(checkpoint):
        sys.stdout.write("Reloading data object from pickle file\n")
        # NOTE: pickle can execute arbitrary code; only load checkpoints that
        # were written by a trusted run.
        with open(checkpoint, "rb") as handle:
            data_obj = pickle.load(handle)
    else:
        sys.stdout.write("setting up Data Object\n")
        sys.stdout.flush()
        aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
        # Generate a linked Alignment-Tree-Taxa object
        data_obj = generate_ATT_from_phylesystem(aln=aln,
                                                 workdir=workdir,
                                                 study_id=study_id,
                                                 tree_id=tree_id,
                                                 phylesystem_loc=conf.phylesystem_loc,
                                                 ingroup_mrca=ingroup_mrca)
        # Prune sequences below a certain length threshold.
        # This is particularly important when using loci that have been
        # de-concatenated, as some are 0 length which causes problems.
        data_obj.prune_short()
        data_obj.write_files()
        data_obj.write_labelled(label="^ot:ottTaxonName")
        data_obj.write_otus("otu_info", schema="table")
        data_obj.dump()

    # Mapping identifiers between OpenTree and NCBI requires an identifier
    # dict object; reuse a pickled one when available.
    if os.path.isfile(conf.id_pickle):
        sys.stdout.write("Reloading id dicts from {}\n".format(conf.id_pickle))
        with open(conf.id_pickle, "rb") as handle:
            ids = pickle.load(handle)
    else:
        sys.stdout.write("setting up id dictionaries\n")
        sys.stdout.flush()
        ids = IdDicts(conf, workdir=workdir)
        ids.dump()

    # Now combine the data, the ids, and the configuration into a single
    # physcraper scrape object and run the analyses.
    scraper = PhyscraperScrape(data_obj, ids)
    _run_blast_round(scraper, shared_blast_folder)
    while scraper.repeat == 1:
        # Write the labelled output and OTU table before starting the next
        # blast round.
        scraper.data.write_labelled(label="^ot:ottTaxonName")
        scraper.data.write_otus("otu_info", schema="table")
        _run_blast_round(scraper, shared_blast_folder)

    return scraper
Exemple #31
0
import numpy as np
import numpy.linalg as la
from dendropy import Tree, DnaCharacterMatrix
import myPhylo

tree_path = '/home/nehleh/Documents/0_Research/PhD/Data/simulationdata/recombination/exampledataset/exampledataset_RAxML_bestTree'
tree = Tree.get_from_path(tree_path, 'newick')
# Read the alignment by path so that DendroPy opens and closes the file
# itself and no file handle is left open.
alignment = DnaCharacterMatrix.get(
    path="/home/nehleh/Documents/0_Research/PhD/Data/simulationdata/recombination/exampledataset/wholegenome.fasta",
    schema="fasta")

tree2 = Tree.get_from_path(
    '/home/nehleh/Documents/0_Research/PhD/Data/simulationdata/recombination/exampledataset/RerootTree_node12',
    'newick')

# Model parameters handed to myPhylo.GTR_model.
# NOTE(review): presumably base frequencies and GTR exchange rates; the
# expected ordering is defined by myPhylo -- confirm there.
pi = [0.317, 0.183, 0.367, 0.133]
rates = [0.000100, 0.636612, 2.547706, 0.000100, 2.151395]
GTR_sample = myPhylo.GTR_model(rates, pi)

# Only the first element returned by get_DNA_fromAlignment is used for the
# single-column likelihood below.
column = myPhylo.get_DNA_fromAlignment(alignment)
dna = column[0]
myPhylo.set_index(tree, dna)

print("Original tree:::::::::::::::")
print(tree.as_string(schema='newick'))
print(tree.as_ascii_plot())

LL_normal = myPhylo.computelikelihood(tree, dna, GTR_sample)
W_LL_normal = myPhylo.wholeAlignmentLikelihood(tree, alignment, GTR_sample)
Exemple #32
0
# Build a lookup keyed by the second comma-separated column of `ott_ncbi`
# with the first column as value (both converted to int).
# TODO: cache this mapping (e.g. with pickle) instead of re-parsing it.
ncbi_to_ott = {}
with open(ott_ncbi) as fi:
    for lin in fi:
        lii = lin.split(",")
        ncbi_to_ott[int(lii[1])] = int(lii[0])

# Previously recorded id pairs, if any.  The value is kept exactly as read,
# including any trailing newline, so later int() conversions still work.
gi_ncbi_map = {}
if os.path.isfile("id_map.txt"):
    # Close the file once read: it is opened again for appending later on.
    with open("id_map.txt") as fi:
        for lin in fi:
            parts = lin.split(",")
            gi_ncbi_map[int(parts[0])] = parts[1]

orig_seq = DnaCharacterMatrix.get(path="accs", schema="fasta")

#prune out identical sequences

mapped_taxon_ids = open("id_map.txt", "a")
stops = []
for taxon, seq in orig_seq.items():
    gi = int(taxon.label.split('|')[1])
    if gi in gi_ncbi_map.keys():
        try:
            taxon.label = ncbi_to_ott[int(gi_ncbi_map[gi])]
        except:
            taxon.label = "ncbi_id_{}".format(int(gi_ncbi_map[gi]))
    else:
        try:
            ncbi_id = int(
Exemple #33
0
# Use OpenTree phylesystem identifiers to get study and tree
study_id = "pg_873"
tree_id = "tree1679"
seqaln = "tests/data/minitest.fas"
mattype = "fasta"
workdir = "tests/output/opentree"
configfi = "tests/data/remotencbi.config"


sys.stdout.write("\nTesting 'opentree scrape (1 round)'\n")
conf = physcraper.ConfigObj(configfi, interactive=False)
# Single-argument print(...) produces the same output on Python 2 and 3.
print("1. {}".format(conf.email))

# Build the linked alignment/tree/taxa object from the local alignment and
# the phylesystem study and tree.
aln = DnaCharacterMatrix.get(path=seqaln, schema=mattype)
data_obj = physcraper.generate_ATT_from_phylesystem(aln=aln,
                                                    workdir=workdir,
                                                    study_id=study_id,
                                                    tree_id=tree_id,
                                                    phylesystem_loc=conf.phylesystem_loc)

ids = physcraper.IdDicts(conf, workdir=workdir)

print("3. {}".format(ids.config.email))

data_obj.prune_short()
Exemple #34
0
    """prunes to 1 seq per spp, and fills in missing data for missing spp,
    in preparation for concanteneation, return dict to be made in char matrix"""
    aln_dict = {}
    tmp_dict = {}
    for taxon, seq in physcraper_obj.aln.items():
        aln_dict[taxon.label] = seq
    seqlen = len(seq) #should all be same bc aligned
    for spp_name in spp_dict.keys():
        try:
            otu = random.choice(spp_dict[spp_name])
            tmp_dict[spp_name] = aln_dict[otu]
        except KeyError:
            tmp_dict[spp_name] = "-" * seqlen
    return tmp_dict

# One character matrix per gene; the second one reuses the taxon namespace
# of the first.
aln1 = DnaCharacterMatrix.from_dict(
    arbitrary_prune_fill(spp_to_otu1, gene1))
aln2 = DnaCharacterMatrix.from_dict(
    arbitrary_prune_fill(spp_to_otu2, gene2),
    taxon_namespace=aln1.taxon_namespace)

# Join both genes and save the result as FASTA.
concat = DnaCharacterMatrix.concatenate([aln1, aln2])
concat.write(path="concat.fas", schema="fasta")





# Open the two physcraper objects
# Merge the alignments on OTT_ID?
# How to force/missing data ...