def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, config_obj,
             schema='newick', taxon_namespace=None):
    """Bundle a tree, its alignment and the per-tip OTU metadata.

    The tree is parsed against the alignment's taxon namespace, both are
    reconciled with each other, and the working directory is created if it
    does not exist yet.

    :param tree: tree data handed to ``Tree.get(data=...)``
    :param otu_dict: dict with the metadata of every OTU, keyed by OTU id
    :param alignment: a ``DnaCharacterMatrix``; its taxon namespace is reused for the tree
    :param ingroup_mrca: id of the ingroup MRCA; must convert to a non-zero int
    :param workdir: directory for output, stored as an absolute path
    :param config_obj: configuration object, stored as ``self.config``
    :param schema: schema name passed to ``Tree.get``
    :param taxon_namespace: accepted for interface compatibility; not used here
    """
    debug("build ATT class")
    self.aln = alignment
    assert isinstance(self.aln, datamodel.charmatrixmodel.DnaCharacterMatrix), \
        ("your aln '%s' is not a DnaCharacterMatrix" % alignment)
    # The tree is built on the alignment's namespace so both share taxon objects.
    shared_namespace = self.aln.taxon_namespace
    self.tre = Tree.get(
        data=tree,
        schema=schema,
        preserve_underscores=True,
        taxon_namespace=shared_namespace,
    )
    assert (self.tre.taxon_namespace is self.aln.taxon_namespace), "tre and aln taxon_namespace are not identical"
    assert isinstance(otu_dict, dict), ("otu_dict '%s' is not of type dict" % otu_dict)
    self.otu_dict = otu_dict
    self.config = config_obj
    self.ps_otu = 1  # iterator for new otu IDs
    # Reconciliation runs before the remaining attributes are set, as in the
    # original statement order.
    self._reconcile()
    self._reconcile_names()
    absolute_workdir = os.path.abspath(workdir)
    self.workdir = absolute_workdir
    if not os.path.exists(absolute_workdir):
        os.makedirs(absolute_workdir)
    assert int(ingroup_mrca), ("your ingroup_mrca '%s' is not an integer." % ingroup_mrca)
    self.mrca_ott = ingroup_mrca  # ott_ingroup mrca can be pulled directly from phylesystem
    self.orig_seqlen = []  # will get filled in later...
    self.gb_dict = {}  # has all info about new blast seq
    self._reconciled = False
    self.unpubl_otu_json = None
def match_id_to_mrca(self, tax_id, mrca_id):
    """Tell whether ``tax_id`` is ``mrca_id`` itself or lies below it.

    Walks iteratively from ``tax_id`` towards the root by repeatedly looking
    up ``parent_tax_id`` in the module-level ``nodes`` table, which is set up
    through ``self.initialize()`` when it is still ``None``.

    :param tax_id: id to test; converted with ``int()``
    :param mrca_id: id of the candidate ancestor; converted with ``int()``
    :return: True if ``mrca_id`` is reached on the way up; False if id 1 or 0
        is reached first, or if a parent lookup fails
    """
    if nodes is None:
        self.initialize()
    current_id = int(tax_id)
    mrca_id = int(mrca_id)
    while current_id:
        if current_id == mrca_id:
            return True
        if current_id == 1:
            # Id 1 ends the walk without a match.
            return False
        try:
            # NOTE(review): the indexing looks like a pandas DataFrame with
            # "tax_id" and "parent_tax_id" columns - confirm against the loader.
            current_id = int(nodes[nodes["tax_id"] == current_id]["parent_tax_id"].values[0])
        except Exception:
            # Best effort: any failed lookup means "not a descendant", but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            sys.stderr.write("no parent found for ncbi:id {}\n".format(current_id))
            return False
    # Only reached when current_id is 0, which the loop condition rejects.
    debug("current id is: {}, in search for {} in {}".format(current_id, tax_id, mrca_id))
    return False
def get_tax_seq_acc(self, acc):
    """Return taxon id, taxon name and sequence for an accession number.

    Values come from ``self.acc_tax_seq_dict`` when the accession is already
    stored there; otherwise they are fetched with ``self.entrez_efetch``.
    In both cases the lookup dictionaries are (re)written with the result.

    :param acc: accession number; must contain a "." to be accepted
    :return: tuple ``(ncbi_id, tax_name, seq)``, or ``(None, None, None)``
        when the accession contains no "."
    """
    if "." not in acc:
        debug("accession number {} not recognized".format(acc))
        return None, None, None
    if acc not in self.acc_tax_seq_dict:
        record = self.entrez_efetch(acc)
        tax_name = ncbi_data_parser.get_ncbi_tax_name(record)
        ncbi_id = ncbi_data_parser.get_ncbi_tax_id(record)
        seq = record[0][u'GBSeq_sequence']
    else:
        stored = self.acc_tax_seq_dict[acc]
        tax_name = stored["taxname"]
        ncbi_id = stored["^ncbi:taxon"]
        seq = stored["seq"]
    # TODO check that searches are using names without spaces
    tax_name = tax_name.replace(" ", "_")
    self.ncbiid_to_spn[ncbi_id] = tax_name
    self.acc_ncbi_dict[acc] = ncbi_id
    # This is going to be a memory hog...
    self.acc_tax_seq_dict[acc] = {'taxname': tax_name, "^ncbi:taxon": ncbi_id, 'seq': seq}
    assert ncbi_id is not None
    return ncbi_id, tax_name, seq
def get_otu_for_acc(self, gb_id):
    """Find the OTU that already carries the accession ``gb_id``.

    A single pass over ``self.otu_dict`` is used. Only OTUs that actually
    store a "^ncbi:accession" value are considered, so OTUs without an
    accession can never be returned.

    :param gb_id: accession to look for
    :return: key of the first matching OTU in ``self.otu_dict``, or None if
        no OTU stores that accession
    """
    for otu, otu_data in self.otu_dict.items():
        if "^ncbi:accession" in otu_data and otu_data["^ncbi:accession"] == gb_id:
            debug("tried to create OTU for {} but already had otu {}".format(gb_id, otu))
            return otu
    return None
def _reconcile(self):
    """Drop every taxon that is present in only one of tree and alignment.

    Taxa found only among the tree's leaves are pruned from the tree, taxa
    found only in the alignment have their sequences removed. Matching
    entries in ``self.otu_dict`` get the status "deleted in reconciliation",
    and each dropped taxon is removed from the alignment's taxon namespace.
    """
    debug("reconcile")
    tips_in_tree = {leaf.taxon for leaf in self.tre.leaf_nodes()}
    tips_in_aln = {tax for tax, _seq in self.aln.items()}
    # Symmetric difference: taxa that occur on exactly one side.
    prune = tips_in_tree.symmetric_difference(tips_in_aln)
    missing = [tax.label for tax in prune]
    if missing:
        template = 'NAME RECONCILIATION Some of the taxa in the tree are not in the alignment or vice versa' \
                   ' and will be pruned. Missing "{}"\n'
        sys.stderr.write(template.format('", "'.join(missing)))
    del_aln = []
    del_tre = []
    for taxon in prune:
        in_aln = taxon in tips_in_aln
        in_tre = taxon in tips_in_tree
        assert in_aln or in_tre
        if in_aln:
            del_aln.append(taxon)
        if in_tre:
            del_tre.append(taxon)
    self.aln.remove_sequences(del_aln)
    self.tre.prune_taxa(del_tre)
    # potentially slow at large number of taxa and large numbers to be pruned
    for tax in prune:
        matched = False
        for otu, otu_data in self.otu_dict.items():
            # An OTU is identified by its original label when it has one,
            # otherwise by its own key.
            if "^ot:originalLabel" in otu_data:
                is_hit = otu_data[u'^ot:originalLabel'] == tax.label
            else:
                is_hit = otu == tax.label
            if is_hit:
                otu_data['^physcraper:status'] = "deleted in reconciliation"
                matched = True
        if not matched:
            sys.stderr.write("lost taxon {} in reconcilliation \n".format(tax.label))
        self.aln.taxon_namespace.remove_taxon(tax)
    assert self.aln.taxon_namespace == self.tre.taxon_namespace
def get_user_input():
    """Ask on the console until the user answers 'yes' or 'no'.

    Works on Python 2 (``raw_input``) and Python 3 (``input``).

    :return: the string 'yes' or 'no'
    """
    debug("get user input")
    try:
        read_line = raw_input  # noqa: F821 - Python 2 builtin
    except NameError:
        read_line = input  # Python 3: input() returns the raw string
    while True:
        try:
            answer = read_line("Please write either 'yes' or 'no': ")
        except ValueError as err:
            # Show the part after the first ": " if there is one; never
            # fail on a message that has no such separator.
            parts = str(err).split(": ")
            detail = parts[1] if len(parts) > 1 else parts[0]
            print("'%s' is not a valid answer." % detail)
            continue
        if answer in ('yes', 'no'):
            return answer
        print("'%s' is not a valid answer." % answer)
def __init__(self, config_obj, workdir):
    """Generates a series of name disambiguation dicts.

    Reads the comma-separated file ``config_obj.ott_ncbi`` (first three
    columns: OTT id, NCBI id, name) and the file ``ncbi_ott`` in the same
    folder (first two columns: NCBI id, OTT id).

    :param config_obj: configuration object; ``email``, ``ott_ncbi`` and
        ``blast_loc`` are read here, plus the two ``ncbi_parser_*_fn``
        attributes when ``blast_loc`` is not 'remote'
    :param workdir: accepted for interface compatibility; not used here
    """
    self.config = config_obj
    assert self.config.email
    self.ott_to_ncbi = {}
    self.ncbi_to_ott = {}  # used to get ott_id for new Genbank query taxa
    self.ott_to_name = {}  # used in add_otu to get name from otuId
    self.acc_ncbi_dict = {}  # filled by ncbi_parser (by subprocess in earlier versions of the code).
    self.spn_to_ncbiid = {}  # spn to ncbi_id, it's only fed by the ncbi_data_parser, but makes it faster
    # Filled with information from GenBank to speed up translation between
    # ncbi_taxon_ids and names, similar to acc_ncbi_dict and spn_to_ncbiid.
    self.ncbiid_to_spn = {}
    tax_folder = os.path.dirname(config_obj.ott_ncbi)
    # This is in the taxonomy folder of the repo, needs to be updated by devs
    # when OpenTree taxonomy changes.
    with open(config_obj.ott_ncbi) as ott_ncbi_file:
        for lin in ott_ncbi_file:
            lii = lin.split(",")
            ott_id = int(lii[0])
            self.ott_to_ncbi[ott_id] = int(lii[1])
            self.ott_to_name[ott_id] = lii[2].strip()  # todo merge into ott_to_ncbi?
    # os.path.join keeps the path relative when ott_ncbi has no directory
    # part; plain "{}/ncbi_ott" formatting would point at "/ncbi_ott" then.
    with open(os.path.join(tax_folder, "ncbi_ott")) as ncbi_ott_file:
        for lin in ncbi_ott_file:
            lii = lin.split(",")
            self.ncbi_to_ott[int(lii[0])] = int(lii[1])
    assert len(self.ott_to_ncbi) > 0
    assert len(self.ott_to_name) > 0
    assert len(self.ncbi_to_ott) > 1000
    if config_obj.blast_loc == 'remote':
        debug("Config remote {}".format(config_obj.blast_loc))
        self.otu_rank = {}  # used only for web queries - contains taxonomic hierarchy information
    else:
        # ncbi parser contains information about spn, tax_id, and ranks
        debug("Config not remote {}".format(config_obj.blast_loc))
        self.ncbi_parser = ncbi_data_parser.Parser(
            names_file=self.config.ncbi_parser_names_fn,
            nodes_file=self.config.ncbi_parser_nodes_fn)
    self.acc_tax_seq_dict = {}
def write_labelled(self, label, filename="labelled", direc='workdir', norepeats=True, add_gb_id=False):
    """Output tree and alignment with human readable labels.

    Works on copies of the tree and the alignment (round-tripped through
    newick/fasta strings), so it is NOT MEMORY EFFICIENT AT ALL.

    :param label: which information shall be displayed in labelled files; one of
        '^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
        "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"
    :param filename: base name of the output files; ".tre" and ".fas" are appended
    :param direc: output directory; the default 'workdir' means ``self.workdir``
    :param norepeats: optional: if there shall be no duplicate names in the labelled output files
    :param add_gb_id: optional, to supplement tiplabel with the "^ncbi:accession" value,
        or with "^ot:originalLabel" when there is no accession
    :return: writes out labelled phylogeny and alignment to file
    """
    if direc == 'workdir':
        direc = self.workdir
    treepath = "{}/{}".format(direc, "{}.tre".format(filename))
    alnpath = "{}/{}".format(direc, '{}.fas'.format(filename))
    debug(treepath)
    assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
                     "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
    # Relabelling happens on copies so self.tre / self.aln keep their OTU ids.
    tmp_newick = self.tre.as_string(schema="newick")
    tmp_tre = Tree.get(data=tmp_newick, schema="newick", preserve_underscores=True)
    tmp_fasta = self.aln.as_string(schema="fasta")
    tmp_aln = DnaCharacterMatrix.get(data=tmp_fasta, schema="fasta",
                                     taxon_namespace=tmp_tre.taxon_namespace)
    new_names = set()
    for taxon in tmp_tre.taxon_namespace:
        otu_data = self.otu_dict[taxon.label]
        new_label = otu_data.get(label, None)
        if new_label is None:
            # Requested field is missing: fall back to other identifiers.
            if otu_data.get("^ot:originalLabel"):
                new_label = "orig_{}".format(otu_data["^ot:originalLabel"])
            else:
                new_label = "ncbi_{}_ottname_{}".format(otu_data.get("^ncbi:taxon", "unk"),
                                                        otu_data.get('^physcraper:TaxonName', "unk"))
        new_label = str(new_label).replace(' ', '_')
        if add_gb_id:
            gb_id = otu_data.get('^ncbi:accession')
            if gb_id is None:
                gb_id = otu_data.get("^ot:originalLabel")
            new_label = "_".join([new_label, str(gb_id)])
            if norepeats and new_label in new_names:
                # Count upwards until the label is free, so a third or later
                # repeat no longer collides with an earlier "_2".
                base_label = new_label
                sp_counter = 2
                new_label = "_".join([base_label, str(sp_counter)])
                while new_label in new_names:
                    sp_counter += 1
                    new_label = "_".join([base_label, str(sp_counter)])
        elif norepeats and new_label in new_names:
            # Without accession ids the OTU id itself disambiguates the label.
            new_label = "_".join([new_label, taxon.label])
        taxon.label = new_label
        new_names.add(new_label)
    tmp_tre.write(path=treepath, schema="newick", unquoted_underscores=True, suppress_edge_lengths=False)
    tmp_aln.write(path=alnpath, schema="fasta")
def add_otu(self, gb_id, ids_obj):
    """Generate an otu_id for a new sequence and add it to self.otu_dict.

    If an OTU with this accession already exists, its id is returned and
    nothing is added. The new entry is assembled completely before it is
    stored, so a failing lookup leaves ``self.otu_dict`` untouched.

    :param gb_id: the Genbank identifier, or the id of a local unpublished
        sequence (starting with "unpubl"); must be a key of ``self.gb_dict``
    :param ids_obj: object providing the ``ncbi_to_ott`` and ``ott_to_name``
        dicts; also passed on to ``ncbi_data_parser.get_tax_info_from_acc``
    :return: the unique otu_id - the key from self.otu_dict of the
        corresponding sequence - or None if no NCBI taxon id was found
    """
    otu_id = self.get_otu_for_acc(gb_id)
    if otu_id:
        return otu_id
    # The counter advances even if the OTU is rejected below, so ids of
    # rejected sequences are skipped rather than reused.
    otu_id = "otuPS{}".format(self.ps_otu)
    self.ps_otu += 1
    ncbi_id, tax_name = ncbi_data_parser.get_tax_info_from_acc(gb_id, self, ids_obj)
    if ncbi_id is None:
        debug("DID NOT ADD accession {} ncbi_id {}".format(gb_id, ncbi_id))
        return None
    ncbi_id = int(ncbi_id)
    if ncbi_id in ids_obj.ncbi_to_ott:
        ott_id = int(ids_obj.ncbi_to_ott[ncbi_id])
    else:
        debug("{} Ncbi id not found in ott_ncbi dictionaries\n".format(ncbi_id))
        ott_id = None
    ott_name = ids_obj.ott_to_name.get(ott_id)
    is_local = gb_id.startswith("unpubl")
    gb_info = self.gb_dict[gb_id]
    otu_entry = {
        "^ncbi:title": gb_info["title"],
        "^ncbi:taxon": ncbi_id,
        "^ncbi:TaxonName": tax_name,
        "^ot:ottId": ott_id,
        "^physcraper:status": "local seq" if is_local else "query",
        "^ot:ottTaxonName": ott_name,
        "^physcraper:last_blasted": None,
    }
    if is_local:
        otu_entry["^ot:originalLabel"] = gb_info["localID"]
        otu_entry['^user:TaxonName'] = gb_info[u'^user:TaxonName']
    else:
        otu_entry["^ncbi:gi"] = gb_info["^ncbi:gi"]
        otu_entry["^ncbi:accession"] = gb_id
    # get a name for the OTU, no matter from which source
    if tax_name is not None:
        otu_entry["^physcraper:TaxonName"] = tax_name
    elif ott_name is not None:
        otu_entry["^physcraper:TaxonName"] = ott_name
    elif otu_entry.get('^user:TaxonName'):
        otu_entry["^physcraper:TaxonName"] = otu_entry['^user:TaxonName']
    else:
        otu_entry["^physcraper:TaxonName"] = "ACC_{}".format(gb_id)
    assert otu_entry["^physcraper:TaxonName"]  # is not None
    self.otu_dict[otu_id] = otu_entry
    if _DEBUG >= 2:
        sys.stderr.write("acc:{} assigned new otu: {}\n".format(gb_id, otu_id))
    return otu_id