def _search(self, taxids):
    """Search GenBank for sequence IDs not returned by earlier searches.

    The search starts at the current ``self.thoroughness`` and moves to the
    next thoroughness level until at least ``self.nseqs`` previously unseen,
    unique sequence IDs have been collected or ``self.max_thoroughness`` has
    been passed.  IDs already recorded in ``self.deja_vues`` are never
    returned again.

    Args:
        taxids: sequence of taxon IDs to search for.  More than five are
            searched in chunks: the first five, then the remainder
            (recursively), with thoroughness reset to 1 in between.

    Returns:
        List of unique, previously unseen sequence IDs, in the order in
        which they were first found.

    Side effects:
        Advances ``self.thoroughness`` and adds the returned IDs to
        ``self.deja_vues``.
    """
    # limit number of taxids to 5, search recursively if more than 5
    if len(taxids) > 5:
        res = self._search(taxids[:5])
        # the second chunk starts again from the lowest thoroughness
        self.thoroughness = 1
        res.extend(self._search(taxids[5:]))
        return res
    # Everything seen so far, including IDs found earlier in this call.
    # A set keeps the membership test cheap and, because newly found IDs
    # are added immediately, prevents an ID that is returned at several
    # thoroughness levels from being counted more than once.  Counting
    # such duplicates would let the loop stop before nseqs *unique*
    # sequences had been found.
    seen = set(self.deja_vues)
    seqids = []
    while len(seqids) < self.nseqs:
        if self.thoroughness > self.max_thoroughness:
            # keep searching until enough sequences have been found or
            # until the max thoroughness has been hit
            break
        search_term = self._buildSearchTerm(taxids, self.thoroughness)
        # first query: how many records match at this thoroughness
        seqcount = etools.eSearch(search_term, logger=self.logger)['Count']
        if int(seqcount) >= 1:
            # second query: fetch ALL matching seqids
            matches = etools.eSearch(search_term, logger=self.logger,
                                     retMax=seqcount)['IdList']
            # keep only those that have not been seen before
            for seqid in matches:
                if seqid not in seen:
                    seen.add(seqid)
                    seqids.append(seqid)
        self.thoroughness += 1
    # record the new IDs so that later searches skip them
    self.deja_vues = list(seen)
    return seqids
def getOutgroup(namesdict, parentid, logger, outgroupid=None, minrecords=1000):
    """Add an "outgroup" entry to namesdict and return namesdict.

    If ``outgroupid`` is given it is used as the outgroup.  Otherwise the
    taxonomy is climbed from ``parentid`` one level at a time; at each level
    the children of the next taxon up, excluding the lineage already
    climbed, are candidates.  Candidates with more than ``minrecords``
    nucleotide records (so that e.g. extinct organisms are avoided) become
    the outgroup.  The climb stops at the first level that yields any.

    Assumes the NCBI taxonomy is not paraphyletic.

    Args:
        namesdict: dictionary to update; it is modified in place.
        parentid: taxon ID of the ingroup's parent (converted to a string).
        logger: logger handed on to every ``etools`` call.
        outgroupid: optional taxon ID to use instead of searching.
        minrecords: a candidate needs strictly more nucleotide records
            than this to qualify.

    Returns:
        ``namesdict`` with ``namesdict["outgroup"]`` set to a dictionary
        holding "txids" (list of ints), "unique_name" and "rank".

    Raises:
        TaxonomicRankError: if the climb reaches taxon '131567' (cellular
            organisms), which suggests a name resolution error or that the
            names given are too diverse.
    """
    # TODO: too complex, consider breaking up
    def hasEnoughRecords(txid):
        # count the nucleotide records in GenBank for this taxon
        term = 'txid' + str(txid) + '[PORGN]'
        hits = etools.eSearch(term, logger=logger)
        return int(hits['Count']) > minrecords

    def describe(txids):
        # With several IDs only the first one is looked up and both fields
        # are marked with ' et al.'; otherwise txids is passed on unchanged.
        several = len(txids) > 1
        query = txids[0] if several else txids
        record = etools.eFetch(query, logger=logger, db="taxonomy")[0]
        rank, sciname = record['Rank'], record['ScientificName']
        if several:
            rank, sciname = rank + ' et al.', sciname + ' et al.'
        return rank, sciname

    if outgroupid:
        outgroup_ids = [outgroupid]
    else:
        outgroup_ids = []
        # compared against string constants and candidate IDs below
        current = str(parentid)
        while not outgroup_ids:
            if current == '131567':
                raise TaxonomicRankError()
            # step up one level and list that taxon's children
            taxon = etools.eFetch(current, logger=logger, db="taxonomy")[0]
            ancestor = taxon['ParentTaxId']
            siblings = etools.findChildren(ancestor, logger=logger, next=True)
            # drop the ingroup lineage, keep siblings with enough records
            outgroup_ids = [sib for sib in siblings
                            if sib != current and hasEnoughRecords(sib)]
            # nothing suitable: continue the climb from the level above
            current = ancestor
    # look up rank and name before converting the IDs to ints
    rank, unique_name = describe(outgroup_ids)
    namesdict["outgroup"] = {
        "txids": [int(txid) for txid in outgroup_ids],
        "unique_name": unique_name,
        "rank": rank,
    }
    return namesdict