def print_percentages():
    """Print, per page-root taxon, how many names have each attribute filled in.

    Every taxon is mapped to the id of its nearest ancestor (or itself) whose
    ``is_page_root`` flag is set. Names are then tallied under that id: one
    "total" counter plus one counter per tracked attribute that is not None.
    Finally a small report is printed for each page root, giving the raw
    count and the percentage of the total for every tracked attribute.
    """
    tracked = ["original_name", "original_citation", "page_described", "authority", "year"]
    root_id_by_taxon = {}

    def _page_root_id(node):
        # Walk up the parent chain until we reach a page root or a taxon
        # whose page root has already been recorded.
        while True:
            if node.is_page_root:
                return node.id
            if node.id in root_id_by_taxon:
                return root_id_by_taxon[node.id]
            node = node.parent

    for taxon in Taxon.select():
        root_id_by_taxon[taxon.id] = _page_root_id(taxon)

    counts_of_parent = collections.defaultdict(lambda: collections.defaultdict(int))
    for name in Name.select():
        root_id = root_id_by_taxon[name.taxon.id]
        counts_of_parent[root_id]["total"] += 1
        for field in tracked:
            if getattr(name, field) is not None:
                counts_of_parent[root_id][field] += 1

    for root_id, tally in counts_of_parent.items():
        page_root = Taxon.filter(Taxon.id == root_id)[0]
        print("FILE", page_root)
        # "total" is removed from the tally so only attribute counters remain.
        total = tally.pop("total")
        print("Total", total)
        for field in tracked:
            filled = tally[field]
            percentage = filled * 100.0 / total
            print("%s: %s (%.2f%%)" % (field, filled, percentage))
def dup_taxa():
    """Group all taxa by their valid name.

    Returns a one-element list holding a mapping from ``valid_name`` to the
    list of taxa carrying that name. A taxon of rank SUBGENUS is left out
    when its name already has at least one taxon recorded.
    """
    by_name = collections.defaultdict(list)
    for taxon in Taxon.select():
        bucket = by_name[taxon.valid_name]
        is_subgenus = taxon.rank == db.constants.SUBGENUS
        if is_subgenus and bucket:
            # NOTE(review): presumably avoids flagging a subgenus that shares
            # its name with an already-seen taxon (e.g. its genus) - confirm.
            continue
        bucket.append(taxon)
    return [by_name]
def find_rank_mismatch():
    """Yield every taxon whose rank's group differs from its base name's group.

    For each taxon, the group expected for its rank (as given by
    ``db.helpers.group_of_rank``) is compared with ``taxon.base_name.group``.
    On a mismatch a diagnostic line is printed and the taxon is yielded.
    """
    for taxon in Taxon.select():
        wanted_group = db.helpers.group_of_rank(taxon.rank)
        if wanted_group == taxon.base_name.group:
            continue
        rank_label = db.constants.string_of_rank(taxon.rank)
        group_label = db.constants.string_of_group(taxon.base_name.group)
        message = "Group mismatch for %s: rank %s but group %s" % (taxon, rank_label, group_label)
        print(message)
        yield taxon
def keys(self):
    """Return the set of names available in this namespace.

    The result is the union of:

    * the keys reported by the parent class,
    * the names of Python's builtins, and
    * ``_encode_name(valid_name)`` for every taxon with a non-None
      ``valid_name``.

    The taxon-derived names are computed on the first call and cached on
    ``self._names``; later calls reuse the cached set.
    """
    # ``__builtins__`` is a CPython implementation detail: it is the builtins
    # module only in ``__main__`` and a plain dict in any imported module, in
    # which case ``dir(__builtins__)`` lists dict methods instead of builtin
    # names. Importing the real module gives the same answer in both cases.
    try:
        import builtins
    except ImportError:  # Python 2 spelling of the same module
        import __builtin__ as builtins

    keys = set(super(_ShellNamespace, self).keys())
    keys |= set(dir(builtins))
    if not hasattr(self, "_names"):
        self._names = {
            _encode_name(taxon.valid_name)
            for taxon in Taxon.select(Taxon.valid_name)
            if taxon.valid_name is not None
        }
    return keys | self._names
def name_mismatches(max_count=None, correct=False, correct_undoubted=True):
    """Yield taxa whose stored valid name differs from the computed one.

    Each taxon for which ``compute_valid_name()`` returns a non-None value
    different from ``taxon.valid_name`` is reported with a printed line and
    then yielded. Any correction is applied only after the consumer resumes
    the generator.

    Parameters:
        max_count: stop after this many mismatches have been yielded; None
            means no limit.
        correct: if true, recompute the name of every mismatching taxon.
        correct_undoubted: if true, recompute the name of mismatching
            species-group taxa that have a parent of rank GENUS, regardless
            of ``correct``.
    """
    found = 0
    for taxon in Taxon.select():
        expected_name = taxon.compute_valid_name()
        if expected_name is None or taxon.valid_name == expected_name:
            continue
        print("Mismatch for %s: %s (actual) vs. %s (computed)" % (taxon, taxon.valid_name, expected_name))
        yield taxon
        found += 1
        # For species-group taxa with a known genus parent, the computed valid
        # name is almost always right (the mismatch will usually happen after a
        # change in genus classification). One area that isn't well-covered yet
        # is autocorrecting gender endings.
        should_recompute = (
            correct_undoubted
            and taxon.base_name.group == db.constants.GROUP_SPECIES
            and taxon.has_parent_of_rank(db.constants.GENUS)
        ) or correct
        if should_recompute:
            taxon.recompute_name()
        if max_count is not None and found == max_count:
            return