def stem_statistics():
    """Print coverage statistics: stem/gender for genus-group names, type for family-group names."""
    def _pct(part, whole):
        # Guard against ZeroDivisionError when the database holds no names in a group.
        return part / whole * 100 if whole else 0.0

    stem = Name.filter(Name.group == db.constants.GROUP_GENUS, ~(Name.stem >> None)).count()
    gender = Name.filter(Name.group == db.constants.GROUP_GENUS, ~(Name.gender >> None)).count()
    total = Name.filter(Name.group == db.constants.GROUP_GENUS).count()
    print("Genus-group names:")
    print("stem: %s/%s (%.02f%%)" % (stem, total, _pct(stem, total)))
    print("gender: %s/%s (%.02f%%)" % (gender, total, _pct(gender, total)))
    print("Family-group names:")
    total = Name.filter(Name.group == db.constants.GROUP_FAMILY).count()
    typ = Name.filter(Name.group == db.constants.GROUP_FAMILY, ~(Name.type >> None)).count()
    print("type: %s/%s (%.02f%%)" % (typ, total, _pct(typ, total)))
def print_percentages():
    """For every page-root taxon, print how many of its names fill in key attributes."""
    attributes = ["original_name", "original_citation", "page_described", "authority", "year"]
    root_of = {}

    def _page_root_id(taxon):
        # Walk upward until we hit a page root, memoizing already-resolved taxa.
        if taxon.is_page_root:
            return taxon.id
        if taxon.id in root_of:
            return root_of[taxon.id]
        return _page_root_id(taxon.parent)

    for taxon in Taxon.select():
        root_of[taxon.id] = _page_root_id(taxon)

    counts = collections.defaultdict(lambda: collections.defaultdict(int))
    for name in Name.select():
        root_id = root_of[name.taxon.id]
        counts[root_id]["total"] += 1
        for attribute in attributes:
            if getattr(name, attribute) is not None:
                counts[root_id][attribute] += 1

    for root_id, data in counts.items():
        parent = Taxon.filter(Taxon.id == root_id)[0]
        print("FILE", parent)
        total = data.pop("total")
        print("Total", total)
        for attribute in attributes:
            percentage = data[attribute] * 100.0 / total
            print("%s: %s (%.2f%%)" % (attribute, data[attribute], percentage))
def detect_types(max_count=None, verbose=False):
    """Converts verbatim_types into references to the actual names."""
    groups = (db.constants.GROUP_FAMILY, db.constants.GROUP_GENUS)
    # != None / >> None / << are peewee SQL operators (IS NOT NULL / IS NULL / IN).
    candidates = Name.filter(
        Name.verbatim_type != None,
        Name.type >> None,
        Name.group << groups,
    ).limit(max_count)
    total = 0
    successes = 0
    for name in candidates:
        total += 1
        if name.detect_and_set_type(verbatim_type=name.verbatim_type, verbose=verbose):
            successes += 1
    print("Success: %d/%d" % (successes, total))
def add_page_described():
    """Interactively fill in page_described for names that have an original citation."""
    missing = Name.filter(
        Name.original_citation != None,
        Name.page_described >> None,
        Name.year != "in press",
    ).order_by(Name.original_citation)
    for name in missing:
        message = "Name %s is missing page described, but has original citation {%s}" % (
            name.description(),
            name.original_citation,
        )
        # "o" opens the source description; "s" stops the prompt loop.
        name.page_described = getinput.get_line(
            message,
            handlers={"o": lambda _: name.open_description()},
            should_stop=lambda line: line == "s",
        )
        name.save()
def detect_from_root_name(name, root_name):
    """Set name.type when exactly one genus under name's taxon has a matching stem."""
    matches = Name.filter(
        Name.group == db.constants.GROUP_GENUS,
        (Name.stem == root_name) | (Name.stem == root_name + "i"),
    )
    matches = [cand for cand in matches if cand.taxon.is_child_of(name.taxon)]
    if len(matches) != 1:
        # Zero or ambiguous candidates: leave the name untouched.
        return False
    print("Detected type for name %s: %s" % (name, matches[0]))
    name.type = matches[0]
    name.save()
    return True
def detect_stems():
    """Infer stem and gender for genus-group names that currently lack a stem."""
    for name in Name.filter(Name.group == db.constants.GROUP_GENUS, Name.stem >> None):
        inferred = db.detection.detect_stem_and_gender(name.root_name)
        if inferred is None:
            continue
        if not inferred.confident:
            # Low-confidence inference: ask the user before accepting it.
            print("%s: stem %s, gender %s" % (name.description(), inferred.stem, inferred.gender))
            if not getinput.yes_no("Is this correct? "):
                continue
        print("Inferred stem and gender for %s: %s, %s" % (name, inferred.stem, inferred.gender))
        name.stem = inferred.stem
        name.gender = inferred.gender
        name.save()
def add_types():
    """Interactively collect verbatim types for recent genus-group names with citations."""
    # Year is stored as a string (it can be "in press"), so the comparison is lexical.
    missing = Name.filter(
        Name.original_citation != None,
        Name.type >> None,
        Name.year > "1930",
        Name.group == db.constants.GROUP_GENUS,
    ).order_by(Name.original_citation)
    for name in missing:
        name.taxon.display(full=True, max_depth=1)
        message = "Name %s is missing type, but has original citation {%s}" % (
            name.description(),
            name.original_citation,
        )
        verbatim_type = getinput.get_line(
            message,
            handlers={"o": lambda _: name.open_description()},
            should_stop=lambda line: line == "s",
        )
        if verbatim_type is not None:
            name.detect_and_set_type(verbatim_type, verbose=True)
def add_original_names():
    """Interactively fill in original_name (and page_described) where a citation exists."""
    for name in Name.select():
        # Only names that have a citation but no original name need attention.
        if not name.original_citation or name.original_name:
            continue
        message = u"Name {} is missing an original name, but has original citation {{{}}}:{}".format(
            name.description(), name.original_citation, name.page_described
        )
        name.original_name = getinput.get_line(
            message, handlers={"o": lambda _: name.open_description()}
        )
        if not name.page_described:
            name.page_described = getinput.get_line(
                "Enter page described",
                handlers={"o": lambda _: name.open_description()},
                should_stop=lambda line: line == "s",
            )
        name.save()
def check_refs():
    """Warn about invalid citations; promote verbatim citations that resolve to real references."""
    for name in Name.select():
        # if there is an original_citation, check whether it is valid
        if name.original_citation:
            if not cite_exists(name.original_citation):
                print("Name:", name.description())
                print("Warning: invalid original citation:", name.original_citation)
            continue
        if not (name.verbatim_citation and may_be_citation(name.verbatim_citation)):
            continue
        if cite_exists(name.verbatim_citation):
            # The verbatim citation resolves: move it into original_citation.
            name.original_citation = name.verbatim_citation
            name.verbatim_citation = None
            name.save()
        elif must_be_citation(name.verbatim_citation):
            print("Name:", name.description())
            print("Warning: invalid citation:", name.verbatim_citation)
def root_name_mismatch():
    """Yield family-group names whose root name disagrees with their type's stem.

    Root names that differ from the stem only by a removable suffix are
    autocorrected in place instead of being yielded.
    """
    typed_family_names = Name.filter(Name.group == db.constants.GROUP_FAMILY, ~(Name.type >> None))
    for name in typed_family_names:
        if name.is_unavailable():
            continue
        stem_name = name.type.stem
        if stem_name is None or name.root_name == stem_name:
            continue
        for stripped in db.helpers.name_with_suffixes_removed(name.root_name):
            if stem_name in (stripped, stripped + "i"):
                print("Autocorrecting root name: %s -> %s" % (name.root_name, stem_name))
                name.root_name = stem_name
                name.save()
                break
        if name.root_name != stem_name:
            print("Stem mismatch for %s: %s vs. %s" % (name, name.root_name, stem_name))
            yield name
def detect_types_from_root_names(max_count=None):
    """Detects types for family-group names on the basis of the root_name."""
    count = 0
    successful_count = 0
    candidates = (
        Name.filter(Name.group == db.constants.GROUP_FAMILY, Name.type >> None)
        .order_by(Name.id.desc())
        .limit(max_count)
    )
    for name in candidates:
        if name.is_unavailable():
            continue
        count += 1
        # Reuse the module-level detect_from_root_name helper; the previous
        # nested copy duplicated it verbatim.
        if detect_from_root_name(name, name.root_name):
            successful_count += 1
        else:
            # Try again with common suffixes stripped off the root name.
            for stripped in db.helpers.name_with_suffixes_removed(name.root_name):
                if detect_from_root_name(name, stripped):
                    successful_count += 1
                    break
            else:
                print("Could not detect type for name %s (root_name = %s)" % (name, name.root_name))
    print("Success: %d/%d" % (successful_count, count))
def dup_names():
    """Group names by (original_name, year); names missing either field are skipped.

    Returns the grouping wrapped in a one-element list.
    """
    by_original_and_year = collections.defaultdict(list)
    for name in Name.select():
        key = (name.original_name, name.year)
        if key[0] is not None and key[1] is not None:
            by_original_and_year[key].append(name)
    return [by_original_and_year]
def dup_genus():
    """Group genus-group names by their "root_name authority, year" string.

    Returns the grouping wrapped in a one-element list.
    """
    by_full_name = collections.defaultdict(list)
    for name in Name.filter(Name.group == db.constants.GROUP_GENUS):
        key = "%s %s, %s" % (name.root_name, name.authority, name.year)
        by_full_name[key].append(name)
    return [by_full_name]
def endswith(end):
    """Return genus-group names whose root name ends with *end* (SQL LIKE match)."""
    # The doubled %% produces a single literal '%' wildcard; peewee's % operator is LIKE.
    pattern = "%%%s" % end
    matches = Name.filter(Name.group == db.constants.GROUP_GENUS, Name.root_name % pattern)
    return list(matches)
def n(name):
    """Finds names with the given root name or original name."""
    matches = Name.filter((Name.root_name == name) | (Name.original_name == name))
    return list(matches)
def bad_taxa():
    """Return names whose taxon_id is missing or points at a nonexistent taxon."""
    sql = "SELECT * FROM name WHERE taxon_id IS NULL or taxon_id NOT IN (SELECT id FROM taxon)"
    return Name.raw(sql)
def read_file(filename):
    """Import taxa and names from a CSV file.

    Cell A1 may hold the valid name of the root taxon's parent; when present,
    new top-level taxa are attached beneath it. Returns True if every row
    parsed without error, False otherwise (errors are logged and skipped).
    """
    with codecs.open(filename, mode='r') as file:
        reader = csv.reader(file)
        # Python 3: csv reader objects have no .next() method (the original
        # `reader.next()` raised AttributeError); use the next() builtin.
        first_line = next(reader)

        # name of parent of root taxon should be in cell A1
        root_name = first_line[0]
        if root_name:
            root_parent = Taxon.filter(Taxon.valid_name == root_name)[0]
            # maintain stack of taxa that are parents of the current taxon
            stack = [root_parent]
        else:
            stack = []
        # current valid taxon (for synonyms)
        current_valid = None
        # whether current taxon should be marked as root of a page
        is_page_root = True
        error_occurred = False
        for row in reader:
            try:
                # ignore blank rows
                if row[3] == '' and row[0] == '':
                    continue
                data = parse_row(row)
                if data['status'] == STATUS_VALID:
                    # get stuff off the stack
                    rank = data['rank']
                    # TODO: make this somehow unranked-clade-aware
                    while len(stack) > 0 and rank >= stack[-1].rank:
                        stack.pop()
                    # create new Taxon
                    current_valid = Taxon.create(
                        valid_name=data['valid_name'], age=data['age'],
                        rank=data['rank'], is_page_root=is_page_root,
                        comments=data['comments_taxon'], data=data['data_taxon'])
                    if len(stack) > 0:
                        current_valid.parent = stack[-1]
                    if is_page_root:
                        is_page_root = False
                    stack.append(current_valid)
                # create new Name
                data['taxon'] = current_valid
                assert current_valid.valid_name == data['valid_name'], \
                    "Valid name %s does not match expected %s" % (
                        data['valid_name'], current_valid.valid_name)
                data['data'] = helpers.fix_data(data['data'])
                # Detect whether a name object is already present (Principle of Coordination)
                nm = None
                if data['root_name'][0:4] == 'see ':
                    seen = data['root_name'][4:]
                    nm = Taxon.get(Taxon.valid_name == seen).base_name
                # create a new Name if none was found
                if nm is None:
                    nm = Name.create(**data)
                # set base_name field
                if data['status'] == STATUS_VALID:
                    current_valid.base_name = nm
            except Exception:
                # Deliberately best-effort: log the failure and continue with
                # the next row.
                traceback.print_exc()
                print('Error parsing row: %s' % row)
                error_occurred = True
    return not error_occurred