def __init__(self, eid, phr, cc, is_valid):
    """
    Initialise the taxon from an entity ID, a phrase and a validity flag.

    :param eid: entity ID; stored as ``entity_id`` and also used as the
        taxon ``name``.
    :param phr: phrase text; stored as ``phrase`` and, lower-cased, as
        ``phrasenorm``.
    :param cc: stored as ``country`` (presumably a country code -- confirm
        against the caller).
    :param is_valid: validity flag, stored unchanged.
    """
    Taxon.__init__(self)

    # The entity ID doubles as the taxon ID/name.
    self.entity_id = self.name = eid

    # Phrase as given, plus its lower-cased form.
    self.phrase = phr
    self.phrasenorm = phr.lower()

    self.country = cc
    self.is_valid = is_valid

    # Tags are built last, once every other attribute is in place.
    self.tags = self._make_tags()
def __init__(self, eid, variant_id, etype, lang, primary_name, ename):
    """
    Initialise the taxon for one name variant of a JRC entity.

    The entity type starts as the upper-cased ``etype`` and is then
    corrected in this order: an explicit per-phrase override from
    ``FIXES``; the token-based default fixes (only when
    ``apply_default_fixes`` is set); and finally a remap through
    ``entity_map`` when the resulting type is a key there.

    :param eid: JRC original entity ID.
    :param variant_id: variant ID, stored unchanged before ``_make_id()``
        is called.
    :param etype: JRC original entity type; upper-cased before use.
    :param lang: stored unchanged as ``lang``.
    :param primary_name: used to build the taxon name
        ``<entity_type>.<primary_name>``.
    :param ename: the name variant; stored as ``phrase`` and, lower-cased,
        as ``phrasenorm``.
    """
    Taxon.__init__(self)

    # JRC original entity ID and type
    self.entity_id = eid
    self.entity_type = etype.upper()
    self.lang = lang

    self.phrase = ename
    self.phrasenorm = ename.lower()

    # Explicit per-phrase override of the entity type.
    if self.phrase in FIXES:
        self.entity_type = FIXES.get(self.phrase)

    if apply_default_fixes:
        tokens = self.phrasenorm.split()

        # An empty or whitespace-only name splits into no tokens; skip the
        # first/last-token checks instead of failing with IndexError.
        if tokens:
            first_tok, last_tok = tokens[0], tokens[-1]
            # NOTE: each message is printed as one pre-formatted string so
            # the output is the same whether ``print`` is the Python 2
            # statement or the Python 3 function.
            if last_tok in PLACE_ENDING_FIXES or first_tok in PLACE_STARTING_FIXES:
                # Place (T=terrain)
                self.entity_type = 'T'
                print("Place Phrase fixed %s" % (self.phrase,))
            elif last_tok in ignore_provinces:
                self.entity_type = 'T'
                print("Ignore Province name in token %s" % (self.phrase,))

        # A person ('P') whose name contains any ORG_FIXES token is
        # re-typed as 'O'.
        if self.entity_type == 'P' and any(tok in ORG_FIXES for tok in tokens):
            self.entity_type = 'O'
            print("Org Phrase fixed %s" % (self.phrase,))

    # Remap the (possibly corrected) type code when a mapping exists.
    if self.entity_type in entity_map:
        self.entity_type = entity_map[self.entity_type]

    self.variant_id = variant_id
    # solr record ID:
    self.id = self._make_id()
    self.is_valid = True
    self.is_acronym = ename.isupper() and is_ascii(ename)
    # taxon ID/name:
    self.name = '%s.%s' % (self.entity_type, primary_name)
    self.tags = self._make_tags()
def create_entity(name):
    '''
    Build a generic person-name taxon for ``name``, as opposed to a taxon
    for one particular personality/celebrity.

    The name is stripped and lower-cased; the result is used for the
    taxon name (``person_name.<name>``), the phrase and the normalised
    phrase. The taxon is marked valid and given an empty tag list.

    :param name: the person name text.
    :return: the populated Taxon.
    '''
    entity = Taxon()
    cleaned = name.strip().lower()

    entity.name = 'person_name.{}'.format(cleaned)
    entity.phrase = cleaned
    # Nothing more to normalize: the phrase is already stripped and lower-cased.
    entity.phrasenorm = entity.phrase
    entity.is_valid = True
    entity.tags = []

    # Disabled filter, kept for reference:
    # if cleaned in non_person_names:
    #     entity.is_valid = False
    return entity