# NOTE(review): the defs below take `self` and appear to be the methods of the
# extractor class (presumably NewPlaceExtractor, which the __main__ guard
# instantiates). Its `class` line is outside this excerpt -- confirm the
# nesting when splicing this back into the file.
def __init__(self):
    """
    Run the NewEntityExtractor initializer; no extra state is set up here.
    """
    NewEntityExtractor.__init__(self)


def getTaskName(self):
    """
    Return this extractor's task name, "extract-places".
    """
    return "extract-places"


def process_name(self, name_elt):
    """
    Handle a <name> element that may stand in for a <placeName>.

    Some texts, like the hand-tagged version of Herodotus in English, may
    have <name type="place"> tags instead of <placeName> tags. Happily,
    attributes of either tag should be identical.

    Returns the result of process_placeName() when the element's "type"
    attribute is "place", and None for every other type.
    """
    # Named name_type rather than `type` so the builtin is not shadowed.
    name_type = name_elt.getAttribute("type")
    if name_type != "place":
        return None
    return self.process_placeName(name_elt)


def process_placeName(self, place_elt):
    """
    Resolve a place element to an entity known to self.manager.

    Lookup order:
      1. the first ";"-separated piece of the "key" attribute, via
         manager.getEntityByAuthName();
      2. the "n" attribute, the "reg" attribute and the element's text
         content, in that order, via manager.getMatchingEntities(); the
         first match of the first candidate that matches wins.

    Returns the entity found, or None (after printing a message) when
    nothing matches.
    """
    key = place_elt.getAttribute("key").split(";")[0]
    if key != "":
        place = self.manager.getEntityByAuthName(key)
        if place is not None:
            return place

    candidates = [
        place_elt.getAttribute("n"),
        place_elt.getAttribute("reg"),
        place_elt.getTextContent(),
    ]
    for text in candidates:
        # A missing attribute shows up as an empty string (the same
        # convention the key check above relies on); matching on it is
        # pointless and could return an unrelated entity, so skip it.
        if not text:
            continue
        matches = self.manager.getMatchingEntities(text, Place)
        if len(matches) > 0:
            return matches[0]

    print("Couldn't match place '%s'. :(" % place_elt.getTextContent())
    return None


def get_entity_classes(self):
    """
    Return the entity classes this extractor produces: [Place].
    """
    return [Place]


if __name__ == "__main__":
    # Script entry point: hand every command-line argument to
    # processAnything(), or call processCorpus() when none are given.
    npe = NewPlaceExtractor()
    targets = sys.argv[1:]
    if targets:
        for target in targets:
            npe.processAnything(target)
    else:
        npe.processCorpus()
# NOTE(review): the defs below take `self` and appear to be the methods of the
# extractor class (presumably NewPersonExtractor, which the __main__ guard
# instantiates). Its `class` line is outside this excerpt -- confirm the
# nesting when splicing this back into the file.
def __init__(self):
    """
    Run the NewEntityExtractor initializer and compile the "reg" pattern.
    """
    NewEntityExtractor.__init__(self)
    # Group "name" captures the text that follows the first colon, up to
    # the next colon or the end of the string.
    self.person_re = re.compile(".*?:(?P<name>[^:]*)(?::.*)?")


def getTaskName(self):
    """
    Return this extractor's task name, "extract-people".
    """
    return "extract-people"


def process_persName(self, element):
    """
    Build a Person from the "reg" attribute of a persName element.

    The name part of "reg" (see self.person_re) is split on commas: the
    first token becomes the surname and the remaining tokens become
    forenames, ignoring blank tokens and the literal token "nomatch".

    Returns the new Person, or None (after printing a message) when "reg"
    is empty or does not contain the colon the pattern requires.
    """
    reg = element.getAttribute("reg")
    if reg == "":
        print("Empty reg attribute for %s! Skipping..." %
              element.getTextContent())
        return None

    match_result = self.person_re.match(reg)
    if match_result is None:
        # The pattern needs at least one colon; without this guard the
        # .group() call below raised AttributeError on such values.
        print("Unparseable reg attribute '%s' for %s! Skipping..." %
              (reg, element.getTextContent()))
        return None

    name_tokens = match_result.group("name").split(",")
    surname = name_tokens[0]

    person = Person()
    for forename in name_tokens[1:]:
        # Forenames are stored as written (unstripped); only the test for
        # blankness ignores surrounding whitespace.
        if forename != "nomatch" and forename.strip():
            person.addName(person.FORENAME, forename)
    person.addName(person.SURNAME, surname)

    print("%s -> %s" % (element.getTextContent(), person.getDisplayName()))
    return person


def get_entity_classes(self):
    """
    Return the entity classes this extractor produces: [Person].
    """
    return [Person]


if __name__ == "__main__":
    # Script entry point: hand every command-line argument to
    # processAnything(), or call processCorpus() when none are given.
    conv = NewPersonExtractor()
    targets = sys.argv[1:]
    if targets:
        for target in targets:
            conv.processAnything(target)
    else:
        conv.processCorpus()
# NOTE(review): the defs below take `self` and appear to be the methods of the
# extractor class (presumably NewDateExtractor, which the __main__ guard
# instantiates). Its `class` line is outside this excerpt -- confirm the
# nesting when splicing this back into the file.
def __init__(self):
    """
    Run the NewEntityExtractor initializer, create the CivilWarDateParser
    used for all parsing, and start with an empty bad_dates list.
    """
    NewEntityExtractor.__init__(self)
    self.parser = CivilWarDateParser()
    # Collects (unparseable input, source tag) tuples for later reporting.
    self.bad_dates = []


def getTaskName(self):
    """
    Return this extractor's task name, "extract-dates".
    """
    return "extract-dates"


def process_date(self, element):
    """
    Parse a date element.

    The text to parse is the "value" attribute or, when that is empty, the
    concatenation of the element's direct text-node children.

    Returns the parser's result, or None when the parser raises
    NumberFormatException (the text is then recorded in self.bad_dates).
    Raises IllegalArgumentException when there is no text to parse.
    """
    text = element.getAttribute("value")
    if text == "":
        pieces = []
        for node in entity_utils.children_as_list(element):
            if node.getNodeType() == node.TEXT_NODE:
                pieces.append(node.getNodeValue())
        text = "".join(pieces)

    if text == "" or text is None:
        raise IllegalArgumentException

    try:
        return self.parser.parse(text)
    except NumberFormatException:
        print("Problem parsing date: %s" % text)
        self.bad_dates.append((text, "date"))
        return None


def process_dateStruct(self, element):
    """
    Parse a dateStruct element.

    A non-empty "value" attribute is parsed first. If that attribute is
    empty, or parsing it raises NumberFormatException (which is recorded in
    self.bad_dates), the date is assembled from the "reg" attributes of the
    child elements whose local names are "year", "month" and "day"; parts
    with no such child stay 0.

    Returns the parsed value, or a Date built from the collected parts.
    """
    raw_value = element.getAttribute("value")
    if raw_value != "":
        try:
            return self.parser.parse(raw_value)
        except NumberFormatException:
            print("Problem parsing date-struct: %s" % raw_value)
            self.bad_dates.append((raw_value, "dateStruct_value"))
            # Deliberately no return: fall through to the child elements.

    parts = {"year": 0, "month": 0, "day": 0}
    for child in entity_utils.children_as_list(element):
        if child.getNodeType() != child.ELEMENT_NODE:
            continue
        part_name = child.getLocalName()
        if part_name in parts:
            parts[part_name] = child.getAttribute("reg")

    return Date(parts["year"], parts["month"], parts["day"])


def process_dateRange(self, element):
    """
    Parse a dateRange element from its "from" and "to" attributes.

    Returns a DateRange of the two parsed values, or None when
    NumberFormatException is raised (the attribute pair is then recorded in
    self.bad_dates).
    """
    start = element.getAttribute("from")
    end = element.getAttribute("to")
    try:
        range_start = self.parser.parse(start)
        range_end = self.parser.parse(end)
        return DateRange(range_start, range_end)
    except NumberFormatException:
        print("Problem parsing date-range: %s, %s" % (start, end))
        self.bad_dates.append(((start, end), "dateRange"))
        return None


def get_entity_classes(self):
    """
    Return the entity classes this extractor produces: [Date, DateRange].
    """
    return [Date, DateRange]


if __name__ == "__main__":
    # Script entry point: hand every command-line argument to
    # processAnything(), or call processCorpus() when none are given.
    nde = NewDateExtractor()
    targets = sys.argv[1:]
    if targets:
        for target in targets:
            nde.processAnything(target)
    else:
        nde.processCorpus()

    # NOTE(review): the original line layout was lost; this report is
    # assumed to run after either branch above -- confirm.
    print("The following date strings couldn't be parsed:")
    for bad_date in nde.bad_dates:
        print(bad_date)