Exemple #1
0
    def __init__(self):
        """Initialise the extractor by running NewEntityExtractor's initialiser."""
        NewEntityExtractor.__init__(self)

    def getTaskName(self):
        """Return the task name string for this extractor."""
        task_name = "extract-places"
        return task_name

    def process_name(self, name_elt):
        """
        Handle a <name> element, treating it as a place only when its "type"
        attribute is "place".

        Some texts, like the hand-tagged version of Herodotus in English, may
        have <name type="place"> tags instead of <placeName> tags. Happily,
        attributes of either tag should be identical, so such an element is
        handed to process_placeName unchanged.

        Returns whatever process_placeName returns, or None for any other
        kind of name.
        """
        # Only place-typed names are of interest here.
        if name_elt.getAttribute("type") == "place":
            return self.process_placeName(name_elt)
        return None
    def process_placeName(self, place_elt):
        """
        Resolve a place element to an entity known to self.manager.

        Lookup proceeds in two stages:

        1. If the "key" attribute is non-empty, its first ";"-separated
           component is looked up with manager.getEntityByAuthName.
        2. If there is no key, or that lookup returns None, the "n"
           attribute, the "reg" attribute and the element's text content
           are tried in that order with manager.getMatchingEntities; the
           first match for the first candidate that has any is returned.

        Returns the matched entity, or None (after printing a message) when
        nothing matches.
        """
        key = place_elt.getAttribute("key").split(";")[0]
        if key:
            place = self.manager.getEntityByAuthName(key)
            if place is not None:
                return place

        child_text = place_elt.getTextContent()
        candidates = [
            place_elt.getAttribute("n"),
            place_elt.getAttribute("reg"),
            child_text,
        ]
        for candidate in candidates:
            # An empty candidate (e.g. an attribute that is not present)
            # cannot identify a place, so don't query the manager with it.
            if not candidate:
                continue
            matches = self.manager.getMatchingEntities(candidate, Place)
            if len(matches) > 0:
                return matches[0]

        print("Couldn't match place '%s'. :(" % child_text)
        return None

    def get_entity_classes(self):
        """Return the entity classes for this extractor: a list holding only Place."""
        entity_classes = [Place]
        return entity_classes

if __name__ == "__main__":
    # Feed every command-line argument to the extractor; with no arguments,
    # process the corpus instead.
    npe = NewPlaceExtractor()
    targets = sys.argv[1:]
    if targets:
        for target in targets:
            npe.processAnything(target)
    else:
        npe.processCorpus()
Exemple #2
0
    def __init__(self):
        """Initialise the extractor and compile the pattern used on "reg" values."""
        NewEntityExtractor.__init__(self)

        # Captures, as group "name", the text between the first ":" and the
        # next ":" (or the end of the string if there is no second one).
        name_pattern = ".*?:(?P<name>[^:]*)(?::.*)?"
        self.person_re = re.compile(name_pattern)

    def getTaskName(self):
        """Return the task name string for this extractor."""
        task_name = "extract-people"
        return task_name

    def process_persName(self, element):
        """
        Build a Person from an element's "reg" attribute.

        The name portion of "reg" is extracted with self.person_re (group
        "name") and split on ",": the first piece becomes the surname, and
        every later piece that is not blank and not "nomatch" becomes a
        forename.

        Returns the new Person, or None (after printing a message) when
        "reg" is empty or does not fit self.person_re.
        """
        reg = element.getAttribute("reg")
        if reg == "":
            print("Empty reg attribute for %s! Skipping..." %
                  element.getTextContent())
            return None

        match_result = self.person_re.match(reg)
        if match_result is None:
            # re.match returns None for a value the pattern does not fit;
            # skip it like an empty value rather than crashing on .group().
            print("Unparseable reg attribute '%s'! Skipping..." % reg)
            return None

        name_tokens = match_result.group("name").split(",")
        surname = name_tokens[0]
        forenames = [token for token in name_tokens[1:]
                     if token != "nomatch" and token.strip()]

        person = Person()
        for forename in forenames:
            person.addName(person.FORENAME, forename)
        person.addName(person.SURNAME, surname)

        print("%s -> %s" % (element.getTextContent(), person.getDisplayName()))
        return person

    def get_entity_classes(self):
        """Return the entity classes for this extractor: a list holding only Person."""
        entity_classes = [Person]
        return entity_classes

if __name__ == "__main__":
    # Feed every command-line argument to the extractor; with no arguments,
    # process the corpus instead.
    conv = NewPersonExtractor()
    targets = sys.argv[1:]
    if targets:
        for target in targets:
            conv.processAnything(target)
    else:
        conv.processCorpus()
Exemple #3
0
    def __init__(self):
        """Initialise the extractor with a CivilWarDateParser and an empty bad-date list."""
        NewEntityExtractor.__init__(self)
        # Parser used by the process_* methods of this class.
        self.parser = CivilWarDateParser()
        # Filled by the process_* methods with the date strings that could
        # not be parsed, each paired with a tag naming where it came from.
        self.bad_dates = []
    
    def getTaskName(self):
        """Return the task name string for this extractor."""
        task_name = "extract-dates"
        return task_name

    def process_date(self, element):
        """
        Parse the date carried by an element.

        The date string is taken from the "value" attribute; when that is
        empty, the values of the text nodes among the children listed by
        entity_utils.children_as_list are concatenated instead.

        Returns the result of self.parser.parse, or None if parsing raises
        NumberFormatException (the string is then appended to
        self.bad_dates, tagged "date"). Raises IllegalArgumentException when
        no date text is found at all.
        """
        date_text = element.getAttribute("value")
        if date_text == "":
            text_parts = []
            for child in entity_utils.children_as_list(element):
                if child.getNodeType() == child.TEXT_NODE:
                    text_parts.append(child.getNodeValue())
            date_text = "".join(text_parts)

        if not date_text:
            raise IllegalArgumentException

        try:
            parsed = self.parser.parse(date_text)
        except NumberFormatException:
            print("Problem parsing date: %s" % date_text)
            self.bad_dates.append((date_text, "date"))
            return None
        return parsed

    def process_dateStruct(self, element):
        """
        Turn a structured date element into a date object.

        A non-empty "value" attribute is parsed with self.parser first. If
        the attribute is empty, or parsing raises NumberFormatException (the
        value is then appended to self.bad_dates, tagged
        "dateStruct_value"), a Date is assembled from the child elements
        whose local names are "year", "month" or "day", using each one's
        "reg" attribute; a part with no such child stays 0.
        """
        date_value = element.getAttribute("value")
        if date_value != "":
            try:
                return self.parser.parse(date_value)
            except NumberFormatException:
                print("Problem parsing date-struct: %s" % date_value)
                self.bad_dates.append((date_value, "dateStruct_value"))

        # Fall back to the individual year/month/day child elements.
        parts = {"year": 0, "month": 0, "day": 0}
        for child in entity_utils.children_as_list(element):
            if child.getNodeType() != child.ELEMENT_NODE:
                continue
            part_name = child.getLocalName()
            if part_name in parts:
                parts[part_name] = child.getAttribute("reg")

        return Date(parts["year"], parts["month"], parts["day"])

    def process_dateRange(self, element):
        """
        Build a DateRange from an element's "from" and "to" attributes.

        Both attribute values are parsed with self.parser. Returns the
        DateRange, or None if NumberFormatException is raised (the
        (from, to) pair is then appended to self.bad_dates, tagged
        "dateRange").
        """
        start = element.getAttribute("from")
        end = element.getAttribute("to")
        try:
            parsed_start = self.parser.parse(start)
            parsed_end = self.parser.parse(end)
            return DateRange(parsed_start, parsed_end)
        except NumberFormatException:
            print("Problem parsing date-range: %s, %s" % (start, end))
            self.bad_dates.append(((start, end), "dateRange"))
        return None

    def get_entity_classes(self):
        """Return the entity classes for this extractor: Date and DateRange."""
        entity_classes = [Date, DateRange]
        return entity_classes

if __name__ == "__main__":
    # Feed every command-line argument to the extractor; with no arguments,
    # process the corpus instead.
    nde = NewDateExtractor()
    targets = sys.argv[1:]
    if targets:
        for target in targets:
            nde.processAnything(target)
    else:
        nde.processCorpus()

    # NOTE(review): the report below is assumed to run after either branch;
    # the source indentation was ambiguous - confirm.
    print("The following date strings couldn't be parsed:")
    for bad_date in nde.bad_dates:
        print(bad_date)