Beispiel #1
0
 def __init__(self, session, node, parent):
     """Initialise the extractor and decide which entity types to keep.

     Reads the ``entityTypes`` setting, splits it on whitespace and maps
     each lower-cased entry, by prefix, onto one of the labels stored in
     ``self.types``.  When the setting is empty or absent all three
     labels are used.  The ``pos`` setting (fallback argument 0) is
     stored in ``self.keepPos``.

     Raises MissingDependencyException when ``nltk`` is None and
     ConfigFileException for an entry that matches no known prefix.
     """
     SimpleExtractor.__init__(self, session, node, parent)
     if nltk is None:
         raise MissingDependencyException(self.objectType, "nltk")
     # Accepted prefixes for each label, tried in this order
     prefix_table = (
         (("pe",), "PERSON"),
         (("pl", "g"), "GPE"),
         (("org", "co"), "ORGANIZATION"),
     )
     # Load types from config
     configured = self.get_setting(session, "entityTypes")
     if not configured:
         # Default to all
         self.types = ["PERSON", "GPE", "ORGANIZATION"]
     else:
         self.types = []
         for entry in configured.split():
             entry = entry.lower()
             for prefixes, label in prefix_table:
                 if entry.startswith(prefixes):
                     self.types.append(label)
                     break
             else:
                 # No prefix matched this entry
                 raise ConfigFileException(
                     "Unknown entity type setting {0} on {1} {2}".format(
                         entry, self.__class__.__name__, self.id
                     )
                 )
     # Should we keep the /POS tag or strip it
     self.keepPos = self.get_setting(session, "pos", 0)
Beispiel #2
0
 def __init__(self, session, node, parent):
     """Set up the extractor: dependency check, entity types, POS flag.

     ``self.types`` is built from the whitespace-separated
     ``entityTypes`` setting; each entry is lower-cased and matched by
     prefix against a fixed list.  An empty or missing setting selects
     every label.  ``self.keepPos`` receives the ``pos`` setting
     (fallback argument 0).

     Raises MissingDependencyException if ``nltk`` is None, and
     ConfigFileException if an entry matches none of the prefixes.
     """
     SimpleExtractor.__init__(self, session, node, parent)
     if nltk is None:
         raise MissingDependencyException(self.objectType, 'nltk')
     # (prefix, label) pairs; the first matching prefix wins
     known_prefixes = (
         ('pe', 'PERSON'),
         ('pl', 'GPE'),
         ('g', 'GPE'),
         ('org', 'ORGANIZATION'),
         ('co', 'ORGANIZATION'),
     )
     # Load types from config
     setting = self.get_setting(session, 'entityTypes')
     if setting:
         self.types = []
         for word in setting.split():
             word = word.lower()
             label = next(
                 (lbl for pfx, lbl in known_prefixes if word.startswith(pfx)),
                 None
             )
             if label is None:
                 template = "Unknown entity type setting {0} on {1} {2}"
                 raise ConfigFileException(
                     template.format(word, self.__class__.__name__, self.id)
                 )
             self.types.append(label)
     else:
         # Default to all
         self.types = ['PERSON', 'GPE', 'ORGANIZATION']
     # Should we keep the /POS tag or strip it
     self.keepPos = self.get_setting(session, 'pos', 0)
 def __init__(self, session, node, parent):
     """Configure which entity types are kept and whether POS is kept.

     The ``entityTypes`` setting is split on whitespace; every entry is
     lower-cased and translated, by prefix, into a label appended to
     ``self.types``.  With no setting, all three labels are selected.
     The ``pos`` setting (fallback argument 0) goes to ``self.keepPos``.

     Raises ConfigFileException for an entry with an unrecognised prefix.
     """
     SimpleExtractor.__init__(self, session, node, parent)
     # Load types from config
     requested = self.get_setting(session, 'entityTypes')
     if not requested:
         # Default to all
         self.types = ['PERSON', 'GPE', 'ORGANIZATION']
     else:
         self.types = []
         for token in requested.split():
             token = token.lower()
             # Pick the label first, append once at the end
             if token.startswith('pe'):
                 label = 'PERSON'
             elif token.startswith('pl') or token.startswith('g'):
                 label = 'GPE'
             elif token.startswith('org') or token.startswith('co'):
                 label = 'ORGANIZATION'
             else:
                 problem = "Unknown entity type setting {0} on {1} {2}".format(
                     token, self.__class__.__name__, self.id)
                 raise ConfigFileException(problem)
             self.types.append(label)
     # Should we keep the /POS tag or strip it
     self.keepPos = self.get_setting(session, 'pos', 0)
Beispiel #4
0
    def __init__(self, session, config, parent):
        """Read the XPath and output-template settings for this extractor.

        Stores on the instance:
        ``xpaths``   -- the ``subXpaths`` setting split on single spaces,
                        each piece then split on ``|`` into a list
        ``xpath``    -- the ``xpath`` setting
        ``template`` -- the ``template`` setting

        NOTE(review): judging by the default value, each ``subXpaths``
        entry looks like ``name|xpath|fallback`` -- confirm against the
        code that consumes ``self.xpaths``.
        """
        SimpleExtractor.__init__(self, session, config, parent)
        # default:  <w p="POS" s="STEM" o="OFFSET">TEXT</w>
        #     -->   TEXT/POS/STEM/OFFSET
        default_sub_xpaths = 'word|./text()| pos|./@p|XX stem|./@s|./text() offset|./@o|-1'
        default_template = '%(word)s/%(pos)s/%(stem)s/%(offset)s'

        # XXX Can we xpathProcessor-ify these xpaths?
        # too computationally expensive to bother?
        raw_spec = self.get_setting(session, 'subXpaths', default_sub_xpaths)
        parsed = []
        for entry in raw_spec.split(' '):
            parsed.append(entry.split('|'))
        self.xpaths = parsed
        self.xpath = self.get_setting(session, 'xpath', 'toks/w')
        self.template = self.get_setting(session, 'template', default_template)
Beispiel #5
0
    def __init__(self, session, config, parent):
        """Initialise from the ``subXpaths``, ``xpath`` and ``template`` settings.

        ``self.xpaths`` becomes a list of lists: the ``subXpaths`` string
        is cut on single spaces and every piece is cut again on ``|``.
        ``self.xpath`` and ``self.template`` hold the corresponding
        settings, each with the fallback passed below.
        """
        SimpleExtractor.__init__(self, session, config, parent)
        # default:  <w p="POS" s="STEM" o="OFFSET">TEXT</w>
        #     -->   TEXT/POS/STEM/OFFSET

        # XXX Can we xpathProcessor-ify these xpaths?
        # too computationally expensive to bother?
        self.xpaths = [
            spec.split('|')
            for spec in self.get_setting(
                session,
                'subXpaths',
                'word|./text()| pos|./@p|XX stem|./@s|./text() offset|./@o|-1'
            ).split(' ')
        ]
        self.xpath = self.get_setting(session, 'xpath', 'toks/w')
        self.template = self.get_setting(
            session,
            'template',
            '%(word)s/%(pos)s/%(stem)s/%(offset)s'
        )
Beispiel #6
0
 def process_eventList(self, session, data):
     """Extract from an event list, then post-process the result.

     Delegates to ``SimpleExtractor.process_eventList`` and returns
     whatever ``self._process_simpleHash`` makes of that output.
     """
     base_impl = SimpleExtractor.process_eventList
     extracted = base_impl(self, session, data)
     return self._process_simpleHash(extracted)
Beispiel #7
0
 def process_string(self, session, data):
     """Extract from a string, then post-process the result.

     Delegates to ``SimpleExtractor.process_string`` and returns
     whatever ``self._process_simpleHash`` makes of that output.
     """
     base_impl = SimpleExtractor.process_string
     extracted = base_impl(self, session, data)
     return self._process_simpleHash(extracted)
Beispiel #8
0
 def __init__(self, session, config, parent):
     """Initialise the extractor and read its ``joinCharacter`` setting.

     The value is kept in ``self.jchr``; the fallback passed to
     ``get_setting`` is a two-space unicode string.
     """
     SimpleExtractor.__init__(self, session, config, parent)
     fallback_joiner = u'  '
     self.jchr = self.get_setting(session, 'joinCharacter', fallback_joiner)
Beispiel #9
0
 def __init__(self, session, config, parent):
     """Run the base initialiser, then store the join string in ``self.jchr``.

     The join string comes from the ``joinCharacter`` setting, with two
     spaces passed as the fallback.
     """
     SimpleExtractor.__init__(self, session, config, parent)
     joiner = self.get_setting(
         session,
         'joinCharacter',
         u'  '
     )
     self.jchr = joiner
Beispiel #10
0
 def process_string(self, session, data):
     """Return the base string extraction after ``_process_simpleHash``."""
     # Step 1: plain extraction by the parent class
     raw_terms = SimpleExtractor.process_string(self, session, data)
     # Step 2: this class's own post-processing
     processed = self._process_simpleHash(raw_terms)
     return processed
Beispiel #11
0
 def process_eventList(self, session, data):
     """Return the base event-list extraction after ``_process_simpleHash``."""
     # Step 1: plain extraction by the parent class
     raw_terms = SimpleExtractor.process_eventList(self, session, data)
     # Step 2: this class's own post-processing
     processed = self._process_simpleHash(raw_terms)
     return processed
Beispiel #12
0
 def __init__(self, session, config, parent):
     """Initialise the extractor and load its three settings.

     ``pos``, ``stem`` and ``offset`` are each read with ``get_setting``
     (fallback argument 0) and stored under the same attribute name.
     """
     SimpleExtractor.__init__(self, session, config, parent)
     # Same order as the settings were originally read: pos, stem, offset
     for option in ('pos', 'stem', 'offset'):
         setattr(self, option, self.get_setting(session, option, 0))
Beispiel #13
0
 def __init__(self, session, config, parent):
     """Run the base initialiser and read ``pos``, ``stem`` and ``offset``.

     Each setting is stored on the instance under its own name; 0 is
     passed to ``get_setting`` as the fallback for all three.
     """
     SimpleExtractor.__init__(self, session, config, parent)
     fallback = 0
     read = self.get_setting
     self.pos = read(session, 'pos', fallback)
     self.stem = read(session, 'stem', fallback)
     self.offset = read(session, 'offset', fallback)
Beispiel #14
0
 def __init__(self, session, config, parent):
     """Always fail: report ``rdflib`` as a missing dependency.

     The base initialiser runs first, then MissingDependencyException is
     raised unconditionally with this object's ``objectType``.
     """
     SimpleExtractor.__init__(self, session, config, parent)
     missing_package = 'rdflib'
     raise MissingDependencyException(self.objectType, missing_package)
Beispiel #15
0
 def __init__(self, session, config, parent):
     """Run the base initialiser, then unconditionally raise.

     Raises MissingDependencyException naming ``rdflib`` together with
     this object's ``objectType``.
     """
     SimpleExtractor.__init__(self, session, config, parent)
     failure = MissingDependencyException(self.objectType, 'rdflib')
     raise failure