def __init__(self, session, config, parent):
    """Initialise the pre-parser and validate the configured checksum type.

    Reads the 'sumType' setting (default argument 'md5'), stores it on
    ``self.sumType`` and checks that hashlib can build a hash object
    for that name.

    Raises:
        ConfigFileException: if hashlib.new() raises ValueError for the
            configured name; the message is the text of that error.
    """
    PreParser.__init__(self, session, config, parent)
    algorithm = self.get_setting(session, 'sumType', 'md5')
    self.sumType = algorithm
    try:
        # Probe only: the hash object itself is thrown away.
        hashlib.new(algorithm)
    except ValueError as err:
        raise ConfigFileException(str(err))
Exemple #2
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser and check the 'sumType' setting.

     The setting is read with 'md5' as its default argument and kept on
     ``self.sumType``.  hashlib.new() is called once with it purely as a
     validity check.

     Raises:
         ConfigFileException: carrying the text of the ValueError raised
             by hashlib.new() for the configured name.
     """
     PreParser.__init__(self, session, config, parent)
     self.sumType = self.get_setting(session, 'sumType', 'md5')
     try:
         hashlib.new(self.sumType)
     except ValueError as exc:
         message = str(exc)
         raise ConfigFileException(message)
Exemple #3
0
    def __init__(self, session, node, parent):
        """Initialise the pre-parser and start the external parser process.

        The working directory for the process is built from the
        'executablePath' path (or, when that is empty, from the directory
        of whatever ``which`` reports for the executable) joined with the
        'executable' path (default argument './parser').  The executable
        is launched through the shell from that directory and the process
        object is stored on ``self.pipe``.  The caller's working directory
        is restored afterwards, even if launching fails.

        Raises:
            ConfigFileException: if the computed path is empty.
        """
        PreParser.__init__(self, session, node, parent)

        tp = self.get_path(session, 'executablePath', '')
        exe = self.get_path(session, 'executable', './parser')
        if not tp:
            # No directory configured: ask the shell where the executable is.
            tp = commands.getoutput('which %s' % exe)
            tp = os.path.dirname(tp)

        tp = os.path.join(tp, exe)

        if not tp:
            raise ConfigFileException("%s requires the path: filePath" %
                                      self.id)

        # NOTE(review): tp includes the executable name yet is used as the
        # working directory below -- confirm that this is intended.
        # The current directory is saved exactly once, before the chdir, so
        # that the finally clause really returns to the caller's directory.
        original_cwd = os.getcwd()
        os.chdir(tp)
        try:
            self.pipe = Popen(exe,
                              shell=True,
                              bufsize=1,
                              stdin=PIPE,
                              stdout=PIPE,
                              stderr=PIPE)
        finally:
            os.chdir(original_cwd)
    def __init__(self, session, config, parent):
        """Initialise the pre-parser and compile its HTML regular expressions.

        Patterns are compiled for the <body>, <title>, <script> and <style>
        elements, for comments and for any tag.  All but the generic tag
        pattern are case-insensitive and let '.' match newlines.
        """
        PreParser.__init__(self, session, config, parent)
        # Indentation uses spaces only; mixing tabs and spaces in one block
        # is rejected by Python 3.
        self.body = re.compile('<body(.*?)</body>', re.S | re.I)
        self.tagstrip = re.compile('<[^>]+>')
        self.title = re.compile('<title[^>]*>(.+?)</title>', re.S | re.I)
        self.script = re.compile('<script(.*?)</script>', re.S | re.I)
        self.style = re.compile('<style(.*?)</style>', re.S | re.I)
        self.comment = re.compile('<!--(.*?)-->', re.S | re.I)
Exemple #5
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and compile its HTML regular expressions.

     Every pattern except ``tagstrip`` is compiled case-insensitively and
     with '.' matching newlines.
     """
     PreParser.__init__(self, session, config, parent)
     # Flags shared by all the element and comment patterns.
     multiline_nocase = re.S | re.I
     self.body = re.compile('<body(.*?)</body>', multiline_nocase)
     # Matches any single tag; compiled without flags.
     self.tagstrip = re.compile('<[^>]+>')
     self.title = re.compile('<title[^>]*>(.+?)</title>', multiline_nocase)
     self.script = re.compile('<script(.*?)</script>', multiline_nocase)
     self.style = re.compile('<style(.*?)</style>', multiline_nocase)
     self.comment = re.compile('<!--(.*?)-->', multiline_nocase)
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and compile its markup regular expressions.

     Patterns are compiled for DOCTYPE declarations, unquoted attribute
     values, processing instructions, element names and an ampersand
     followed by whitespace.  If the 'emptyElements' setting is present
     it is split on whitespace into ``self.emptyTags``; otherwise that
     attribute is not assigned here.
     """
     PreParser.__init__(self, session, config, parent)
     # Raw strings keep regex escapes such as \s and \? from being read as
     # (invalid) string escapes; the compiled patterns are unchanged.
     self.doctype_re = re.compile(r'''<!DOCTYPE\s+?(.+?)["'](.+?)["']>''')
     self.attr_re = re.compile(
         r' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])')
     self.pi_re = re.compile(r"<\?(.*?)\?>")
     self.elem_re = re.compile(r'(<[/]?)([a-zA-Z0-9_]+)')
     self.amp_re = re.compile(r'&(\s)')
     taglist = self.get_setting(session, 'emptyElements')
     if taglist:
         self.emptyTags = taglist.split()
Exemple #7
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and compile its markup regular expressions.

     Compiles patterns for DOCTYPE declarations, unquoted attribute
     values, processing instructions, element names and an ampersand
     followed by whitespace.  ``self.emptyTags`` is assigned only when
     the 'emptyElements' setting is present, as its whitespace-split
     value.
     """
     PreParser.__init__(self, session, config, parent)
     # Patterns are raw strings so that \s and \? reach the regex engine
     # without being treated as invalid string escapes.
     self.doctype_re = re.compile(r'''<!DOCTYPE\s+?(.+?)["'](.+?)["']>''')
     self.attr_re = re.compile(
         r' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])')
     self.pi_re = re.compile(r"<\?(.*?)\?>")
     self.elem_re = re.compile(r'(<[/]?)([a-zA-Z0-9_]+)')
     self.amp_re = re.compile(r'&(\s)')
     taglist = self.get_setting(session, 'emptyElements')
     if taglist:
         self.emptyTags = taglist.split()
Exemple #8
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser from the 'char', 'regexp' and 'keep' settings.

     ``self.regexp`` is compiled (with '.' matching newlines) only when
     the 'regexp' setting is truthy.  ``self.char`` is the 'char' setting
     or the empty string when that setting is falsy.
     """
     PreParser.__init__(self, session, config, parent)
     replacement = self.get_setting(session, 'char')
     pattern = self.get_setting(session, 'regexp')
     self.keep = self.get_setting(session, 'keep')
     if pattern:
         self.regexp = re.compile(pattern, re.S)
     self.char = replacement if replacement else ''
 def __init__(self, session, config, parent):
     """Read the 'char', 'regexp' and 'keep' settings for this pre-parser.

     A truthy 'regexp' setting is compiled into ``self.regexp`` with
     re.S; a falsy one leaves that attribute unassigned here.  A falsy
     'char' setting is stored as the empty string.
     """
     PreParser.__init__(self, session, config, parent)
     char_setting = self.get_setting(session, 'char')
     regexp_setting = self.get_setting(session, 'regexp')
     self.keep = self.get_setting(session, 'keep')
     if regexp_setting:
         self.regexp = re.compile(regexp_setting, re.S)
     # Any falsy value collapses to the empty string.
     self.char = char_setting or ''
Exemple #10
0
    def __init__(self, session, server, config):
        """Initialise the pre-parser and try to connect to a Multivalent server.

        If a 'mvServerPath' path is configured, a server is launched from
        that path on 127.0.0.1 with a randomly chosen free port.  Otherwise
        the 'host' and 'port' settings are used.  In both cases the
        'returnPacking' setting is required; it selects ``self.outMimeType``
        ('text/xml' for 'xml', 'text/plain' otherwise).  A failed initial
        connection is ignored here.  ``self.close_mvServer`` is registered
        to run at interpreter exit.

        Raises:
            ConfigFileException: if 'mvServerPath' is given but does not
                exist, if 'port' is set but is not all digits, or if any
                of 'host', 'port' and 'returnPacking' is missing.
        """
        PreParser.__init__(self, session, server, config)
        self.source_re = re.compile("<open file '(.+?)', mode '.' at .*?>")

        # Settings are checked up front so that a bad configuration fails
        # consistently here rather than half way through execution.
        self.mvServerPath = self.get_path(session, 'mvServerPath')
        if self.mvServerPath:
            # A local path to the server code was given: start a server
            # locally, in local-only mode, on an automatically chosen port.
            if not os.path.exists(self.mvServerPath):
                raise ConfigFileException('Path type="mvServerPath" does not exist')

            host = '127.0.0.1'
            # Probe random ports until one can be bound.
            probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                while True:
                    # Start at 1024: binding port 0 succeeds but leaves
                    # port == 0, which fails the "all settings present"
                    # check further down.
                    port = random.randrange(1024, 10000)
                    try:
                        probe.bind((host, port))
                    except socket.error:
                        continue
                    break
            finally:
                probe.close()
            # NOTE(review): os.popen2 exists only on Python 2.
            mvStdin, mvStdout = os.popen2('java -D64 -Djava.awt.headless=true -Xms40m -Xmx256m -jar %s %d -guess -out xml -link' % (self.mvServerPath, port), 't')

        else:
            # Settings for a remote Multivalent server.
            host = self.get_setting(session, 'host')
            port = self.get_setting(session, 'port')
            # A missing port is reported by the combined check below rather
            # than failing on the .isdigit() call.
            if port and not port.isdigit():
                raise ConfigFileException("'port' setting for Multivalent preParser must be an integer.")

        pack = self.get_setting(session, 'returnPacking')
        if not (host and port and pack):
            raise ConfigFileException("'host', 'port' and 'returnPacking' settings must be set for Multivalent preParser '%s'" % self.id)

        self.mvHost = host
        self.mvPort = int(port)
        self.returnPacking = pack.lower()
        if self.returnPacking == 'xml':
            self.outMimeType = 'text/xml'
        else:
            self.outMimeType = 'text/plain'
        # Initialise and connect to the Multivalent client.
        self.mvClient = MultivalentClient()
        try:
            self.mvClient.connect(self.mvHost, self.mvPort)
        except Exception:
            # Best effort only: connecting is tried again at run time.
            pass
        atexit.register(self.close_mvServer)
Exemple #11
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser, locate its model file and load the model.

     Uses the 'modelPath' path when it is truthy; otherwise the file name
     is this object's id followed by "_ATTRHASH.pickle", inside the
     'defaultPath' path.  ``self.model`` and ``self.lastModTime`` are
     reset before ``load_model`` is called.
     """
     PreParser.__init__(self, session, config, parent)
     # Settings that are needed at this stage.
     self.offset = self.get_setting(session, 'termOffset', 0)
     model_file = self.get_path(session, 'modelPath', None)
     if not model_file:
         default_dir = self.get_path(session, 'defaultPath')
         model_name = self.id + "_ATTRHASH.pickle"
         model_file = os.path.join(default_dir, model_name)
     self.modelPath = model_file
     self.model = {}
     self.lastModTime = 0
     self.load_model(session)
Exemple #12
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser and load its model.

     ``self.modelPath`` is the 'modelPath' path if one is configured,
     else "<id>_ATTRHASH.pickle" joined onto the 'defaultPath' path.
     ``load_model`` is called last, after ``self.model`` has been set to
     an empty dict and ``self.lastModTime`` to 0.
     """
     PreParser.__init__(self, session, config, parent)
     self.offset = self.get_setting(session, 'termOffset', 0)
     configured = self.get_path(session, 'modelPath', None)
     if configured:
         self.modelPath = configured
     else:
         # Fall back to a per-object file under the default path.
         base_dir = self.get_path(session, 'defaultPath')
         self.modelPath = os.path.join(base_dir,
                                       self.id + "_ATTRHASH.pickle")
     self.model = {}
     self.lastModTime = 0
     self.load_model(session)
Exemple #13
0
    def __init__(self, session, node, parent):
        """Initialise the pre-parser and start the external parser process.

        The working directory for the process is the 'executablePath' path
        (or, when that is empty, the directory of whatever ``which``
        reports for the executable) joined with the 'executable' path
        (default argument './parser').  The executable is launched through
        the shell from there and stored on ``self.pipe``.  The caller's
        working directory is restored afterwards, even if launching fails.

        Raises:
            ConfigFileException: if the computed path is empty.
        """
        PreParser.__init__(self, session, node, parent)
        tp = self.get_path(session, 'executablePath', '')
        exe = self.get_path(session, 'executable', './parser')
        if not tp:
            # No directory configured: ask the shell where the executable is.
            tp = commands.getoutput('which %s' % exe)
            tp = os.path.dirname(tp)
        tp = os.path.join(tp, exe)
        if not tp:
            raise ConfigFileException("%s requires the path: filePath" % self.id)
        # NOTE(review): tp includes the executable name yet is used as the
        # working directory below -- confirm that this is intended.
        # The current directory is saved exactly once, before the chdir, so
        # that the finally clause really returns to the caller's directory.
        original_cwd = os.getcwd()
        os.chdir(tp)
        try:
            self.pipe = Popen(exe, shell=True, bufsize=1, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        finally:
            os.chdir(original_cwd)
Exemple #14
0
    def __init__(self, session, config, parent):
        """Initialise the pre-parser and load its model if the file exists.

        A relative 'modelPath' is joined onto the 'defaultPath' path and
        the result is written back to ``self.paths['modelPath']``.  When
        the file does not exist ``self.model`` is set to None instead of
        calling ``load_model``.

        Raises:
            ConfigFileException: if no 'modelPath' path is configured.
        """
        PreParser.__init__(self, session, config, parent)
        self.offset = self.get_setting(session, 'termOffset', 0)
        model_file = self.get_path(session, 'modelPath', '')
        if not model_file:
            raise ConfigFileException("Classification PreParser (%s) does not have a modelPath" % self.id)
        if not os.path.isabs(model_file):
            base_dir = self.get_path(session, 'defaultPath')
            model_file = os.path.join(base_dir, model_file)
            self.paths['modelPath'] = model_file
        if not os.path.exists(model_file):
            self.model = None
        else:
            self.load_model(session, model_file)

        self.renumber = {}
Exemple #15
0
    def __init__(self, session, config, parent):
        """Set up the pre-parser and load the model when its file is present.

        The 'modelPath' path is mandatory.  If it is not absolute it is
        resolved against the 'defaultPath' path and the resolved value is
        stored in ``self.paths['modelPath']``.  A missing file results in
        ``self.model`` being None.

        Raises:
            ConfigFileException: if the 'modelPath' path is empty.
        """
        PreParser.__init__(self, session, config, parent)
        self.offset = self.get_setting(session, 'termOffset', 0)
        location = self.get_path(session, 'modelPath', '')
        if not location:
            message = ("Classification PreParser (%s) does not "
                       "have a modelPath" % self.id)
            raise ConfigFileException(message)
        is_relative = not os.path.isabs(location)
        if is_relative:
            default_dir = self.get_path(session, 'defaultPath')
            location = os.path.join(default_dir, location)
            self.paths['modelPath'] = location
        if os.path.exists(location):
            self.load_model(session, location)
        else:
            self.model = None

        self.renumber = {}
Exemple #16
0
    def __init__(self, session, config, parent):
        """Initialise the pre-parser, its configured objects and sort keys.

        Reads the 'unRenumberPreParser', 'recordStore' and 'index' paths
        and the 'calcRuleLengths', 'calcRankings' and 'sortBy' settings,
        then builds ``self.sortFuncs``, a table mapping a sort name to a
        one-argument function that extracts the value to sort on.
        """
        PreParser.__init__(self, session, config, parent)
        # Which unrenumber preParser to use must be known up front.
        self.unrenumber = self.get_path(session, 'unRenumberPreParser', None)
        self.recordStore = self.get_path(session, 'recordStore', None)
        self.calcRuleLengths = self.get_setting(session, 'calcRuleLengths', 0)
        self.index = self.get_path(session, 'index', None)
        self.calcRankings = self.get_setting(session, 'calcRankings', 0)
        self.sortBy = self.get_setting(session, 'sortBy', '')

        extractors = {}
        extractors['ll'] = lambda x: x.ll
        extractors['surprise'] = lambda x: x.surprise
        extractors['entropy'] = lambda x: x.entropy
        extractors['gini'] = lambda x: x.gini
        extractors['length'] = lambda x: len(x.termids)
        extractors['support'] = lambda x: x.freq
        extractors['totalFreq'] = lambda x: sum(x.freqs)
        self.sortFuncs = extractors
Exemple #17
0
    def __init__(self, session, config, parent):
        """Set up the pre-parser and its table of sort-key functions.

        Three paths ('unRenumberPreParser', 'recordStore', 'index') and
        three settings ('calcRuleLengths', 'calcRankings', 'sortBy') are
        read.  ``self.sortFuncs`` maps each supported sort name to a
        function taking one item and returning the value to sort on.
        """
        PreParser.__init__(self, session, config, parent)
        # The unrenumber preParser to use has to be known here.
        self.unrenumber = self.get_path(session, 'unRenumberPreParser', None)
        self.recordStore = self.get_path(session, 'recordStore', None)
        self.calcRuleLengths = self.get_setting(session, 'calcRuleLengths', 0)
        self.index = self.get_path(session, 'index', None)
        self.calcRankings = self.get_setting(session, 'calcRankings', 0)
        self.sortBy = self.get_setting(session, 'sortBy', '')

        self.sortFuncs = dict([
            ('ll', lambda x: x.ll),
            ('surprise', lambda x: x.surprise),
            ('entropy', lambda x: x.entropy),
            ('gini', lambda x: x.gini),
            ('length', lambda x: len(x.termids)),
            ('support', lambda x: x.freq),
            ('totalFreq', lambda x: sum(x.freqs)),
        ])
Exemple #18
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and read the 'termOffset' setting.

     The setting is requested with 0 as its default argument and stored
     on ``self.offset``.
     """
     PreParser.__init__(self, session, config, parent)
     term_offset = self.get_setting(session, 'termOffset', 0)
     self.offset = term_offset
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and require a configured normalizer.

     The 'normalizer' path is stored on ``self.normalizer``.

     Raises:
         ConfigFileException: if the 'normalizer' path resolves to None.
     """
     PreParser.__init__(self, session, config, parent)
     self.normalizer = self.get_path(session, 'normalizer', None)
     # Identity test: "== None" would dispatch to the object's own __eq__.
     if self.normalizer is None:
         raise ConfigFileException("Normalizer for %s does not exist." % self.id)
Exemple #20
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser, its character patterns and 'strip' setting.

     ``asciiRe`` matches a single character in the range \\x7b-\\xff;
     ``nonxmlRe`` matches a single control character from the listed
     ranges.  'strip' is read with 0 as its default argument.
     """
     PreParser.__init__(self, session, config, parent)
     high_chars = '([\x7b-\xff])'
     control_chars = '([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])'
     self.asciiRe = re.compile(high_chars)
     self.nonxmlRe = re.compile(control_chars)
     self.strip = self.get_setting(session, 'strip', 0)
Exemple #21
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser from the 'useStem' and 'pos' settings.

     Both settings are read with 0 as their default argument.
     ``self.puncre`` matches a space followed by a punctuation mark and
     then a space or newline, capturing everything after the first space.
     """
     PreParser.__init__(self, session, config, parent)
     use_stem = self.get_setting(session, 'useStem', 0)
     self.stem = use_stem
     use_pos = self.get_setting(session, 'pos', 0)
     self.pos = use_pos
     self.puncre = re.compile('[ ]([.,;:?!][ \n])')
    def __init__(self, session, config, parent):
        """Initialise the pre-parser and its character-entity tables.

        Sets up three compiled patterns (numeric references without '#',
        '&fracNM;' references and numeric references 0-31), a table of
        named entities mapped to numeric references, a table of named
        entities mapped to literal text, a table of alternative spellings
        and an ordered list of entity names paired with ``self.start``.
        """
        PreParser.__init__(self, session, config, parent)

        # Raw strings so that \d reaches the regex engine instead of being
        # an invalid string escape; the compiled patterns are unchanged.
        self.numericalEntRe = re.compile(r'&(\d+);')
        self.fractionRe = re.compile(r'&frac(\d)(\d);')
        self.invalidRe = re.compile(r'&#(\d|[0-2]\d|3[01]);')

        # NOTE(review): presumably the code point of entities[0] ('nbsp'),
        # with the list below in code-point order -- confirm against callers.
        self.start = 160
        # Each key appears once.
        # NOTE(review): lsquo, rsquo and rdquo map to '#34' rather than
        # their own code points (#8216, #8217, #8221), while ldquo keeps
        # '#8220' -- confirm that this asymmetry is intended.
        self.otherEntities = {
            "quot": '#34',
            "amp": '#38',
            "lt": '#60',
            "gt": '#62',
            "trade": '#8482',
            "OElig": '#338',
            "oelig": '#339',
            "Scaron": '#352',
            "scaron": '#353',
            "Yuml": '#376',
            "circ": '#710',
            "tilde": '#732',
            "ensp": '#8194',
            "emsp": '#8195',
            "thinsp": '#8201',
            "zwnj": '#8204',
            "zwj": '#8205',
            "lrm": '#8206',
            "rlm": '#8207',
            "ndash": '#8211',
            "mdash": '#8212',
            "lsquo": '#34',
            "rsquo": '#34',
            "sbquo": '#8218',
            "ldquo": '#8220',
            "rdquo": '#34',
            "bdquo": '#8222',
            "dagger": '#8224',
            "Dagger": '#8225',
            "permil": '#8240',
            "lsaquo": '#8249',
            "rsaquo": '#8250',
            "euro": '#8364',
            "half": '#189',
            "ast": '#8727'
        }
        self.inane = {
            "apos": "'",
            "hellip": '...',
            "ldquo": '',
            "lsqb": '[',
            "rsqb": ']',
            "sol": '\\',
            "commat": '@',
            "plus": '+',
            "percnt": '%'
        }

        # NOTE(review): only the "OUML;" key carries a trailing semicolon --
        # verify against the code that consumes this table.
        self.preEntities = {
            "OUML;": "Ouml",
            "UUML": "Uuml",
            "AELIG": "AElig",
            "Aelig": "AElig"
        }
        self.entities = [
            'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar',
            'sect', 'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg',
            'macr', 'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro', 'para',
            'middot', 'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12',
            'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml',
            'Aring', 'AElig', 'Ccedil', 'Egrave', 'Eacute', 'Ecirc', 'Euml',
            'Igrave', 'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
            'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times', 'Oslash', 'Ugrave',
            'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig', 'agrave',
            'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig', 'ccedil',
            'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc',
            'iuml', 'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde',
            'ouml', 'divide', 'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml',
            'yacute', 'thorn', 'yuml'
        ]
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and read the 'codec' setting.

     The setting is requested with 'utf-8' as its default argument and
     stored on ``self.codec``.
     """
     PreParser.__init__(self, session, config, parent)
     codec_name = self.get_setting(session, 'codec', 'utf-8')
     self.codec = codec_name
Exemple #24
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser with its ampersand pattern and entity table.

     ``self.amp_re`` matches '&' followed by any run of characters other
     than whitespace and ';', then whitespace or end of input.
     ``self.entities`` starts out as an empty dict.
     """
     PreParser.__init__(self, session, config, parent)
     # Raw string: \s is a regex escape, not a valid string escape.
     self.amp_re = re.compile(r'&([^\s;]*)(\s|$)')
     self.entities = {}
Exemple #25
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and require a configured normalizer.

     The 'normalizer' path is stored on ``self.normalizer``.

     Raises:
         ConfigFileException: if the 'normalizer' path resolves to None.
     """
     PreParser.__init__(self, session, config, parent)
     normalizer = self.get_path(session, 'normalizer', None)
     self.normalizer = normalizer
     if normalizer is not None:
         return
     raise ConfigFileException(
         "Normalizer for {0} does not exist.".format(self.id))
Exemple #26
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser and store the 'termOffset' setting.

     ``self.offset`` receives the setting's value; 0 is passed as the
     default argument.
     """
     PreParser.__init__(self, session, config, parent)
     # Needed at this stage, so read eagerly.
     configured_offset = self.get_setting(session, 'termOffset', 0)
     self.offset = configured_offset
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and require a configured normalizer.

     The 'normalizer' path is stored on ``self.normalizer``.

     Raises:
         ConfigFileException: if the 'normalizer' path resolves to None.
     """
     PreParser.__init__(self, session, config, parent)
     self.normalizer = self.get_path(session, 'normalizer', None)
     # "is None" tests identity; "== None" would call the object's __eq__.
     if self.normalizer is None:
         raise ConfigFileException("Normalizer for %s does not exist." %
                                   self.id)
Exemple #28
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser and store the 'codec' setting.

     ``self.codec`` receives the setting's value; 'utf-8' is passed as
     the default argument.
     """
     PreParser.__init__(self, session, config, parent)
     read_setting = self.get_setting
     self.codec = read_setting(session, 'codec', 'utf-8')
Exemple #29
0
    def __init__(self, session, config, parent):
        """Initialise the pre-parser and its character-entity tables.

        Sets up three compiled patterns (numeric references without '#',
        '&fracNM;' references and numeric references 0-31), a table of
        named entities mapped to numeric references, a table of named
        entities mapped to literal text, a table of alternative spellings
        and an ordered list of entity names paired with ``self.start``.
        """
        PreParser.__init__(self, session, config, parent)
        # Raw strings so that \d reaches the regex engine instead of being
        # an invalid string escape; the compiled patterns are unchanged.
        self.numericalEntRe = re.compile(r'&(\d+);')
        self.fractionRe = re.compile(r'&frac(\d)(\d);')
        self.invalidRe = re.compile(r'&#(\d|[0-2]\d|3[01]);')
        # NOTE(review): presumably the code point of entities[0] ('nbsp'),
        # with the list below in code-point order -- confirm against callers.
        self.start = 160
        # Each key appears once.
        # NOTE(review): lsquo, rsquo and rdquo map to '#34' rather than
        # their own code points (#8216, #8217, #8221), while ldquo keeps
        # '#8220' -- confirm that this asymmetry is intended.
        self.otherEntities = {
            "quot": '#34',
            "amp": '#38',
            "lt": '#60',
            "gt": '#62',
            "trade": '#8482',
            "OElig": '#338',
            "oelig": '#339',
            "Scaron": '#352',
            "scaron": '#353',
            "Yuml": '#376',
            "circ": '#710',
            "tilde": '#732',
            "ensp": '#8194',
            "emsp": '#8195',
            "thinsp": '#8201',
            "zwnj": '#8204',
            "zwj": '#8205',
            "lrm": '#8206',
            "rlm": '#8207',
            "ndash": '#8211',
            "mdash": '#8212',
            "lsquo": '#34',
            "rsquo": '#34',
            "sbquo": '#8218',
            "ldquo": '#8220',
            "rdquo": '#34',
            "bdquo": '#8222',
            "dagger": '#8224',
            "Dagger": '#8225',
            "permil": '#8240',
            "lsaquo": '#8249',
            "rsaquo": '#8250',
            "euro": '#8364',
            "half": '#189',
            "ast": '#8727'
        }
        self.inane = {
            "apos": "'",
            "hellip": '...',
            "ldquo": '',
            "lsqb": '[',
            "rsqb": ']',
            "sol": '\\',
            "commat": '@',
            "plus": '+',
            "percnt": '%'
        }

        # NOTE(review): only the "OUML;" key carries a trailing semicolon --
        # verify against the code that consumes this table.
        self.preEntities = {
            "OUML;": "Ouml",
            "UUML": "Uuml",
            "AELIG": "AElig",
            "Aelig": "AElig"
        }
        self.entities = [
            'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar',
            'sect', 'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg',
            'macr', 'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro', 'para',
            'middot', 'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12',
            'frac34', 'iquest', 'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml',
            'Aring', 'AElig', 'Ccedil', 'Egrave', 'Eacute', 'Ecirc', 'Euml',
            'Igrave', 'Iacute', 'Icirc', 'Iuml', 'ETH', 'Ntilde', 'Ograve',
            'Oacute', 'Ocirc', 'Otilde', 'Ouml', 'times', 'Oslash', 'Ugrave',
            'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'THORN', 'szlig', 'agrave',
            'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig', 'ccedil',
            'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc',
            'iuml', 'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde',
            'ouml', 'divide', 'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml',
            'yacute', 'thorn', 'yuml'
        ]
    def __init__(self, session, config, parent):
        """Initialise the pre-parser with its ampersand pattern and entity table.

        ``self.amp_re`` matches '&' followed by any run of characters other
        than whitespace and ';', then whitespace or end of input.
        ``self.entities`` starts out as an empty dict.
        """
        PreParser.__init__(self, session, config, parent)
        # Raw string: \s is a regex escape, not a valid string escape.
        self.amp_re = re.compile(r'&([^\s;]*)(\s|$)')
        # Indented with spaces like the rest of the method; a tab here is
        # rejected by Python 3.
        self.entities = {}
Exemple #31
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser's 'useStem' and 'pos' flags and its pattern.

     Each flag is read with 0 as the default argument.  The compiled
     pattern in ``self.puncre`` looks for a space, then one of . , ; : ? !
     and then a space or newline.
     """
     PreParser.__init__(self, session, config, parent)
     read_setting = self.get_setting
     self.stem = read_setting(session, 'useStem', 0)
     self.pos = read_setting(session, 'pos', 0)
     punctuation_pattern = '[ ]([.,;:?!][ \n])'
     self.puncre = re.compile(punctuation_pattern)
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and read the 'compressLevel' setting.

     The setting is requested with 1 as its default argument and stored
     on ``self.compressLevel``.
     """
     PreParser.__init__(self, session, config, parent)
     level = self.get_setting(session, "compressLevel", 1)
     self.compressLevel = level
Exemple #33
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser; a 'normalizer' path must be configured.

     Whatever the 'normalizer' path resolves to is kept on
     ``self.normalizer``.

     Raises:
         ConfigFileException: when that value is None.
     """
     PreParser.__init__(self, session, config, parent)
     self.normalizer = self.get_path(session, 'normalizer', None)
     has_normalizer = self.normalizer is not None
     if not has_normalizer:
         template = "Normalizer for {0} does not exist."
         raise ConfigFileException(template.format(self.id))
 def __init__(self, session, config, parent):
     """Set up the pre-parser's two character patterns and 'strip' flag.

     ``asciiRe`` captures one character between \\x7b and \\xff;
     ``nonxmlRe`` captures one control character from the listed ranges.
     The 'strip' setting is requested with 0 as its default argument.
     """
     PreParser.__init__(self, session, config, parent)
     compile_pattern = re.compile
     self.asciiRe = compile_pattern('([\x7b-\xff])')
     self.nonxmlRe = compile_pattern(
         '([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])')
     strip_flag = self.get_setting(session, 'strip', 0)
     self.strip = strip_flag
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and read its MIME type settings.

     'inMimeType' and 'outMimeType' are each requested with the empty
     string as the default argument.
     """
     PreParser.__init__(self, session, config, parent)
     mime_in = self.get_setting(session, 'inMimeType', '')
     self.inMimeType = mime_in
     mime_out = self.get_setting(session, 'outMimeType', '')
     self.outMimeType = mime_out
Exemple #36
0
 def __init__(self, session, config, parent):
     """Initialise the pre-parser and read its threshold settings.

     Reads 'support' (default argument 10.0), 'absoluteSupport' (default
     argument 0) and 'confidence' (default argument 0.0).
     """
     PreParser.__init__(self, session, config, parent)
     read_setting = self.get_setting
     self.support = read_setting(session, 'support', 10.0)
     self.absSupport = read_setting(session, 'absoluteSupport', 0)
     self.confidence = read_setting(session, 'confidence', 0.0)
Exemple #37
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser's input and output MIME types.

     Both come from settings ('inMimeType', 'outMimeType') requested
     with '' as the default argument.
     """
     PreParser.__init__(self, session, config, parent)
     read_setting = self.get_setting
     self.inMimeType = read_setting(session, 'inMimeType', '')
     self.outMimeType = read_setting(session, 'outMimeType', '')
Exemple #38
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser's support and confidence thresholds.

     Three settings are read in order: 'support' with default argument
     10.0, 'absoluteSupport' with 0 and 'confidence' with 0.0.
     """
     PreParser.__init__(self, session, config, parent)
     support = self.get_setting(session, 'support', 10.0)
     self.support = support
     absolute_support = self.get_setting(session, 'absoluteSupport', 0)
     self.absSupport = absolute_support
     confidence = self.get_setting(session, 'confidence', 0.0)
     self.confidence = confidence
Exemple #39
0
 def __init__(self, session, config, parent):
     """Set up the pre-parser and store the 'compressLevel' setting.

     ``self.compressLevel`` receives the setting's value; 1 is passed as
     the default argument.
     """
     PreParser.__init__(self, session, config, parent)
     read_setting = self.get_setting
     self.compressLevel = read_setting(session, "compressLevel", 1)