def __init__(self, session, config, parent):
    """Read the ``sumType`` setting and verify hashlib supports it.

    The setting defaults to 'md5'.  If ``hashlib.new`` rejects the name
    with ValueError, a ConfigFileException carrying the same message is
    raised instead.
    """
    PreParser.__init__(self, session, config, parent)
    algorithm = self.get_setting(session, 'sumType', 'md5')
    self.sumType = algorithm
    try:
        # Built only as a validity probe; the hash object is discarded.
        hashlib.new(algorithm)
    except ValueError as err:
        raise ConfigFileException(str(err))
def __init__(self, session, node, parent):
    """Locate the external parser executable and start it as a subprocess.

    Paths read: ``executablePath`` (default '') and ``executable``
    (default './parser').  When no ``executablePath`` is configured the
    directory is derived from the output of ``which <executable>``.

    The process is launched from within the resolved location and the
    resulting ``Popen`` object is stored on ``self.pipe``.  The working
    directory that was current on entry is always restored afterwards.

    Raises ConfigFileException if no location could be determined.
    """
    PreParser.__init__(self, session, node, parent)
    tp = self.get_path(session, 'executablePath', '')
    exe = self.get_path(session, 'executable', './parser')
    # NOTE(review): the nesting below was reconstructed from flattened
    # source; confirm which statements belong under ``if not tp``.
    if not tp:
        # No directory configured: ask the shell where the executable is.
        tp = commands.getoutput('which %s' % exe)
        tp = os.path.dirname(tp)
    tp = os.path.join(tp, exe)
    if not tp:
        raise ConfigFileException("%s requires the path: filePath" % self.id)
    # Capture the caller's working directory exactly once.  Previously the
    # getcwd()/chdir() pair was executed twice, so the saved value was
    # overwritten with the *new* directory and the final chdir() never
    # returned to where we started.
    startDir = os.getcwd()
    os.chdir(tp)
    try:
        self.pipe = Popen(exe, shell=True, bufsize=1,
                          stdin=PIPE, stdout=PIPE, stderr=PIPE)
    finally:
        # Restore the working directory even if the launch fails.
        os.chdir(startDir)
def __init__(self, session, config, parent):
    """Pre-compile the markup patterns used by this PreParser.

    Patterns are stored for the <body>, <title>, <script> and <style>
    elements, for comments, and for any tag at all (``tagstrip``).
    """
    PreParser.__init__(self, session, config, parent)
    # Element patterns span newlines and ignore case.
    flags = re.S | re.I
    self.body = re.compile('<body(.*?)</body>', flags)
    self.tagstrip = re.compile('<[^>]+>')
    self.title = re.compile('<title[^>]*>(.+?)</title>', flags)
    self.script = re.compile('<script(.*?)</script>', flags)
    self.style = re.compile('<style(.*?)</style>', flags)
    self.comment = re.compile('<!--(.*?)-->', flags)
def __init__(self, session, config, parent):
    """Compile the markup patterns and read the ``emptyElements`` setting.

    Compiled patterns: ``doctype_re`` (DOCTYPE declaration), ``attr_re``
    (unquoted attribute value), ``pi_re`` (processing instruction),
    ``elem_re`` (element open/close name) and ``amp_re`` (ampersand
    followed by whitespace).

    If ``emptyElements`` is set, its whitespace-separated names are stored
    as the list ``self.emptyTags``; otherwise the attribute is left alone.
    """
    PreParser.__init__(self, session, config, parent)
    # Raw strings: ``\s`` and ``\?`` are regex escapes, not Python string
    # escapes.  The compiled pattern text is unchanged.
    self.doctype_re = re.compile(r'''<!DOCTYPE\s+?(.+?)["'](.+?)["']>''')
    self.attr_re = re.compile(
        ' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])')
    self.pi_re = re.compile(r"<\?(.*?)\?>")
    self.elem_re = re.compile('(<[/]?)([a-zA-Z0-9_]+)')
    self.amp_re = re.compile(r'&(\s)')
    taglist = self.get_setting(session, 'emptyElements')
    if taglist:
        self.emptyTags = taglist.split()
def __init__(self, session, config, parent):
    """Set up the regular expressions and optional empty-element list.

    Stores compiled patterns for DOCTYPE declarations (``doctype_re``),
    unquoted attribute values (``attr_re``), processing instructions
    (``pi_re``), element names (``elem_re``) and an ampersand followed by
    whitespace (``amp_re``).

    When the ``emptyElements`` setting has a value it is split on
    whitespace and stored as ``self.emptyTags``.
    """
    PreParser.__init__(self, session, config, parent)
    # Regex escapes such as ``\s`` and ``\?`` are invalid *string* escapes,
    # so these literals are raw strings.  Pattern text is identical.
    self.doctype_re = re.compile(r'''<!DOCTYPE\s+?(.+?)["'](.+?)["']>''')
    self.attr_re = re.compile(
        ' ([a-zA-Z0-9_]+)[ ]*=[ ]*([-:_.a-zA-Z0-9]+)([ >])')
    self.pi_re = re.compile(r"<\?(.*?)\?>")
    self.elem_re = re.compile('(<[/]?)([a-zA-Z0-9_]+)')
    self.amp_re = re.compile(r'&(\s)')
    taglist = self.get_setting(session, 'emptyElements')
    if taglist:
        self.emptyTags = taglist.split()
def __init__(self, session, config, parent):
    """Read the ``char``, ``regexp`` and ``keep`` settings.

    ``self.regexp`` is compiled (with re.S) only when a ``regexp`` setting
    is present.  ``self.char`` falls back to the empty string when the
    ``char`` setting is missing or empty.
    """
    PreParser.__init__(self, session, config, parent)
    replacement = self.get_setting(session, 'char')
    pattern = self.get_setting(session, 'regexp')
    self.keep = self.get_setting(session, 'keep')
    if pattern:
        self.regexp = re.compile(pattern, re.S)
    self.char = replacement or ''
def __init__(self, session, server, config):
    """Configure the connection to a Multivalent server.

    Two modes, selected by the ``mvServerPath`` path:

    * set: the jar at that path is launched locally with ``java`` on
      127.0.0.1 using a randomly chosen free port;
    * unset: the ``host`` and ``port`` settings identify a remote server.

    In both modes ``returnPacking`` is required and decides
    ``self.outMimeType`` ('text/xml' for 'xml', otherwise 'text/plain').
    A connection attempt is made immediately; failure is ignored here so
    that it can be retried at run time.  ``self.close_mvServer`` is
    registered with ``atexit``.

    Raises ConfigFileException if ``mvServerPath`` does not exist, if
    ``port`` is not numeric, or if any of host/port/returnPacking is
    missing.
    """
    PreParser.__init__(self, session, server, config)
    self.source_re = re.compile("<open file '(.+?)', mode '.' at .*?>")
    # get settings from config
    # Az: Check existence of settings and fail consistently rather than
    # die half way through execution
    self.mvServerPath = self.get_path(session, 'mvServerPath')
    if self.mvServerPath:
        # A local path to the server code was given: start a server
        # locally on an automatically chosen port, in local-only mode.
        if not os.path.exists(self.mvServerPath):
            raise ConfigFileException(
                'Path type="mvServerPath" does not exist')
        host = '127.0.0.1'
        # Probe for a free port by binding a throwaway socket.  The range
        # starts at 1024: port 0 would "succeed" (the OS substitutes an
        # arbitrary port) yet be passed on as 0, and ports below 1024 are
        # normally privileged.
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            while True:
                port = random.randrange(1024, 10000)
                try:
                    probe.bind((host, port))
                except socket.error:
                    # Port in use or not permitted; try another.
                    continue
                break
        finally:
            probe.close()
        mvStdin, mvStdout = os.popen2(
            'java -D64 -Djava.awt.headless=true -Xms40m -Xmx256m '
            '-jar %s %d -guess -out xml -link' % (self.mvServerPath, port),
            't')
    else:
        # get settings for remote mv server
        host = self.get_setting(session, 'host')
        port = self.get_setting(session, 'port')
        # Only validate the format when a value is present; a missing
        # port is reported by the combined check below rather than by an
        # AttributeError on None.
        if port and not port.isdigit():
            raise ConfigFileException(
                "'port' setting for Multivalent preParser must be an "
                "integer.")
    pack = self.get_setting(session, 'returnPacking')
    if not (host and port and pack):
        raise ConfigFileException(
            "'host', 'port' and 'returnPacking' settings must be set for "
            "Multivalent preParser '%s'" % self.id)
    self.mvHost = host
    self.mvPort = int(port)
    self.returnPacking = pack.lower()
    if self.returnPacking == 'xml':
        self.outMimeType = 'text/xml'
    else:
        self.outMimeType = 'text/plain'
    # initialise and connect to multivalent client
    self.mvClient = MultivalentClient()
    try:
        self.mvClient.connect(self.mvHost, self.mvPort)
    except Exception:
        # Deliberately best-effort: connection is retried at run time.
        pass
    atexit.register(self.close_mvServer)
def __init__(self, session, config, parent):
    """Resolve the model file location and load the model.

    ``modelPath`` is used when configured; otherwise the file is
    ``<self.id>_ATTRHASH.pickle`` inside ``defaultPath``.  The model
    starts out as an empty dict with a modification time of 0 before
    ``load_model`` is called.
    """
    PreParser.__init__(self, session, config, parent)
    # Some settings that are needed at this stage
    self.offset = self.get_setting(session, 'termOffset', 0)
    modelFile = self.get_path(session, 'modelPath', None)
    if not modelFile:
        baseDir = self.get_path(session, 'defaultPath')
        modelFile = os.path.join(baseDir, self.id + "_ATTRHASH.pickle")
    self.modelPath = modelFile
    self.model = {}
    self.lastModTime = 0
    self.load_model(session)
def __init__(self, session, config, parent):
    """Resolve ``modelPath`` and load the model if the file exists.

    A relative ``modelPath`` is joined onto ``defaultPath`` and the
    resolved value is written back into ``self.paths``.  When no file
    exists at the resolved location ``self.model`` is set to None.

    Raises ConfigFileException when ``modelPath`` is not configured.
    """
    PreParser.__init__(self, session, config, parent)
    self.offset = self.get_setting(session, 'termOffset', 0)
    location = self.get_path(session, 'modelPath', '')
    if not location:
        message = ("Classification PreParser (%s) does not have a modelPath"
                   % self.id)
        raise ConfigFileException(message)
    if not os.path.isabs(location):
        baseDir = self.get_path(session, 'defaultPath')
        location = os.path.join(baseDir, location)
        self.paths['modelPath'] = location
    if not os.path.exists(location):
        self.model = None
    else:
        # load model
        self.load_model(session, location)
    self.renumber = {}
def __init__(self, session, config, parent):
    """Work out where the model lives and load it when present.

    ``modelPath`` is mandatory.  If it is relative it is resolved against
    ``defaultPath`` and the absolute form is stored in ``self.paths``.
    An existing file is handed to ``load_model``; otherwise
    ``self.model`` is None.

    Raises ConfigFileException when ``modelPath`` is not configured.
    """
    PreParser.__init__(self, session, config, parent)
    self.offset = self.get_setting(session, 'termOffset', 0)
    target = self.get_path(session, 'modelPath', '')
    if not target:
        raise ConfigFileException(
            "Classification PreParser (%s) does not have a modelPath"
            % self.id)
    if not os.path.isabs(target):
        target = os.path.join(self.get_path(session, 'defaultPath'), target)
        self.paths['modelPath'] = target
    modelExists = os.path.exists(target)
    if modelExists:
        # load model
        self.load_model(session, target)
    else:
        self.model = None
    self.renumber = {}
def __init__(self, session, config, parent):
    """Read the configured paths/settings and build the sort-key table.

    ``self.sortFuncs`` maps each supported ``sortBy`` name to a function
    that extracts the corresponding sort key from a rule-like object.
    """
    PreParser.__init__(self, session, config, parent)
    # need to know which unrenumber preParser to use
    self.unrenumber = self.get_path(session, 'unRenumberPreParser', None)
    self.recordStore = self.get_path(session, 'recordStore', None)
    self.calcRuleLengths = self.get_setting(session, 'calcRuleLengths', 0)
    self.index = self.get_path(session, 'index', None)
    self.calcRankings = self.get_setting(session, 'calcRankings', 0)
    self.sortBy = self.get_setting(session, 'sortBy', '')
    keyFuncs = {}
    keyFuncs['ll'] = lambda rule: rule.ll
    keyFuncs['surprise'] = lambda rule: rule.surprise
    keyFuncs['entropy'] = lambda rule: rule.entropy
    keyFuncs['gini'] = lambda rule: rule.gini
    keyFuncs['length'] = lambda rule: len(rule.termids)
    keyFuncs['support'] = lambda rule: rule.freq
    keyFuncs['totalFreq'] = lambda rule: sum(rule.freqs)
    self.sortFuncs = keyFuncs
def __init__(self, session, config, parent):
    """Load configuration and prepare the table of sort-key extractors.

    Each entry of ``self.sortFuncs`` takes one object and returns the
    value to sort on for that ``sortBy`` name.
    """
    PreParser.__init__(self, session, config, parent)
    # need to know which unrenumber preParser to use
    self.unrenumber = self.get_path(session, 'unRenumberPreParser', None)
    self.recordStore = self.get_path(session, 'recordStore', None)
    self.calcRuleLengths = self.get_setting(session, 'calcRuleLengths', 0)
    self.index = self.get_path(session, 'index', None)
    self.calcRankings = self.get_setting(session, 'calcRankings', 0)
    self.sortBy = self.get_setting(session, 'sortBy', '')
    self.sortFuncs = dict([
        ('ll', lambda item: item.ll),
        ('surprise', lambda item: item.surprise),
        ('entropy', lambda item: item.entropy),
        ('gini', lambda item: item.gini),
        ('length', lambda item: len(item.termids)),
        ('support', lambda item: item.freq),
        ('totalFreq', lambda item: sum(item.freqs)),
    ])
def __init__(self, session, config, parent):
    """Initialise the base PreParser and read ``termOffset`` (default 0)."""
    PreParser.__init__(self, session, config, parent)
    # Some settings that are needed at this stage
    termOffset = self.get_setting(session, 'termOffset', 0)
    self.offset = termOffset
def __init__(self, session, config, parent):
    """Resolve the mandatory ``normalizer`` path.

    Raises ConfigFileException if ``get_path`` returns None for it.
    """
    PreParser.__init__(self, session, config, parent)
    self.normalizer = self.get_path(session, 'normalizer', None)
    # Identity test: ``== None`` would dispatch to the resolved object's
    # own ``__eq__``, which is not what is being asked here.
    if self.normalizer is None:
        raise ConfigFileException("Normalizer for %s does not exist."
                                  % self.id)
def __init__(self, session, config, parent):
    """Compile the character-class patterns and read ``strip`` (default 0).

    ``asciiRe`` captures a single character in the range \\x7b-\\xff;
    ``nonxmlRe`` captures a single control character in \\x00-\\x08,
    \\x0B, \\x0C or \\x0E-\\x1F.
    """
    PreParser.__init__(self, session, config, parent)
    highChars = '([\x7b-\xff])'
    controlChars = '([\x00-\x08]|[\x0E-\x1F]|[\x0B\x0C\x1F])'
    self.asciiRe = re.compile(highChars)
    self.nonxmlRe = re.compile(controlChars)
    self.strip = self.get_setting(session, 'strip', 0)
def __init__(self, session, config, parent):
    """Read the ``useStem`` and ``pos`` settings and compile ``puncre``.

    ``puncre`` matches a space followed by one of ``. , ; : ? !`` and
    then a space or newline, capturing everything after the leading space.
    """
    PreParser.__init__(self, session, config, parent)
    useStem = self.get_setting(session, 'useStem', 0)
    usePos = self.get_setting(session, 'pos', 0)
    self.stem = useStem
    self.pos = usePos
    self.puncre = re.compile('[ ]([.,;:?!][ \n])')
def __init__(self, session, config, parent):
    """Build the regular expressions and entity lookup tables.

    Attributes set:

    * ``numericalEntRe``, ``fractionRe``, ``invalidRe`` - compiled
      patterns for ``&<digits>;``, ``&frac<d><d>;`` and ``&#0;``-``&#31;``.
    * ``otherEntities`` - entity name -> ``'#<number>'`` reference.
    * ``inane`` - entity name -> literal replacement text.
    * ``preEntities`` - mis-cased entity spellings -> canonical name.
    * ``entities`` / ``start`` - ordered entity names and the number 160.
    """
    PreParser.__init__(self, session, config, parent)
    # Raw strings: ``\d`` is a regex escape, not a valid string escape.
    self.numericalEntRe = re.compile(r'&(\d+);')
    self.fractionRe = re.compile(r'&frac(\d)(\d);')
    self.invalidRe = re.compile(r'&#(\d|[0-2]\d|3[01]);')
    self.start = 160
    # The original literal listed lsquo, rsquo and rdquo twice (first as
    # #8216/#8217/#8221, later as #34).  In a dict literal the last value
    # wins, so only the effective '#34' mapping is spelled out here.
    self.otherEntities = {
        "quot": '#34',
        "amp": '#38',
        "lt": '#60',
        "gt": '#62',
        "trade": '#8482',
        "OElig": '#338',
        "oelig": '#339',
        "Scaron": '#352',
        "scaron": '#353',
        "Yuml": '#376',
        "circ": '#710',
        "tilde": '#732',
        "ensp": '#8194',
        "emsp": '#8195',
        "thinsp": '#8201',
        "zwnj": '#8204',
        "zwj": '#8205',
        "lrm": '#8206',
        "rlm": '#8207',
        "ndash": '#8211',
        "mdash": '#8212',
        "lsquo": '#34',
        "rsquo": '#34',
        "sbquo": '#8218',
        "ldquo": '#8220',
        "rdquo": '#34',
        "bdquo": '#8222',
        "dagger": '#8224',
        "Dagger": '#8225',
        "permil": '#8240',
        "lsaquo": '#8249',
        "rsaquo": '#8250',
        "euro": '#8364',
        "half": '#189',
        "ast": '#8727',
    }
    self.inane = {
        "apos": "'",
        "hellip": '...',
        "ldquo": '',
        "lsqb": '[',
        "rsqb": ']',
        "sol": '\\',
        "commat": '@',
        "plus": '+',
        "percnt": '%',
    }
    # NOTE(review): "OUML;" is the only key with a trailing semicolon -
    # confirm whether that is intentional.
    self.preEntities = {
        "OUML;": "Ouml",
        "UUML": "Uuml",
        "AELIG": "AElig",
        "Aelig": "AElig",
    }
    # Order matters: presumably entities[i] corresponds to character
    # number self.start + i - TODO confirm against the code that uses it.
    self.entities = [
        'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect',
        'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr',
        'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
        'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12', 'frac34',
        'iquest',
        'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml', 'Aring', 'AElig',
        'Ccedil',
        'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave', 'Iacute', 'Icirc',
        'Iuml',
        'ETH', 'Ntilde', 'Ograve', 'Oacute', 'Ocirc', 'Otilde', 'Ouml',
        'times',
        'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'THORN',
        'szlig',
        'agrave', 'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig',
        'ccedil',
        'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc',
        'iuml',
        'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde', 'ouml',
        'divide',
        'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml', 'yacute', 'thorn',
        'yuml',
    ]
def __init__(self, session, config, parent):
    """Initialise the base PreParser and read ``codec`` (default 'utf-8')."""
    PreParser.__init__(self, session, config, parent)
    codecName = self.get_setting(session, 'codec', 'utf-8')
    self.codec = codecName
def __init__(self, session, config, parent):
    """Compile ``amp_re`` and start with an empty ``entities`` mapping.

    ``amp_re`` matches an ampersand, then any run of characters that are
    neither whitespace nor ';', followed by whitespace or end of input.
    """
    PreParser.__init__(self, session, config, parent)
    # Raw string: ``\s`` is a regex escape, not a valid string escape.
    # The compiled pattern text is unchanged.
    self.amp_re = re.compile(r'&([^\s;]*)(\s|$)')
    self.entities = {}
def __init__(self, session, config, parent):
    """Resolve the mandatory ``normalizer`` path.

    Raises ConfigFileException if ``get_path`` returns None for it.
    """
    PreParser.__init__(self, session, config, parent)
    resolved = self.get_path(session, 'normalizer', None)
    self.normalizer = resolved
    if resolved is not None:
        return
    raise ConfigFileException(
        "Normalizer for {0} does not exist.".format(self.id))
def __init__(self, session, config, parent):
    """Build the regular expressions and entity lookup tables.

    Attributes set:

    * ``numericalEntRe``, ``fractionRe``, ``invalidRe`` - compiled
      patterns for ``&<digits>;``, ``&frac<d><d>;`` and ``&#0;``-``&#31;``.
    * ``otherEntities`` - entity name -> ``'#<number>'`` reference.
    * ``inane`` - entity name -> literal replacement text.
    * ``preEntities`` - mis-cased entity spellings -> canonical name.
    * ``entities`` / ``start`` - ordered entity names and the number 160.
    """
    PreParser.__init__(self, session, config, parent)
    # Raw strings: ``\d`` is a regex escape, not a valid string escape.
    self.numericalEntRe = re.compile(r'&(\d+);')
    self.fractionRe = re.compile(r'&frac(\d)(\d);')
    self.invalidRe = re.compile(r'&#(\d|[0-2]\d|3[01]);')
    self.start = 160
    # The original literal listed lsquo, rsquo and rdquo twice (first as
    # #8216/#8217/#8221, later as #34).  In a dict literal the last value
    # wins, so only the effective '#34' mapping is spelled out here.
    self.otherEntities = {
        "quot": '#34',
        "amp": '#38',
        "lt": '#60',
        "gt": '#62',
        "trade": '#8482',
        "OElig": '#338',
        "oelig": '#339',
        "Scaron": '#352',
        "scaron": '#353',
        "Yuml": '#376',
        "circ": '#710',
        "tilde": '#732',
        "ensp": '#8194',
        "emsp": '#8195',
        "thinsp": '#8201',
        "zwnj": '#8204',
        "zwj": '#8205',
        "lrm": '#8206',
        "rlm": '#8207',
        "ndash": '#8211',
        "mdash": '#8212',
        "lsquo": '#34',
        "rsquo": '#34',
        "sbquo": '#8218',
        "ldquo": '#8220',
        "rdquo": '#34',
        "bdquo": '#8222',
        "dagger": '#8224',
        "Dagger": '#8225',
        "permil": '#8240',
        "lsaquo": '#8249',
        "rsaquo": '#8250',
        "euro": '#8364',
        "half": '#189',
        "ast": '#8727',
    }
    self.inane = {
        "apos": "'",
        "hellip": '...',
        "ldquo": '',
        "lsqb": '[',
        "rsqb": ']',
        "sol": '\\',
        "commat": '@',
        "plus": '+',
        "percnt": '%',
    }
    # NOTE(review): "OUML;" is the only key with a trailing semicolon -
    # confirm whether that is intentional.
    self.preEntities = {
        "OUML;": "Ouml",
        "UUML": "Uuml",
        "AELIG": "AElig",
        "Aelig": "AElig",
    }
    # Order matters: presumably entities[i] corresponds to character
    # number self.start + i - TODO confirm against the code that uses it.
    self.entities = [
        'nbsp', 'iexcl', 'cent', 'pound', 'curren', 'yen', 'brvbar', 'sect',
        'uml', 'copy', 'ordf', 'laquo', 'not', 'shy', 'reg', 'macr',
        'deg', 'plusmn', 'sup2', 'sup3', 'acute', 'micro', 'para', 'middot',
        'cedil', 'sup1', 'ordm', 'raquo', 'frac14', 'frac12', 'frac34',
        'iquest',
        'Agrave', 'Aacute', 'Acirc', 'Atilde', 'Auml', 'Aring', 'AElig',
        'Ccedil',
        'Egrave', 'Eacute', 'Ecirc', 'Euml', 'Igrave', 'Iacute', 'Icirc',
        'Iuml',
        'ETH', 'Ntilde', 'Ograve', 'Oacute', 'Ocirc', 'Otilde', 'Ouml',
        'times',
        'Oslash', 'Ugrave', 'Uacute', 'Ucirc', 'Uuml', 'Yacute', 'THORN',
        'szlig',
        'agrave', 'aacute', 'acirc', 'atilde', 'auml', 'aring', 'aelig',
        'ccedil',
        'egrave', 'eacute', 'ecirc', 'euml', 'igrave', 'iacute', 'icirc',
        'iuml',
        'eth', 'ntilde', 'ograve', 'oacute', 'ocirc', 'otilde', 'ouml',
        'divide',
        'oslash', 'ugrave', 'uacute', 'ucirc', 'uuml', 'yacute', 'thorn',
        'yuml',
    ]
def __init__(self, session, config, parent):
    """Initialise the base PreParser and read ``compressLevel`` (default 1)."""
    PreParser.__init__(self, session, config, parent)
    level = self.get_setting(session, "compressLevel", 1)
    self.compressLevel = level
def __init__(self, session, config, parent):
    """Read the ``inMimeType`` and ``outMimeType`` settings.

    Both default to the empty string when not configured.
    """
    PreParser.__init__(self, session, config, parent)
    incoming = self.get_setting(session, 'inMimeType', '')
    outgoing = self.get_setting(session, 'outMimeType', '')
    self.inMimeType = incoming
    self.outMimeType = outgoing
def __init__(self, session, config, parent):
    """Read the support and confidence thresholds from settings.

    Defaults: ``support`` 10.0, ``absoluteSupport`` 0, ``confidence`` 0.0.
    """
    PreParser.__init__(self, session, config, parent)
    minSupport = self.get_setting(session, 'support', 10.0)
    absoluteSupport = self.get_setting(session, 'absoluteSupport', 0)
    minConfidence = self.get_setting(session, 'confidence', 0.0)
    self.support = minSupport
    self.absSupport = absoluteSupport
    self.confidence = minConfidence