def _handleLxmlFlow(self, node):
    """Translate the element children of an lxml workflow node into code.

    Each child element is mapped to one or more lines of Python source.
    Children of block-like elements (for-each, try, except, else) are
    translated recursively and prefixed with one space of indentation.

    Returns a list of source lines (strings).

    Raises ConfigFileException when an assign element lacks its 'to' or
    'from' attribute, or when no handler exists for an element name.
    """
    # Elements that open a block: header line, then the indented children
    blockHeaders = {
        "try": "try:",
        "except": "except Exception as err:",
        "else": "else:",
    }
    # Elements that translate to one fixed statement
    statements = {
        "break": "break",
        "continue": "continue",
        "return": "return input",
        "raise": "raise",
    }
    code = []
    for c in node.iterchildren(tag=etree.Element):
        # Strip any '{namespace}' prefix from the tag
        n = c.tag[c.tag.find('}') + 1:]
        if n == "object":
            code.extend(self._handleLxmlObject(c))
        elif n == "assign":
            try:
                fro = c.attrib['from']
                to = c.attrib['to']
            except KeyError:
                raise ConfigFileException("Workflow element assign "
                                          "requires 'to' and 'from' "
                                          "attributes in %s" % self.id)
            code.append("%s = %s" % (to, fro))
        elif n == "for-each":
            code.extend(self._handleForEach(c))
            sub = self._handleLxmlFlow(c)
            if sub:
                code.extend([" " + s for s in sub])
            else:
                # A loop needs a body even when the element is empty
                code.append(" pass")
        elif n == "log":
            code.extend(self._handleLxmlLog(c))
        elif n in blockHeaders:
            code.append(blockHeaders[n])
            code.extend([" " + s for s in self._handleLxmlFlow(c)])
        elif n in statements:
            code.append(statements[n])
        elif n == "fork":
            code.extend(self._handleLxmlFork(c))
        else:
            # Only a missing handler means "unknown element"; errors raised
            # inside an existing handler must surface unchanged.
            try:
                fn = getattr(self, "_handleLxml%s" % n.title())
            except AttributeError:
                raise ConfigFileException("Unknown workflow element: "
                                          "%s" % n)
            code.extend(fn(c))
    return code
def __init__(self, session, node, parent):
    """Set up the OAI-PMH protocol map and validate its configuration.

    Initialises the lookup tables, delegates to
    ZeerexProtocolMap.__init__, then checks that a base URL can be
    built, that the mandatory 'oai_dc' metadataPrefix is configured and
    that at least one contact address is present.

    Raises ConfigFileException when any of those checks fails.
    """
    self.protocol = "http://www.openarchives.org/OAI/2.0/OAI-PMH"
    # key: metadataPrefix, value: XML Namespace
    self.recordNamespaces = {}
    # key: XML Namespace, value: Schema Location
    self.schemaLocations = {}
    # key: XML Namespace, value: Cheshire3 Transformer Object
    self.transformerHash = {}
    self.contacts = []
    ZeerexProtocolMap.__init__(self, session, node, parent)
    # some validation checks
    # NOTE(review): host, port and databaseName are presumably set while
    # the parent constructor processes the config node - confirm.
    try:
        self.baseURL = 'http://%s:%d/%s' % (self.host,
                                            self.port,
                                            self.databaseName)
    except (AttributeError, TypeError):
        # AttributeError: one of the three values was never set
        # TypeError: port is not a number, so '%d' cannot format it
        raise ConfigFileException(
            "Unable to derive baseURL from host, port, database")
    # metadataPrefix oai_dc is mandatory
    if 'oai_dc' not in self.recordNamespaces:
        raise ConfigFileException(
            "Schema configuration for mandatory metadataPrefix 'oai_dc' required in schemaInfo."
        )
    # at least 1 adminEmail address is mandatory for Identify response
    if not len(self.contacts):
        raise ConfigFileException(
            "Contact e-mail address of a database administrator required in databaseInfo."
        )
def _verifyDatabases(self, session):
    """Verify Keyspace and ColumnFamilies.

    Opens the container; when Cassandra reports that the keyspace does
    not exist, the keyspace is created through a connection to the
    'system' keyspace and the container is opened again.

    Raises ConfigFileException for any other failure of the first open.
    """
    def connectionError(exc):
        # Both failure paths report the exception's args in the same way
        return ConfigFileException(
            "Cannot connect to Cassandra: {0!r}".format(exc.args))

    try:
        self._openContainer(session)
    except pycassa.cassandra.ttypes.InvalidRequestException as e:
        if e.why != "Keyspace does not exist":
            raise connectionError(e)
        # find a way to create keyspace
        with pycassa.connect('system', servers=self.servers) as cxn:
            ks_def = pycassa.cassandra.ttypes.KsDef(
                self.keyspace,
                strategy_class='org.apache.cassandra.locator.RackUnawareStrategy',
                replication_factor=1,
                cf_defs=[]
            )
            cxn.add_keyspace(ks_def)
        self._openContainer(session)
    except Exception as e:
        raise connectionError(e)
def process_record(self, session, record):
    """Collect metadata values from record for every configured selector.

    self.sources is a list of lists of selector dicts, each with a
    'string' (the name) and a 'type' ('xpath', 'attribute' or
    'function').  One entry is appended per selector, except that an
    'attribute' selector naming a missing attribute appends nothing:

    * attribute found on the record -> [value]
    * 'now' (xpath) or 'now'/'now()' (function) -> [current timestamp]
    * any other missing old-style 'xpath' name -> None

    Raises ConfigFileException for an unknown function name or an
    unknown selector type.
    """
    def timestamp():
        # eg for lastModified/created etc
        return time.strftime("%Y-%m-%d %H:%M:%S")

    results = []
    for source in self.sources:
        for selector in source:
            key = selector['string']
            kind = selector['type']
            if kind == 'function':
                if key not in ('now', 'now()'):
                    # nothing else defined
                    raise ConfigFileException("Unknown function: %s" % key)
                results.append([timestamp()])
            elif kind == 'attribute':
                if hasattr(record, key):
                    results.append([getattr(record, key)])
            elif kind == 'xpath':
                # old style configuration
                if hasattr(record, key):
                    results.append([getattr(record, key)])
                elif key == 'now':
                    results.append([timestamp()])
                else:
                    results.append(None)
            else:
                raise ConfigFileException(
                    "Unknown metadata selector type: %s" % kind)
    return results
def _handleLxmlConfigNode(self, session, node):
    """Apply one lxml configuration element to this object.

    * A tag listed in self.simpleNodes is stored as an attribute named
      after the tag (namespace stripped), holding the stripped text.
    * A flags element is read as
      <flags> <flag> <object> <value> </flag> </flags>
      and each object is appended to self.flags[value].
    * history and hostmask elements are recognised but ignored.

    Raises ConfigFileException for an unknown flag value, or for a flag
    element that lacks its object or value child.
    """
    if node.tag in self.simpleNodes:
        setattr(self,
                node.tag[node.tag.find('}') + 1:],
                flattenTexts(node).strip())
    elif node.tag in ["flags", '{%s}flags' % CONFIG_NS]:
        # Extract Rights info
        for c in node.iterchildren(tag=etree.Element):
            if c.tag not in ["flag", '{%s}flag' % CONFIG_NS]:
                continue
            obj = None
            flag = None
            for c2 in c.iterchildren(tag=etree.Element):
                if c2.tag in ["object", '{%s}object' % CONFIG_NS]:
                    obj = flattenTexts(c2).strip()
                elif c2.tag in ["value", '{%s}value' % CONFIG_NS]:
                    flag = flattenTexts(c2).strip()
                    # Values starting with "c3fn" are accepted without
                    # being listed in self.allFlags
                    if (flag not in self.allFlags and
                            flag[:4] != "c3fn"):
                        msg = "Unknown flag: %s" % flag
                        raise ConfigFileException(msg)
            if obj is None or flag is None:
                msg = ("Missing object or value element for flag for "
                       "user %s" % self.username)
                # Pass the message on; it used to be built and dropped
                raise ConfigFileException(msg)
            f = self.flags.get(flag, [])
            if obj:
                f.append(obj)
            # The flag is registered even when the object text is empty
            self.flags[flag] = f
    elif node.tag in ["history", '{%s}history' % CONFIG_NS]:
        # Extract user history
        pass
    elif node.tag in ["hostmask", '{%s}hostmask' % CONFIG_NS]:
        # Extract allowed hostmask list
        pass
def __init__(self, session, config, parent):
    """Initialise the selector and check the shape of its sources.

    After SimpleSelector.__init__ has run, self.sources must hold at
    least one source, and the first source must contain exactly two
    entries.

    Raises ConfigFileException when either condition is not met.
    """
    self.sources = []
    SimpleSelector.__init__(self, session, config, parent)
    try:
        firstSource = self.sources[0]
    except IndexError:
        raise ConfigFileException("SpanXPathSelector '{0}' requires "
                                  "exactly 1 <source>".format(self.id))
    if len(firstSource) != 2:
        raise ConfigFileException("SpanXPathSelector '{0}' requires "
                                  "exactly two XPaths".format(self.id))
def __init__(self, session, server, config):
    """Configure the preParser and try to connect to a Multivalent server.

    When the 'mvServerPath' path is configured, a local server is
    started with java on 127.0.0.1 using a randomly chosen port;
    otherwise the 'host' and 'port' settings are used.  The
    'returnPacking' setting selects the output MIME type ('xml' gives
    text/xml, anything else text/plain).

    A failed initial connection is ignored here.  self.close_mvServer is
    registered to run at interpreter exit.

    Raises ConfigFileException when the configured server path does not
    exist, when 'port' is not made of digits, or when host, port or
    returnPacking is missing.
    """
    PreParser.__init__(self, session, server, config)
    # Pattern for the repr() of a Python 2 file object; captures the name
    self.source_re = re.compile("<open file '(.+?)', mode '.' at .*?>")
    # get settings from config
    # Az: Check existence of settings and fail consistently rather than
    # die half way through execution
    self.mvServerPath = self.get_path(session, 'mvServerPath')
    if self.mvServerPath:
        # they've specified a local path to the server code
        # we should start a server locally with automatically generated
        # port, in local-only mode
        if not os.path.exists(self.mvServerPath):
            raise ConfigFileException('Path type="mvServerPath" does not exist')
        host = '127.0.0.1'
        # find a random free port: keep drawing until bind() succeeds
        # NOTE(review): randrange(10000) can return 0, which fails the
        # truthiness check on port further down - confirm intended range.
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        err = True
        while (err):
            err = False
            port = random.randrange(10000)
            try:
                s.bind((host,port))
            except:
                err = True
        # The probe socket is closed again before the server is started
        s.close()
        del s
        # NOTE(review): os.popen2 exists only in Python 2; the returned
        # pipes are not stored on the instance.
        mvStdin, mvStdout = os.popen2(
            'java -D64 -Djava.awt.headless=true -Xms40m -Xmx256m -jar %s %d -guess -out xml -link' % (self.mvServerPath, port),
            't')
    else:
        # get settings for remote mv server
        host = self.get_setting(session, 'host')
        port = self.get_setting(session, 'port')
        # NOTE(review): if get_setting returns None for a missing 'port'
        # this raises AttributeError before the check below - confirm.
        if not port.isdigit():
            raise ConfigFileException("'port' setting for Multivalent preParser must be an integer.")
    pack = self.get_setting(session, 'returnPacking')
    if not (host and port and pack):
        raise ConfigFileException("'host', 'port' and 'returnPacking' settings must be set for Multivalent preParser '%s'" % self.id)
    self.mvHost = host
    self.mvPort = int(port)
    self.returnPacking = pack.lower()
    if (self.returnPacking == 'xml'):
        self.outMimeType = 'text/xml'
    else:
        self.outMimeType = 'text/plain'
    # initialise and connect to multivalent client
    self.mvClient = MultivalentClient()
    try:
        self.mvClient.connect(self.mvHost, self.mvPort)
    except:
        # (Try to connect at run time)
        pass
    atexit.register(self.close_mvServer)
def __init__(self, session, config, parent):
    """Initialise the normalizer and build its stemmer.

    The stemming language comes from the 'language' setting and
    defaults to 'english'.

    Raises ConfigFileException when the stemmer cannot be created for
    that language.
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    lang = self.get_setting(session, 'language', 'english')
    try:
        self.stemmer = Stemmer.Stemmer(lang)
    except Exception:
        # 'Exception' rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not reported as configuration errors
        raise ConfigFileException("Unknown stemmer language: %s" % (lang))
def _processPath(self, session, path): fp = self.get_path(session, path) if fp is None: raise ConfigFileException("No {0} file specified for object with id '{1}'.".format(path, self.id)) if (not os.path.isabs(fp)): dfp = self.get_path(session, "defaultPath") fp = os.path.join(dfp, fp) try: fh = open(fp, 'r') except IOError as e: raise ConfigFileException("{0} for object with id '{1}'.".format(str(e), self.id)) l = fh.readlines() fh.close() return l
def makeObjectFromDom(session, topNode, parentObject):
    """Build the object described by a configuration node.

    The object type is read from the node's objectType child: first
    without a namespace, then in the CONFIG_NS namespace, and, for
    nodes that have no xpath() method, by scanning childNodes.

    Returns the result of buildObject() for the stripped type name.

    Raises ConfigFileException when no objectType can be found.
    """
    # Lots of indirections from xml to object
    objectType = None
    try:
        objectType = topNode.xpath('./objectType/text()')[0]
    except IndexError:
        # May have namespace
        try:
            objectType = topNode.xpath('./c3:objectType/text()',
                                       namespaces={'c3': CONFIG_NS})[0]
        except IndexError:
            # Dump the offending node before failing below.  The call
            # form of print is valid on both Python 2 and Python 3.
            from lxml import etree
            print(etree.tostring(topNode))
    except AttributeError:
        # Not an Lxml config node
        for c in topNode.childNodes:
            if (c.nodeType == elementType and c.localName == "objectType"):
                # Here's what we want to instantiate
                objectType = getFirstData(c)
                break
    if objectType is None:
        raise ConfigFileException('No objectType set in config file.')
    objectType = objectType.strip()
    return buildObject(session, objectType, [topNode, parentObject])
def _walkZeeRex(self, session, node): if node.localName in ['indexInfo']: # Ignore return elif node.localName == 'serverInfo': self.version = node.getAttribute('version') for c in node.childNodes: self._walkZeeRex(session, c) elif node.localName == 'database': self.databaseName = str(flattenTexts(node)) elif node.localName == 'host': self.host = str(flattenTexts(node)) elif node.localName == 'port': self.port = int(flattenTexts(node)) elif node.localName == 'title': self.title = str(flattenTexts(node)) elif node.localName == 'contact': self.contacts.append(str(flattenTexts(node))) elif node.localName == 'schema': id = node.getAttribute('identifier') location = node.getAttribute('location') name = node.getAttribute('name') txrid = node.getAttributeNS(self.c3Namespace, 'transformer') if (txrid): txr = self.get_object(session, txrid) if (txr is None): raise ConfigFileException( "No transformer to map to for %s" % (txrid)) self.transformerHash[id] = txr self.recordNamespaces[name] = id self.schemaLocations[id] = location else: for c in node.childNodes: if c.nodeType == elementType: self._walkZeeRex(session, c)
def __init__(self, session, config, parent):
    """Initialise the preParser and validate its checksum algorithm.

    The algorithm name comes from the 'sumType' setting (default
    'md5') and is checked by asking hashlib for a hash object of that
    name.

    Raises ConfigFileException, carrying hashlib's message, when
    hashlib rejects the name with ValueError.
    """
    PreParser.__init__(self, session, config, parent)
    self.sumType = self.get_setting(session, 'sumType', 'md5')
    try:
        # The hash object itself is discarded; this is only a probe
        hashlib.new(self.sumType)
    except ValueError as err:
        problem = str(err)
        raise ConfigFileException(problem)
def _cacheIndexes(self, session): storeList = self.get_path(session, 'indexStoreList') if not storeList: indexStore = self.get_path(session, 'indexStore') if not indexStore: msg = ("No indexStore/indexStoreList associated with " "database: %s" % self.id) raise ConfigFileException(msg) storeList = [indexStore.id] else: storeList = storeList.split(' ') for (id, dom) in self.indexConfigs.iteritems(): # see if index should be built if hasattr(dom, 'childNodes'): for c in dom.childNodes: if c.nodeType == 1 and c.localName == 'paths': for c2 in c.childNodes: if c2.nodeType == 1 and c2.localName == 'object': istore = c2.getAttributeNS(None, 'ref') if istore in storeList: o = self.get_object(session, id) self.indexes[id] = o else: for c in dom.iterchildren(tag=etree.Element): if c.tag in ['paths', '{%s}paths' % CONFIG_NS]: for c2 in c.iterchildren(tag=etree.Element): if c2.tag in ['object', '{%s}object' % CONFIG_NS]: istore = c2.attrib.get( 'ref', c2.attrib.get('{%s}ref' % CONFIG_NS, '')) if istore in storeList: o = self.get_object(session, id) self.indexes[id] = o
def __init__(self, session, node, parent):
    """Start the tagger executable and wait until its models are loaded.

    The executable name comes from the 'executable' path (default
    'geniatagger') and its directory from 'executablePath'; when the
    directory is not configured it is looked up with 'which'.  The
    process is started from inside that directory, with '-nt' appended
    unless the 'tokenize' setting is true, and is kept as self.pipe.

    Raises ConfigFileException when no directory can be determined, or
    when the process closes stderr before reporting that its models
    are loaded.
    """
    self.unparsedOutput = self.get_setting(session, 'parseOutput', 0)
    tp = self.get_path(session, 'executablePath', '')
    exe = self.get_path(session, 'executable', 'geniatagger')
    if not tp:
        tp = getShellResult('which %s' % exe)
        tp = os.path.dirname(tp)
    if not tp:
        raise ConfigFileException("%s requires the path: "
                                  "executablePath" % self.id)
    o = os.getcwd()
    os.chdir(tp)
    try:
        if self.get_setting(session, 'tokenize', 0):
            cmd = exe
        else:
            cmd = "%s -nt" % exe
        self.pipe = Popen(cmd, shell=True, bufsize=1,
                          stdin=PIPE, stdout=PIPE, stderr=PIPE)
        # Block until the tagger announces on stderr that loading is done
        l = ""
        while l != 'loading named_entity_models..done.\n':
            l = self.pipe.stderr.readline()
            if not l:
                # Empty read means EOF: the process died or closed
                # stderr, so waiting any longer would loop forever
                raise ConfigFileException("%s: '%s' ended before its "
                                          "models were loaded"
                                          % (self.id, cmd))
    finally:
        # Always restore the caller's working directory
        os.chdir(o)
def _handleLocationNode(self, session, child):
    """Describe a location element of a selector config as a dict.

    The result always has 'string' (the element's first data), 'type'
    and 'maps'.  An element named xpath has type 'xpath'; otherwise the
    type is the lower-cased 'type' attribute.  For xpath locations the
    element's attributes are copied as well: xmlns:prefix declarations
    go into 'maps' (prefix -> URI), every other attribute becomes a
    key of the result itself.

    Raises ConfigFileException when reading the 'type' attribute fails.
    """
    location = {'maps': {}, 'string': '', 'type': ''}
    location['string'] = getFirstData(child)
    if child.localName == 'xpath':
        location['type'] = 'xpath'
    else:
        try:
            location['type'] = child.getAttribute('type').lower()
        except:
            raise ConfigFileException("Location element in {0} must have "
                                      "'type' attribute".format(self.id))
    if location['type'] != 'xpath':
        return location

    for key in child.attributes.keys():
        # ConfigStore using 4Suite: keys are tuples, use the node's name
        if type(key) == tuple:
            key = child.attributes[key].name
        if key[:6] != "xmlns:":
            location[key] = child.getAttributeNS(None, key)
            continue
        prefix = key[6:]
        uri = child.getAttributeNS('http://www.w3.org/2000/xmlns/', prefix)
        if not uri:
            # Fall back to the plain, non-namespaced attribute lookup
            uri = child.getAttribute(key)
        location['maps'][prefix] = uri
    return location
def __init__(self, session, node, parent):
    """Initialise the extractor and select the entity types to keep.

    The 'entityTypes' setting is a whitespace-separated list matched
    case-insensitively by prefix: 'pe...' selects PERSON, 'pl...' or
    'g...' selects GPE, 'org...' or 'co...' selects ORGANIZATION.
    Without the setting all three types are kept.  The 'pos' setting
    (default 0) is stored as self.keepPos.

    Raises MissingDependencyException when nltk is unavailable and
    ConfigFileException for an unrecognised entity type.
    """
    SimpleExtractor.__init__(self, session, node, parent)
    if nltk is None:
        raise MissingDependencyException(self.objectType, 'nltk')
    # Checked in order; the first matching group of prefixes wins
    prefixLabels = (
        (('pe',), 'PERSON'),
        (('pl', 'g'), 'GPE'),
        (('org', 'co'), 'ORGANIZATION'),
    )
    # Load types from config
    requested = self.get_setting(session, 'entityTypes')
    if not requested:
        # Default to all
        self.types = ['PERSON', 'GPE', 'ORGANIZATION']
    else:
        self.types = []
        for raw in requested.split():
            wanted = raw.lower()
            for prefixes, label in prefixLabels:
                if wanted.startswith(prefixes):
                    self.types.append(label)
                    break
            else:
                msg = ("Unknown entity type setting {0} on {1} {2}"
                       "".format(wanted,
                                 self.__class__.__name__,
                                 self.id))
                raise ConfigFileException(msg)
    # Should we keep the /POS tag or strip it
    self.keepPos = self.get_setting(session, 'pos', 0)
def __init__(self, session, config, parent):
    """Load and compile the configured XSLT stylesheet.

    The stylesheet location comes from the 'xsltPath' path; a relative
    location is resolved against 'defaultPath'.  A 'now' function is
    registered in the Cheshire3 XSL function namespace.  The optional
    'parameter' setting is a whitespace-separated list of name:value
    pairs; each value is stored in self.params wrapped in double
    quotes.  self.params stays None when the setting is absent.

    Raises ConfigFileException when 'xsltPath' is missing or a
    parameter pair has no ':' separator.
    """
    Transformer.__init__(self, session, config, parent)
    xfrPath = self.get_path(session, "xsltPath")
    if xfrPath is None:
        raise ConfigFileException("Missing path 'xsltPath' for "
                                  "{0}.".format(self.id))
    if os.path.isabs(xfrPath):
        path = xfrPath
    else:
        dfp = self.get_path(session, "defaultPath")
        path = os.path.join(dfp, xfrPath)
    ns = etree.FunctionNamespace(
        'http://www.cheshire3.org/ns/function/xsl/')
    ns['now'] = myTimeFn
    self.functionNamespace = ns
    self.parsedXslt = etree.parse(path)
    self.txr = etree.XSLT(self.parsedXslt)
    self.params = None
    parameter = self.get_setting(session, 'parameter', None)
    if parameter:
        self.params = {}
        # split() without an argument tolerates repeated whitespace
        for pair in parameter.split():
            # Split on the first ':' only, so values such as URLs
            # may themselves contain colons
            k, sep, v = pair.partition(':')
            if not sep:
                raise ConfigFileException("Parameter '{0}' for {1} must "
                                          "have the form name:value"
                                          "".format(pair, self.id))
            self.params[k] = '"%s"' % v
def __init__(self, session, config, parent):
    """Initialise the preParser and read its minimum-count settings.

    'minRules' and 'minItemsets' both default to -1 and are stored as
    self.minRules and self.minFIS.

    Raises ConfigFileException when minRules is positive while
    self.confidence is zero or negative.
    """
    Fimi1PreParser.__init__(self, session, config, parent)
    self.minRules = self.get_setting(session, 'minRules', -1)
    self.minFIS = self.get_setting(session, 'minItemsets', -1)
    if self.minRules > 0:
        # Asking for rules only makes sense with a confidence threshold
        if self.confidence <= 0:
            message = ("minRules setting not allowed without "
                       "confidence setting on %s" % (self.id))
            raise ConfigFileException(message)
def __init__(self, session, config, parent):
    """Initialise the logger and resolve the loggers it refers to.

    The 'loggerList' path holds space-separated object identifiers;
    each is looked up through the parent's get_object() and the
    results are stored, in order, in self.loggers.

    Raises ConfigFileException when 'loggerList' is not configured.
    """
    Logger.__init__(self, session, config, parent)
    loggerList = self.get_path(session, 'loggerList')
    if loggerList is None:
        raise ConfigFileException("Missing path 'loggerList' for "
                                  "{0}.".format(self.id))
    lookup = self.parent.get_object
    resolved = []
    for identifier in loggerList.split(' '):
        resolved.append(lookup(session, identifier))
    self.loggers = resolved
def __init__(self, session, config, parent):
    """Initialise the normalizer, its punctuation pattern and stemmer.

    The stemming language comes from the 'language' setting and
    defaults to 'english'.  self.punctuationRe is compiled here and
    stored for later use.

    Raises ConfigFileException when the stemmer cannot be created for
    that language.
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    lang = self.get_setting(session, 'language', 'english')
    # The pattern literal is kept exactly as it was written
    self.punctuationRe = re.compile("((?<!s)'|[-.,]((?=\s)|$)|(^|(?<=\s))[-.,']|[~`!@+=\#\&\^*()\[\]{}\\\|\":;<>?/])")
    try:
        self.stemmer = Stemmer.Stemmer(lang)
    except Exception:
        # 'Exception' rather than a bare except, so KeyboardInterrupt
        # and SystemExit are not reported as configuration errors
        raise ConfigFileException("Unknown stemmer language: %s" % (lang))
def _handleLxmlObject(self, node): ref = node.attrib.get('ref', '') try: typ = node.attrib['type'] except KeyError: raise ConfigFileException("Workflow element 'object' requires attribute 'type' in %s" % self.id) function = node.get('function', '') return self._handleAnonObject(ref, typ, function)
def _handleAnonObject(self, ref, typ, function): code = [] if (ref): self.objrefs.add(ref) o = "self.objcache['%s']" % ref elif typ == 'database': o = "self.database" elif typ == 'input': o = "input" elif typ: code.append("obj = self.database.get_path(session, '%s')" % typ) o = "obj" else: raise ConfigFileException("Could not determine object") if not function: # Assume most common for object type try: function = self.fnHash[typ] except KeyError: raise ConfigFileException("No default function for " "objectType: %s" % typ) if (function in self.singleFunctions): code.append('%s.%s(session)' % (o, function)) elif (function in self.singleInputFunctions): code.append('input = %s.%s(session)' % (o, function)) elif (typ == 'index' and function == 'store_terms'): code.append('%s.store_terms(session, input, inRecord)' % o) elif typ == 'documentFactory' and function == 'load' and input is None: code.append('input = %s.load(session)' % o) elif typ == 'documentStore': # Check for normalizer output code.append('if type(input) == {}.__class__:') code.append(' for k in input.keys():') code.append(' %s.%s(session, k)' % (o, function)) code.append('else:') code.append(' %s.%s(session, input)' % (o, function)) elif typ == 'xpathProcessor': code.append('global inRecord') code.append('inRecord = input') code.append('input = %s.process_record(session, input)' % o) else: code.append('result = %s.%s(session, input)' % (o, function)) code.append('if result is not None:') code.append(' input = result') return code
def __init__(self, session, config, parent):
    """Initialise the preParser and read where its external tool lives.

    Stores the 'filePath' path as self.filePath, the 'javaPath' path
    (default 'java') as self.java and the 'memory' setting (default
    1000) as self.memory.

    Raises ConfigFileException when 'filePath' is not configured.
    """
    ARMPreParser.__init__(self, session, config, parent)
    # Check we know where TFP is etc
    self.filePath = self.get_path(session, 'filePath', None)
    if self.filePath:
        self.java = self.get_path(session, 'javaPath', 'java')
        self.memory = self.get_setting(session, 'memory', 1000)
        return
    raise ConfigFileException("%s requires the path: filePath" % self.id)
def resolvePrefix(self, name):
    """Resolve a context set prefix to its identifier.

    Lookup order: an entry in self.prefixes; for an empty name, the
    default context set (resolved through self.prefixes when it is
    itself a known prefix, otherwise returned as is); the built-in
    'c3' prefix.

    Raises ConfigFileException when an empty name is given and no
    default context set exists, or when the prefix is unknown.
    """
    known = self.prefixes
    if name in known:
        return known[name]
    if not name:
        # Look for default
        if not hasattr(self, 'defaultContextSet'):
            raise ConfigFileException('Zeerex does not have default '
                                      'context set.')
        fallback = self.defaultContextSet
        if fallback in known:
            return known[fallback]
        return fallback
    if name == 'c3':
        return 'http://www.cheshire3.org/cql-context-set/internal'
    # YYY: Should step up to other config objects?
    raise ConfigFileException("Unknown prefix: %s" % (name))
def __init__(self, session, config, parent):
    """Initialise the normalizer and compile its regular expression.

    Stores the 'char' setting (default '') as self.char, the 'keep'
    setting (default 0) as self.keep and the compiled 'regexp'
    setting as self.regexp.

    Raises ConfigFileException when 'regexp' is missing or empty.
    """
    SimpleNormalizer.__init__(self, session, config, parent)
    self.char = self.get_setting(session, 'char', '')
    self.keep = self.get_setting(session, 'keep', 0)
    pattern = self.get_setting(session, 'regexp')
    if not pattern:
        raise ConfigFileException('Missing regexp setting for %s.'
                                  % (self.id))
    self.regexp = re.compile(pattern)
def _handleFlow(self, node):
    """Translate the element children of a DOM workflow node into code.

    Each child element is mapped to one or more lines of Python source.
    Children of block-like elements (for-each, try, except, else) are
    translated recursively and prefixed with one space of indentation.

    Returns a list of source lines (strings).

    Raises ConfigFileException when no handler exists for an element
    name.
    """
    # Elements that open a block: header line, then the indented children
    blockHeaders = {
        "try": "try:",
        "except": "except Exception as err:",
        "else": "else:",
    }
    # Elements that translate to one fixed statement
    statements = {
        "break": "break",
        "continue": "continue",
        "return": "return input",
        "raise": "raise",
    }
    code = []
    for c in node.childNodes:
        if c.nodeType != elementType:
            # Only element nodes are translated
            continue
        n = c.localName
        if n in blockHeaders:
            code.append(blockHeaders[n])
            code.extend([" " + s for s in self._handleFlow(c)])
        elif n in statements:
            code.append(statements[n])
        elif n == "assign":
            fro = c.getAttributeNS(None, 'from')
            to = c.getAttributeNS(None, 'to')
            code.append("%s = %s" % (to, fro))
        elif n == "for-each":
            code.extend(self._handleForEach(c))
            sub = self._handleFlow(c)
            if sub:
                code.extend([" " + s for s in sub])
            else:
                # A loop needs a body even when the element is empty
                code.append(" pass")
        elif n == "object":
            code.extend(self._handleObject(c))
        elif n == "log":
            code.extend(self._handleLog(c))
        elif n == "fork":
            code.extend(self._handleFork(c))
        else:
            # Only a missing handler means "unknown element"; errors raised
            # inside an existing handler must surface unchanged.
            try:
                fn = getattr(self, "_handle%s" % n.title())
            except AttributeError:
                raise ConfigFileException("Unknown workflow element: "
                                          "%s" % n)
            code.extend(fn(c))
    return code
def __init__(self, session, stream):
    """Connect to iRODS and open the collections leading to stream.

    stream is either an irods:// or rods:// URL, parsed with
    parse_irodsUrl(), or a path used with the connection parameters of
    the local iRODS environment.  The connection and the collection
    object are stored as self.cxn and self.coll.

    Raises MissingDependencyException when the irods module is not
    available, ConfigFileException when the login fails, and IOError
    when a component of the path cannot be opened as a collection and
    is not an object of the current collection either.
    """
    # Check for dependency
    if irods is None:
        raise MissingDependencyException(
            '{0.__module__}.{0.__class__.__name__}'.format(self),
            'irods (PyRods)'
        )
    # Check for URL
    if stream.startswith(('irods://', 'rods://')):
        myEnv = parse_irodsUrl(stream)
        stream = myEnv.relpath
    else:
        # Get parameters from env
        status, myEnv = irods.getRodsEnv()
    # The environment object exposes either getter methods or plain
    # attributes; try the getters first.
    try:
        host = myEnv.getRodsHost()
        port = myEnv.getRodsPort()
        username = myEnv.getRodsUserName()
        zone = myEnv.getRodsZone()
        home = myEnv.getRodsHome()
    except AttributeError:
        host = myEnv.rodsHost
        port = myEnv.rodsPort
        username = myEnv.rodsUserName
        zone = myEnv.rodsZone
        home = myEnv.rodsHome
    conn, errMsg = irods.rcConnect(host, port, username, zone)
    status = irods.clientLogin(conn)
    if status:
        raise ConfigFileException("Cannot connect to iRODS: ({0}) {1}"
                                  "".format(status, errMsg))
    c = irods.irodsCollection(conn)
    self.cxn = conn
    self.coll = c
    # Keep the stream as given, for error messages
    instream = stream
    # Check if abs path to home dir
    if stream.startswith(home):
        stream = stream[len(home):]
        # startswith() instead of stream[0]: the remainder is empty
        # when the stream names the home collection itself
        if stream.startswith("/"):
            stream = stream[1:]
    colls = stream.split('/')
    for i, cln in enumerate(colls):
        exit_status = c.openCollection(cln)
        if exit_status < 0:
            # Only the last component may be a data object instead of
            # a collection
            if (
                (i < len(colls) - 1) or
                (cln not in [obj[0] for obj in c.getObjects()])
            ):
                raise IOError("When opening {0}: {1} does not exists in "
                              "collection {2}".format(instream,
                                                      cln,
                                                      c.getCollName()))
def _open(self, session):
    """Connect to iRODS if necessary and move into the storage collection.

    When no connection exists yet one is made, using the host, port,
    user and zone held on this object and falling back to the iRODS
    environment for any that are unset; the server's resources are
    then cached in self.resourceHash by name.  When no collection is
    open yet, the collection named by the 'irodsCollection' path
    (default 'cheshire3') is opened under the iRODS home, followed,
    unless the 'createSubDir' setting is false, by a collection for
    the parent database (if the parent is a Database) and one for this
    object.  Missing collections are created on the way.

    Returns None.

    Raises ConfigFileException when the login fails.
    """
    if self.cxn is None:
        # connect to iRODS
        # NOTE(review): the unpacking order of getRodsEnv() is kept as
        # found; confirm it against the installed PyRods version.
        myEnv, status = irods.getRodsEnv()
        host = self.host if self.host else myEnv.getRodsHost()
        port = self.port if self.port else myEnv.getRodsPort()
        user = self.user if self.user else myEnv.getRodsUserName()
        zone = self.zone if self.zone else myEnv.getRodsZone()
        conn, errMsg = irods.rcConnect(host, port, user, zone)
        if self.passwd:
            status = irods.clientLoginWithPassword(conn, self.passwd)
        else:
            status = irods.clientLogin(conn)
        if status:
            raise ConfigFileException("Cannot connect to iRODS: (%s) %s"
                                      % (status, errMsg.getMsg()))
        self.cxn = conn
        self.env = myEnv
        resources = irods.getResources(self.cxn)
        self.resourceHash = {}
        for r in resources:
            self.resourceHash[r.getName()] = r
    if self.coll is not None:
        # already open, just skip
        return None
    c = irods.irodsCollection(self.cxn, self.env.getRodsHome())
    self.coll = c

    def enterCollection(name):
        # Create the sub-collection when it is missing, then open it
        if name not in c.getSubCollections():
            c.createCollection(name)
        c.openCollection(name)

    # move into cheshire3 section
    enterCollection(self.get_path(session, 'irodsCollection', 'cheshire3'))
    if self.get_setting(session, 'createSubDir', 1):
        # now look for object's storage area
        # maybe move into database collection
        if isinstance(self.parent, Database):
            enterCollection(self.parent.id)
        # move into store collection
        enterCollection(self.id)
def _connect(self, session):
    """Yield a connection to the configured Postgres database.

    This is a generator that yields exactly one connection.  When the
    consumer finishes normally the transaction is committed; the
    connection is closed in every case.

    NOTE(review): presumably wrapped by a context-manager decorator
    that is not visible from here - confirm.

    Raises ConfigFileException when the connection cannot be opened.
    """
    try:
        cxn = psycopg2.connect("dbname={0}".format(self.database))
    except psycopg2.OperationalError as e:
        # Wrap args in a 1-tuple: '%' would otherwise unpack e.args and
        # fail unless it held exactly one item
        raise ConfigFileException("Cannot connect to Postgres: %r"
                                  % (e.args,))
    try:
        yield cxn
        # Commit transactions (skipped when the consumer raised)
        cxn.commit()
    finally:
        # Never leak the connection, even when the consumer raised
        cxn.close()
def begin_indexing(self, session, index):
    """Prepare the store for indexing into index.

    Without self.tempChunks this simply delegates to
    BdbIndexStore.begin_indexing() and returns its result.  Otherwise
    the directory named by the 'tempPath' path (resolved against
    'defaultPath' when relative) is recorded as self.tempPath and
    created if missing, and an empty dict is registered for the index
    in self.outFiles.

    Raises ConfigFileException when the temporary path cannot be
    created or exists but is not a directory.
    """
    if not self.tempChunks:
        return BdbIndexStore.begin_indexing(self, session, index)
    temp = self.get_path(session, 'tempPath')
    if not os.path.isabs(temp):
        temp = os.path.join(self.get_path(session, 'defaultPath'), temp)
    self.tempPath = temp
    if not os.path.exists(temp):
        try:
            os.mkdir(temp)
        except OSError:
            # Only filesystem failures mean "not creatable"; anything
            # else (e.g. KeyboardInterrupt) must not be swallowed
            raise ConfigFileException('TempPath does not exist and is '
                                      'not creatable.')
    elif not os.path.isdir(temp):
        raise ConfigFileException('TempPath is not a directory.')
    # Make temp files on demand, in hash
    self.outFiles[index] = {}