def process_update(self, req):
    """Route an SRW update request to the handler for its operation.

    Sets ``self.version``, pre-sets ``self.operationStatus`` to ``"fail"``,
    builds a fresh Session bound to the request's database and calls the
    ``handle_*`` method whose operation URI equals ``req.operation``.
    An unrecognised operation is recorded in ``self.diagnostics`` instead
    of being raised.

    Raises:
        Diagnostic7: if ``req.version`` is missing or empty.
    """
    self.version = "1.1"
    # Status is not changed again in this method; presumably the handlers
    # update it on success -- verify against handle_* implementations.
    self.operationStatus = "fail"

    if not req.version:
        missing = Diagnostic7()
        missing.message = "Mandatory 'version' parameter not supplied"
        missing.details = 'version'
        raise missing

    database = req.config.parent
    req._db = database

    session = Session()
    session.environment = "apache"
    session.database = database.id

    # (operation URI, name of the method that services it)
    routes = (
        ("info:srw/operation/1/create", "handle_create"),
        ("info:srw/operation/1/replace", "handle_replace"),
        ("info:srw/operation/1/delete", "handle_delete"),
        ("info:srw/operation/1/metadata", "handle_metadata"),
    )
    for uri, methodName in routes:
        if req.operation == uri:
            getattr(self, methodName)(session, req)
            break
    else:
        # No route matched: report rather than raise
        unknown = SRWDiagnostics.Diagnostic1()
        unknown.details = "Unknown operation: %s" % req.operation
        self.diagnostics = [unknown]
def build_architecture(data=None):
    """(Re)build the module-level Cheshire3 objects for the 'db_ead' database.

    Replaces the global Session (carrying over the previous session's user
    when there is one), then reloads the server, database, default path,
    parser, transformers and pre-parser workflow, and clears ``rebuild``.

    ``data`` is accepted so the function can be used as a clean-up
    callback; it is not used.
    """
    global session, serv, db, dbPath, docParser
    global fullTxr, fullSplitTxr
    global ppFlow
    global rebuild

    # Remember who was logged in before the session is thrown away
    previousUser = session.user if session else None

    session = Session()
    session.database = 'db_ead'
    session.environment = 'apache'
    session.user = previousUser

    serverConfig = os.path.join(cheshirePath, 'cheshire3', 'configs',
                                'serverConfig.xml')
    serv = SimpleServer(session, serverConfig)
    db = serv.get_object(session, 'db_ead')
    dbPath = db.get_path(session, 'defaultPath')
    docParser = db.get_object(session, 'LxmlParser')

    # transformers
    fullTxr = db.get_object(session, 'htmlFullTxr')
    fullSplitTxr = db.get_object(session, 'htmlFullSplitTxr')

    # workflows
    ppFlow = db.get_object(session, 'preParserWorkflow')
    ppFlow.load_cache(session, db)

    rebuild = False
def build_architecture(data=None):
    """(Re)build the module-level Cheshire3 objects for 'db_hubedit'.

    Creates a new global Session, loads the server and database, fetches
    the stores and parser used by this module, and clears ``rebuild``.
    ``data`` is not used.
    """
    global rebuild, session, serv, db, dbPath
    global editStore, authStore, instStore, userStore, xmlp
    global docStoreConfigStore

    session = Session()
    session.database = 'db_hubedit'
    session.environment = 'apache'

    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    serv = SimpleServer(session, serverConfig)
    db = serv.get_object(session, 'db_hubedit')
    dbPath = db.get_path(session, 'defaultPath')

    # Stores are fetched in the same order as before
    editStore = db.get_object(session, 'editingStore')
    userStore = db.get_object(session, 'hubAuthStore')
    instStore = db.get_object(session, 'institutionStore')
    docStoreConfigStore = db.get_object(session, 'documentStoreConfigStore')
    authStore = db.get_object(session, 'adminAuthStore')
    xmlp = db.get_object(session, 'LxmlParser')

    rebuild = False
def test_sessionDatabaseAssign(self):
    """Check that Session.database can be assigned and then re-assigned."""
    session = Session()
    steps = (
        ("db_test1", "session.database assignment failed"),
        ("db_test2", "session.database re-assignment failed"),
    )
    for identifier, failureMessage in steps:
        session.database = identifier
        self.assertEqual(session.database, identifier, failureMessage)
def test_sessionEnvironmentAssign(self):
    """Check that Session.environment can be assigned and then re-assigned."""
    session = Session()
    steps = (
        ("apache", "session.environment assignment failed"),
        ("terminal", "session.environment re-assignment failed"),
    )
    for environment, failureMessage in steps:
        session.environment = environment
        self.assertEqual(session.environment, environment, failureMessage)
def build_architecture(data=None):
    """Build the module-level Cheshire3 objects for 'db_' + databaseName.

    Creates an anonymous 'apache' Session, loads the server and the
    database, then fetches the query factory, parser, record store and
    the article / KWIC transformers.  ``data`` is not used.
    """
    # NOTE(review): sentenceStore, paragraphStore and resultSetStore are
    # declared global but never assigned in this function.
    global session, serv, db, qf, xmlp
    global recordStore, sentenceStore, paragraphStore, resultSetStore
    global articleTransformer, kwicTransformer

    session = Session()
    session.environment = 'apache'
    session.user = None

    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    serv = SimpleServer(session, serverConfig)

    # The database identifier is only set once the server exists
    session.database = 'db_' + databaseName
    db = serv.get_object(session, session.database)

    qf = db.get_object(session, 'defaultQueryFactory')
    xmlp = db.get_object(session, 'LxmlParser')
    recordStore = db.get_object(session, 'recordStore')
    articleTransformer = db.get_object(session, 'article-Txr')
    kwicTransformer = db.get_object(session, 'kwic-Txr')
def __init__(self, store):
    """Open a cursor on ``store`` and pre-load the first item.

    ``self.nextData`` is set to ``(key, data)`` for the first key the
    cursor returns, with the data fetched through ``store.fetch_data``.
    """
    self.store = store
    self.session = Session()
    self.cxn = store._open(self.session, 'byteCount')
    self.cursor = self.cxn.cursor()
    # Only the key of the first cursor entry is needed
    firstKey, _ = self.cursor.first()
    firstData = self.store.fetch_data(self.session, firstKey)
    self.nextData = (firstKey, firstData)
def __init__(self):
    """Connect to the 'db_dickens' database and fetch its query factory."""
    session = Session()
    session.database = 'db_dickens'
    self.session = session

    configPath = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    self.serv = SimpleServer(session, configPath)
    self.db = self.serv.get_object(session, session.database)
    self.qf = self.db.get_object(session, 'defaultQueryFactory')
def build_architecture(data=None):
    """Build the module-level Cheshire3 objects for 'db_' + databaseName.

    Creates an anonymous 'apache' Session, loads the server and the
    database, then fetches the query factory, parser, stores, extractors
    and transformers used by this module.  ``data`` is not used.
    """
    # NOTE(review): adf, fimi2, rule, arm, vecTxr, vectorStore and
    # armTableTxr are declared global but never assigned in this function.
    global session, serv, db, qf, xmlp
    global recordStore, resultSetStore, idxStore
    global articleTransformer, kwicTransformer
    global proxExtractor, simpleExtractor
    global adf, fimi2, rule, arm, vecTxr, vectorStore, armTableTxr

    session = Session()
    session.environment = 'apache'
    session.user = None

    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    serv = SimpleServer(session, serverConfig)

    # The database identifier is only set once the server exists
    session.database = 'db_' + databaseName
    db = serv.get_object(session, session.database)

    qf = db.get_object(session, 'defaultQueryFactory')
    xmlp = db.get_object(session, 'LxmlParser')
    recordStore = db.get_object(session, 'recordStore')
    resultSetStore = db.get_object(session, 'resultSetStore')
    simpleExtractor = db.get_object(session, 'SimpleExtractor')
    proxExtractor = db.get_object(session, 'ProxExtractor')
    articleTransformer = db.get_object(session, 'article-Txr')
    kwicTransformer = db.get_object(session, 'kwic-Txr')
    idxStore = db.get_object(session, 'indexStore')
def setUp(self):
    """Build one record per test datum.

    ``self.records`` becomes a list of dicts with the raw XML under
    ``'xml'`` and the constructed record object under ``'record'``.
    """
    self.session = Session()
    self.records = []
    recordClass = self._get_class()
    for xml in self._get_data():
        record = recordClass(self._parse_data(xml),
                             xml=xml,
                             byteCount=len(xml))
        self.records.append({'xml': xml, 'record': record})
def __init__(self):
    '''
    Connect to the 'db_dickens' Cheshire3 database and fetch the query
    factory, result set store and index store.
    '''
    session = Session()
    session.database = 'db_dickens'
    self.session = session

    configPath = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    self.serv = SimpleServer(session, configPath)
    self.db = self.serv.get_object(session, session.database)

    self.qf = self.db.get_object(session, 'defaultQueryFactory')
    self.resultSetStore = self.db.get_object(session, 'resultSetStore')
    self.idxStore = self.db.get_object(session, 'indexStore')
def setUp(self):
    """Create a server, register dependency configs and build the test object.

    The object under test is built from ``self._get_config()`` and stored
    as ``self.testObj``.
    """
    session = self.session = Session()
    configPath = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    server = self.server = SimpleServer(session, configPath)

    # Register every dependency config under its own 'id'
    for dependency in self._get_dependencyConfigs():
        server.subConfigs[dependency.get('id')] = dependency

    # Disable stdout logging
    server.get_path(session, 'defaultLogger').minLevel = 60

    # Create object that will be tested
    self.testObj = makeObjectFromDom(session, self._get_config(), server)
def setUp(self):
    """Create one StringDocument per (mimeType, data, history) test triple."""
    self.session = Session()
    self.testPairs = [
        ('application/xml', '<doc><foo/><bar><baz/></baz></doc>', []),
        ('text/plain', 'This is my document!', ['aProcessingObject'])
    ]
    self.testDocs = []
    creatorId = id(self)
    for mimeType, text, history in self.testPairs:
        document = StringDocument(text,
                                  mimeType=mimeType,
                                  creator=creatorId,
                                  history=history,
                                  byteCount=len(text),
                                  wordCount=len(text.split(' ')))
        self.testDocs.append(document)
def getCheshire3Env(args):
    """Init and return Cheshire3 Session, Server and Database.

    Initialize Cheshire3 Session, Server and Database objects based on
    ``args``.

    ``args.serverconfig`` is the server configuration file and
    ``args.database`` the database identifier.  When ``args.database`` is
    None the identifier is discovered from the current working directory.

    Returns a ``(session, server, db)`` tuple.  Errors raised while
    discovering or loading the database are logged as critical and
    re-raised.
    """
    # Create a Session
    session = Session()
    # Get the Server based on given serverConfig file
    server = SimpleServer(session, args.serverconfig)
    # Try to get the Database
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e) rather than e.message: .message is deprecated, is
            # empty when the error carries (errno, strerror), and does not
            # exist at all on Python 3.
            server.log_critical(session, str(e))
            raise
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        raise
    else:
        # Attach a default Logger to the Session
        session.logger = db.get_path(session, 'defaultLogger')

    return session, server, db
def __init__(self):
    '''
    Open a Cheshire3 session on the 'db_dickens' database and fetch the
    query factory, result set store and index store.

    No search term is handled here (cf. build_and_run_query).
    '''
    session = Session()
    session.database = 'db_dickens'
    self.session = session

    configPath = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    self.serv = SimpleServer(session, configPath)
    self.db = self.serv.get_object(session, session.database)

    self.qf = self.db.get_object(session, 'defaultQueryFactory')
    self.resultSetStore = self.db.get_object(session, 'resultSetStore')
    self.idxStore = self.db.get_object(session, 'indexStore')
def setUp(self):
    """Create four ResultSetItems and two ResultSets that contain them.

    ``self.a`` holds rsi1 and rsi3; ``self.b`` holds rsi2 and rsi4.
    rsi1 and rsi2 share id and recStore and differ only in ``occs``.
    """
    self.session = session = Session()

    # Keyword arguments shared by every item
    shared = dict(database="", diagnostic=None, weight=0.5,
                  resultSet=None, numeric=None)

    # Set up same 4 ResultSetItems as for SimpleResultSetItemTestCase
    self.rsi1 = SimpleResultSetItem(session, id=0, recStore="recordStore",
                                    occs=5, **shared)
    self.rsi2 = SimpleResultSetItem(session, id=0, recStore="recordStore",
                                    occs=3, **shared)
    self.rsi3 = SimpleResultSetItem(session, id=1, recStore="recordStore",
                                    occs=1, **shared)
    self.rsi4 = SimpleResultSetItem(session, id=0, recStore="recordStore2",
                                    occs=2, **shared)

    # rsi1 and rsi2 go into separate ResultSets
    self.a = SimpleResultSet(session, [self.rsi1, self.rsi3], id="a")
    self.b = SimpleResultSet(session, [self.rsi2, self.rsi4], id="b")
def __init__(self, session, name=None, manager=None, debug=0):
    """Initialise the process with its own copy of the caller's Session.

    A new Session is built from the user, logger, database and environment
    of ``session``, with ``task`` set to this process's name.  The server
    is taken from ``session`` and the database object is looked up by
    ``session.database``.
    """
    # This sets self.name
    mp.Process.__init__(self, name=name)
    self.inPipe = None
    self.debug = debug
    self.manager = manager
    # Reconstruct our own session, so as to not overwrite task
    self.session = Session(user=session.user, logger=session.logger, task=self.name, database=session.database, environment=session.environment)
    self.session.server = session.server
    self.server = session.server
    self.database = self.server.get_object(self.session, session.database)

# NOTE(review): assumed to be a class-level statement following __init__
# (indentation was lost in this view) -- confirm.  It exposes ``name`` as a
# property when mp.Process provides get_name/set_name and does nothing
# otherwise.
try:
    name = property(mp.Process.get_name, mp.Process.set_name)
except AttributeError:
    pass
def setUp(self):
    """Create four ResultSetItems to compare.

    N.B. a == b, other pairs should not evaluate as equal
    """
    self.session = session = Session()

    def makeItem(itemId, storeId):
        # Everything except id and recStore is the same for all four items
        return SimpleResultSetItem(session,
                                   id=itemId,
                                   recStore=storeId,
                                   occs=0,
                                   database="",
                                   diagnostic=None,
                                   weight=0.5,
                                   resultSet=None,
                                   numeric=None)

    self.a = makeItem(0, "recordStore")
    self.b = makeItem(0, "recordStore")
    self.c = makeItem(1, "recordStore")
    self.d = makeItem(0, "recordStore2")
def directoryRecordStoreIter(store):
    """Yield each item of ``store`` after ``store._process_data``.

    Items come from ``directoryStoreIter(store)`` as ``(id, data)`` pairs;
    a single Session, created when iteration starts, is shared by all of
    them.
    """
    session = Session()
    for identifier, rawData in directoryStoreIter(store):
        processed = store._process_data(session, identifier, rawData)
        yield processed
from mod_python.util import FieldStorage import os, re, time from xml.sax.saxutils import escape from lxml import etree from lxml.builder import ElementMaker from cheshire3.server import SimpleServer from cheshire3.baseObjects import Session from cheshire3.utils import flattenTexts from cheshire3 import cqlParser from cheshire3 import internal from cheshire3 import exceptions as c3errors cheshirePath = os.environ.get('C3HOME', '/home/cheshire') session = Session() session.environment = "apache" serv = SimpleServer( session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml')) configs = {} # determine the root URL of this handler for configitem in apache.config_tree(): if configitem[0] == "DocumentRoot": docRoot = configitem[1].strip("\"'") handlerUrl = apache.get_handler_root().replace(docRoot, "") if len(serv.databaseConfigs) < 25:
def test_sessionEnvironmentDefault(self):
    """A Session created without arguments has environment 'terminal'."""
    self.assertEqual(Session().environment, "terminal")
def test_sessionEnvironmentInit(self):
    """The ``environment`` constructor argument is stored on the Session."""
    expected = "apache"
    self.assertEqual(Session(environment=expected).environment, "apache")
from mod_python.util import FieldStorage import os, re, time from xml.sax.saxutils import escape from lxml import etree from lxml.builder import ElementMaker from cheshire3.server import SimpleServer from cheshire3.baseObjects import Session from cheshire3.utils import flattenTexts from cheshire3 import cqlParser from cheshire3 import internal from cheshire3 import exceptions as c3errors cheshirePath = os.environ.get('C3HOME', '/home/cheshire') session = Session() session.environment = "apache" serv = SimpleServer(session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml')) configs = {} # determine the root URL of this handler for configitem in apache.config_tree(): if configitem[0] == "DocumentRoot": docRoot = configitem[1].strip("\"'") handlerUrl = apache.get_handler_root().replace(docRoot, "") if len(serv.databaseConfigs) < 25: # relatively few dbs - we can safely cache them
def test_sessionDatabaseInit(self):
    """The ``database`` constructor argument is stored on the Session."""
    expected = "db_test1"
    self.assertEqual(Session(database=expected).database, "db_test1")
def groupDist(dist):
    """Print how many articles have 1, 2, 3 or 4+ occurrences.

    ``dist`` maps an integer occurrence count to the number of articles
    with that count.  One tab-separated row is printed per group (count
    and percentage of all articles), followed by the overall totals.

    Counts missing from ``dist`` are reported as 0, and the "4+" row
    covers every key >= 4, including the largest one.
    """
    hits = sum(dist.values())
    occs = sum(int(k) * int(v) for k, v in dist.items())

    def percent(count):
        # Guard against an empty distribution (hits == 0)
        if not hits:
            return 0.0
        return float(count) / float(hits) * 100.0

    for i in (1, 2, 3):
        count = dist.get(i, 0)
        print("%s\t%s\t%0.2f" % (i, count, percent(count)))

    # range(4, max(keys)) stopped one short of the largest key, so sum
    # directly over the keys instead.
    fourPlus = sum(v for k, v in dist.items() if k >= 4)
    print("4+\t%s\t%0.2f" % (fourPlus, percent(fourPlus)))
    print("\n%i occurrences in %i articles" % (occs, hits))


# Module-level Cheshire3 environment for the 'db_news' database
session = Session()
serv = SimpleServer(session, "../../configs/serverConfig.xml")
db = serv.get_object(session, 'db_news')
session.database = 'db_news'
idxStore = db.get_object(session, 'indexStore')
recStore = db.get_object(session, 'recordStore')
def test_sessionInstance(self):
    """Calling Session() yields an instance of Session."""
    self.assertIsInstance(Session(), Session)
# from cheshire3.utils import reader from cheshire3.baseObjects import Session # Apache Config: #<Directory /usr/local/apache2/htdocs/srw> # SetHandler mod_python # PythonDebug On # PythonPath "['/home/cheshire/c3/code', '/usr/local/lib/python2.3/lib-dynload']+sys.path" # PythonHandler srwApacheHandler #</Directory> # NB. SetHandler, not AddHandler. cheshirePath = os.environ.get('C3HOME', '/home/cheshire') session = Session() session.environment = "apache" serv = SimpleServer(session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml')) configs = {} serv._cacheDatabases(session) for db in serv.databases.values(): if db.get_setting(session, 'SRW') or db.get_setting(session, 'srw'): db._cacheProtocolMaps(session) map = db.protocolMaps.get('http://www.loc.gov/zing/srw/', None) map2 = db.protocolMaps.get('http://www.loc.gov/zing/srw/update/', None) configs[map.databaseUrl] = {'http://www.loc.gov/zing/srw/' : map, 'http://www.loc.gov/zing/srw/update/' : map2} class reqHandler:
# Root of the Cheshire3 installation; overridable through $C3HOME
cheshirePath = os.environ.get('C3HOME', '/home/cheshire/')
# Make the bundled cheshire3 code importable before the imports below run
sys.path.insert(1, os.path.join(cheshirePath, 'cheshire3', 'code'))

from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.document import StringDocument
from cheshire3 import exceptions as c3errors
from cheshire3.web.www_utils import read_file

# import customisable variables
#from localConfig import *

# Build environment...
session = Session()
serv = SimpleServer(session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))
session.database = 'db_ead'
db = serv.get_object(session, 'db_ead')
lgr = db.get_path(session, 'defaultLogger')
# Objects fetched from the main 'db_ead' database
recordStore = db.get_object(session, 'recordStore')
authStore = db.get_object(session, 'eadAuthStore')
compStore = db.get_object(session, 'componentStore')
clusDocFac = db.get_object(session, 'clusterDocumentFactory')
# The cluster database is a separate database on the same server
clusDb = serv.get_object(session, 'db_ead_cluster')
clusRecordStore = clusDb.get_object(session, 'eadClusterStore')
xmlp = db.get_object(session, 'LxmlParser')
) parser_list.set_defaults(func=list_users) # Create the parser for the "remove" command parser_remove = subparsers.add_parser('remove', help='Remove an existing user') parser_remove.add_argument('username', type=str, nargs='*', help='Username of the user(s) to remove') parser_remove.set_defaults(func=remove_user) # Build environment... session = Session() serv = SimpleServer( session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml' ) ) session.database = 'db_ead' db = serv.get_object(session, 'db_ead') xmlp = db.get_object(session, 'LxmlParser') authStore = db.get_object(session, 'hubAuthStore') # Editors superAuthStore = db.get_object(session, 'adminAuthStore') # Hub Staff instStore = db.get_object(session, 'institutionStore') # Institutions
from PyZ3950.zdefs import *
from PyZ3950 import oids
import random
rand = random.Random()
from PyZ3950 import CQLParser
# Register the ASN.1 types used for SQL and CQL queries
asn1.register_oid(Z3950_QUERY_SQL, SQLQuery)
asn1.register_oid(Z3950_QUERY_CQL, asn1.GeneralString)

from cheshire3.baseObjects import Session, Database, Transformer, Workflow
from cheshire3.server import SimpleServer
from cheshire3 import internal
from cheshire3 import cqlParser

# Module-level Cheshire3 environment shared by the handler
session = Session()
session.environment = "apache"
server = SimpleServer(session, os.path.join(internal.cheshire3Root, 'configs', 'serverConfig.xml'))
# configs: Z39.50 database name -> protocol map
# dbmap:   Cheshire3 database id -> Z39.50 database name
configs = {}
dbmap = {}
server._cacheDatabases(session)
for db in server.databases.values():
    # Only databases with the "z3950" setting are considered
    if db.get_setting(session, "z3950"):
        db._cacheProtocolMaps(session)
        map1 = db.protocolMaps.get('http://www.loc.gov/z3950/', None)
        # Databases without a Z39.50 protocol map are skipped
        if map1:
            configs[map1.databaseName] = map1
            dbmap[db.id] = map1.databaseName

# NOTE(review): assumed to run once after the loop rather than per
# database (indentation was lost in this view) -- confirm.
session.resultSetStore = server.get_path(session, 'resultSetStore')
session.logger = server.get_path(session, 'z3950Logger')
from cheshire3.server import SimpleServer # C3 web search utils from cheshire3.web.www_utils import * # separate file containing display configs + some HMTL for table rows etc. from clic.dickens.web.dickensWebConfig import * from clic.dickens.web.dickensSearchHandler import SearchHandler from clic.dickens.web.dickensBrowseHandler import BrowseHandler cheshirePath = os.environ.get('HOME', '/home/cheshire') logPath = os.path.join(cheshirePath, 'clic', 'www', databaseName, 'logs', 'searchHandler.log') htmlPath = os.path.join(cheshirePath, 'clic', 'www', databaseName, 'html') session = Session() session.environment = 'apache' session.user = None serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')) session.database = 'db_dickens' db = serv.get_object(session, session.database) authStore = db.get_object(session, 'authStore') # Discover objects... def handler(req): global db, htmlPath, logPath, cheshirePath, xmlp, recordStore form = FieldStorage(req) try: dir = req.uri[1:].rsplit('/')[1]
raise NoSetHierarchyError() # End Cheshire3OaiServer ------------------------------------------------ def get_databasesAndConfigs(session, serv): """Get and return database and config mappings from Server.""" dbs = {} configs = {} serv._cacheDatabases(session) for db in serv.databases.values(): if db.get_setting(session, 'oai-pmh'): db._cacheProtocolMaps(session) pmap = db.protocolMaps.get( 'http://www.openarchives.org/OAI/2.0/OAI-PMH', None) # Check that there's a path and that it can actually be requested # from this handler if (pmap is not None): configs[pmap.databaseName] = pmap dbs[pmap.databaseName] = db return dbs, configs # Cheshire3 architecture session = Session() serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')) lxmlParser = serv.get_object(session, 'LxmlParser') dbs, configs = get_databasesAndConfigs(session, serv) c3OaiServers = {}
## count words in books, and list titles ## used to create booklist import os import re from lxml import etree import json from cheshire3.document import StringDocument from cheshire3.internal import cheshire3Root from cheshire3.server import SimpleServer from cheshire3.baseObjects import Session session = Session() session.database = 'db_dickens' serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')) db = serv.get_object(session, session.database) qf = db.get_object(session, 'defaultQueryFactory') resultSetStore = db.get_object(session, 'resultSetStore') idxStore = db.get_object(session, 'indexStore') list_books = [ 'BH', 'BR', 'DC', 'DS', 'ED', 'GE', 'HT', 'LD', 'MC', 'NN', 'OCS', 'OMF', 'OT', 'PP', 'TTC', 'AgnesG', 'Antoni', 'arma', 'cran', 'Deronda', 'dracula', 'emma', 'frank', 'jane', 'Jude', 'LadyAud', 'mary', 'NorthS', 'persuasion', 'pride', 'sybil', 'Tess', 'basker', 'Pomp', 'mill', 'dorian', 'Prof', 'native', 'alli', 'Jekyll', 'wwhite', 'vanity', 'VivianG', 'wh' ] titles = {
import random
rand = random.Random()
from PyZ3950 import CQLParser
# Register the ASN.1 types used for SQL and CQL queries
asn1.register_oid(Z3950_QUERY_SQL, SQLQuery)
asn1.register_oid(Z3950_QUERY_CQL, asn1.GeneralString)

from cheshire3.baseObjects import Session, Database, Transformer, Workflow
from cheshire3.server import SimpleServer
from cheshire3 import internal
from cheshire3 import cqlParser

# Root of the Cheshire3 installation; overridable through $C3HOME
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')
# Module-level Cheshire3 environment shared by the handler
session = Session()
session.environment = "apache"
server = SimpleServer(
    session,
    os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))
# configs: Z39.50 database name -> protocol map
# dbmap:   Cheshire3 database id -> Z39.50 database name
configs = {}
dbmap = {}
server._cacheDatabases(session)
for db in server.databases.values():
    # Only databases with the "z3950" setting are considered
    if db.get_setting(session, "z3950"):
        db._cacheProtocolMaps(session)
        map1 = db.protocolMaps.get('http://www.loc.gov/z3950/', None)
        # Databases without a Z39.50 protocol map are skipped
        if map1:
            configs[map1.databaseName] = map1
            dbmap[db.id] = map1.databaseName
# from cheshire3.utils import reader from cheshire3.baseObjects import Session # Apache Config: #<Directory /usr/local/apache2/htdocs/srw> # SetHandler mod_python # PythonDebug On # PythonPath "['/home/cheshire/c3/code', '/usr/local/lib/python2.3/lib-dynload']+sys.path" # PythonHandler srwApacheHandler #</Directory> # NB. SetHandler, not AddHandler. cheshirePath = os.environ.get('C3HOME', '/home/cheshire') session = Session() session.environment = "apache" serv = SimpleServer( session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml')) configs = {} serv._cacheDatabases(session) for db in serv.databases.values(): if db.get_setting(session, 'SRW') or db.get_setting(session, 'srw'): db._cacheProtocolMaps(session) map = db.protocolMaps.get('http://www.loc.gov/zing/srw/', None) map2 = db.protocolMaps.get('http://www.loc.gov/zing/srw/update/', None) configs[map.databaseUrl] = { 'http://www.loc.gov/zing/srw/': map, 'http://www.loc.gov/zing/srw/update/': map2
class Cheshire3Engine(BaseEngine):
    """Search-engine backend that delegates to a Cheshire3 database.

    The Session and SimpleServer are created once, when the class body is
    executed, and are shared by every instance.  ``databaseName`` and
    ``databasePath`` are read from the instance and are expected to be
    provided elsewhere (presumably by BaseEngine -- confirm).
    """
    #schema = Schema(title=TEXT(stored=True), path=TEXT(stored=True), href=ID(stored=True), cfiBase=TEXT(stored=True), spinePos=TEXT(stored=True), content=TEXT)
    #database = 'db_tdo_simple_sru'
    cheshire_metadata_dir = '/cheshire3-metadata'
    session = Session()
    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    server = SimpleServer(session, serverConfig)
    # Objects fetched lazily from the database
    queryFactory = None
    db = None
    titleSel = None
    anywhereSel = None
    proxExtractor = None

    def __initializeTitleSelector(self):
        """Fetch 'titleXPathSelector', falling back to 'titleSelector'."""
        try:
            self.titleSel = self.db.get_object(self.session, 'titleXPathSelector')
        except ObjectDoesNotExistException:
            try:
                self.titleSel = self.db.get_object(self.session, 'titleSelector')
            except ObjectDoesNotExistException as e:
                print(e)

    def __initializeAnywhereSelector(self):
        """Fetch 'anywhereXPathSelector'; print the error if it is missing."""
        try:
            self.anywhereSel = self.db.get_object(self.session, 'anywhereXPathSelector')
        except ObjectDoesNotExistException as e:
            print(e)

    def __initializeProximityExtractor(self):
        """Fetch 'ProxExtractor'; print the error if it is missing."""
        try:
            self.proxExtractor = self.db.get_object(self.session, 'ProxExtractor')
        except ObjectDoesNotExistException as e:
            print(e)

    def __highlight(self, text, term, n):
        """Return one highlighted snippet per literal occurrence of term.

        Each snippet is up to ``n`` characters of preceding context, the
        term wrapped in ``<b class="match term0">``, and up to ``n``
        characters of following context.  An empty term yields no snippets.
        """
        term_concordance = list()
        text_len = len(text)
        term_len = len(term)
        if not term_len:
            return term_concordance
        # re.escape: the term is user text, not a regular expression
        for match in re.finditer(re.escape(term), text):
            idx = match.start()
            # Clamp at 0: a negative start would slice from the end of text
            start = max(0, idx - n)
            after = idx + term_len
            end = min(text_len, after + n)
            # Resume after the term so it is not emitted a second time
            term_concordance.append(text[start:idx] + '<b class="match term0">' + term + '</b>' + text[after:end])
        return term_concordance

    def open(self):
        """ The Cheshire get_object line should throw an exception if it can't open passed db """
        try:
            self.db = self.server.get_object(self.session, self.databaseName)
            self.session.database = self.databaseName
        except Exception as e:
            # Best effort: report the failure and leave self.db unchanged
            print(e)
            print("openning database {} failed".format(self.databaseName))

    def create(self):
        """Create the database directory, reset its metadata file and run cheshire3-init."""
        if not os.path.exists(self.databasePath):
            os.makedirs(self.databasePath)
        # create cheshire metadata directory if needed, then initialize with empty list
        metadata_path = self.databasePath + self.cheshire_metadata_dir
        if not os.path.exists(metadata_path):
            os.makedirs(metadata_path)
        with open(metadata_path + '/' + self.databaseName, 'w') as f:
            json.dump({}, f)
        try:
            print("openning database {} to create".format(self.databasePath))
            # NOTE(review): the command line is built by string
            # concatenation; paths with shell metacharacters are unsafe.
            os.system("cheshire3-init " + self.databasePath + " --database=" + self.databaseName)
        except Exception as e:
            print(e)
#!/usr/bin/python import sys import os from cheshire3.baseObjects import Session from cheshire3.server import SimpleServer from cheshire3.internal import cheshire3Root # Build environment... session = Session() # a Session - used to store print cheshire3Root serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')) session.logger = serv.get_path(session, 'defaultLogger') # a logger db = serv.get_object(session, 'db_tdo_index') # the Database session.database = db.id #qf = db.get_object(session, 'defaultQueryFactory') def testVec(): recordStore = db.get_object(session, 'recordStore') rec = recordStore.fetch_record(session, 1) idx= db.get_object(session, 'idx-topic') vec = idx.fetch_vector(session, rec)
class Cheshire3Engine(BaseEngine):
    """Search-engine backend that delegates to a Cheshire3 database.

    The Session and SimpleServer are created once, when the class body is
    executed, and are shared by every instance.  ``database_name`` and
    ``database_path`` are read from the instance and are expected to be
    provided elsewhere (presumably by BaseEngine -- confirm).
    """
    #schema = Schema(title=TEXT(stored=True), path=TEXT(stored=True), href=ID(stored=True), cfiBase=TEXT(stored=True), spinePos=TEXT(stored=True), content=TEXT)
    #database = 'db_tdo_simple_sru'
    cheshire_metadata_dir = '/cheshire3-metadata'
    session = Session()
    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    server = SimpleServer(session, serverConfig)
    # Objects fetched lazily from the database (see query())
    queryFactory = None
    db = None
    titleSel = None
    anywhereSel = None
    proxExtractor = None

    def __initializeTitleSelector(self):
        """Fetch 'titleXPathSelector', falling back to 'titleSelector'."""
        try:
            self.titleSel = self.db.get_object(self.session, 'titleXPathSelector')
        except ObjectDoesNotExistException:
            try:
                self.titleSel = self.db.get_object(self.session, 'titleSelector')
            except ObjectDoesNotExistException as e:
                logging.error(e)

    def __initializeAnywhereSelector(self):
        """Fetch 'anywhereXPathSelector'; log the error if it is missing."""
        try:
            self.anywhereSel = self.db.get_object(self.session, 'anywhereXPathSelector')
        except ObjectDoesNotExistException as e:
            logging.error(e)

    def __initializeProximityExtractor(self):
        """Fetch 'ProxExtractor'; log the error if it is missing."""
        try:
            self.proxExtractor = self.db.get_object(self.session, 'ProxExtractor')
        except ObjectDoesNotExistException as e:
            logging.error(e)

    def __highlight(self, text, term, n):
        """Return one highlighted snippet per literal occurrence of term.

        Each snippet is up to ``n`` characters of preceding context, the
        term wrapped in ``<b class="match term0">``, and up to ``n``
        characters of following context.  An empty term yields no snippets.
        """
        term_concordance = list()
        text_len = len(text)
        term_len = len(term)
        if not term_len:
            return term_concordance
        # re.escape: the term is user text, not a regular expression
        for match in re.finditer(re.escape(term), text):
            idx = match.start()
            # Clamp at 0: a negative start would slice from the end of text
            start = max(0, idx - n)
            after = idx + term_len
            end = min(text_len, after + n)
            # Resume after the term so it is not emitted a second time
            term_concordance.append(text[start:idx] + '<b class="match term0">' + term + '</b>' + text[after:end])
        return term_concordance

    def open(self):
        """ The Cheshire get_object line should throw an exception if it can't open passed db """
        try:
            self.db = self.server.get_object(self.session, self.database_name)
            self.session.database = self.database_name
        except Exception as e:
            # Best effort: log the failure and leave self.db unchanged
            logging.error(e)
            logging.error("openning database {} failed".format(
                self.database_name))

    def create(self):
        """Create the database directory, reset its metadata file and run cheshire3-init."""
        if not os.path.exists(self.database_path):
            os.makedirs(self.database_path)
        # create cheshire metadata directory if needed, then initialize with empty list
        metadata_path = self.database_path + self.cheshire_metadata_dir
        if not os.path.exists(metadata_path):
            os.makedirs(metadata_path)
        with open(metadata_path + '/' + self.database_name, 'w') as f:
            json.dump({}, f)
        try:
            logging.info("openning database {} to create".format(
                self.database_path))
            # NOTE(review): the command line is built by string
            # concatenation; paths with shell metacharacters are unsafe.
            os.system("cheshire3-init " + self.database_path + " --database=" + self.database_name)
        except Exception as e:
            logging.error(e)

    def add(self, path='', href='', title='', cfiBase='', spinePos=''):
        """Index the document at ``path`` and record its reader metadata.

        The metadata is stored under ``href`` in the JSON file named after
        the database inside the metadata directory.
        """
        # first, index the document in cheshire3 using unix commands
        # NOTE(review): ``path`` is concatenated into a shell command.
        os.system("cheshire3-load --database=" + self.database_name + ' ' + path)
        doc_md = dict()
        doc_md[href] = {
            'path': path,
            'href': href,
            'title': title,
            'cfiBase': cfiBase,
            'spinePos': spinePos
        }
        # title is not populated, so pulling filename from path prefix
        #filename = path[:path.find('/')] + '.json'
        metadata_path = self.database_path + self.cheshire_metadata_dir
        # Read-modify-write of the whole metadata file
        with open(metadata_path + '/' + self.database_name) as f_in:
            md_dict = json.load(f_in)
        md_dict.update(doc_md)
        with open(metadata_path + '/' + self.database_name, 'w') as f_out:
            json.dump(md_dict, f_out)
        #print "Current Path for directory writing: " + os.getcwd()

    def finished(self):
        """ In Cheshire, there are no cleanup commands that are needed. The add command will index specified documents fully and end, so a finished command is not required. """
        pass

    def query(self, q, limit=None):
        """Run ``q`` against the database and return metadata dicts with highlights.

        In Cheshire3, you have to specify an index and query, else it
        defaults the all index which utilizes simple extraction.

        Each result is the stored metadata entry for the matching document
        with a ``'highlight'`` key holding the joined concordance snippets.
        ``limit`` is accepted but not used.
        """
        if self.queryFactory is None:
            self.queryFactory = self.db.get_object(self.session, 'defaultQueryFactory')
        if self.titleSel is None:
            self.__initializeTitleSelector()
        if self.anywhereSel is None:
            self.__initializeAnywhereSelector()
        if self.proxExtractor is None:
            self.__initializeProximityExtractor()
        c3Query = self.queryFactory.get_query(self.session, q)
        rs = self.db.search(self.session, c3Query)
        # open up the json file with reader specific attributes
        metadata_path = self.database_path + self.cheshire_metadata_dir
        with open(metadata_path + '/' + self.database_name) as f:
            db_md_dict = json.load(f)
        # loop through recordset, create new results list with dictionary of found values
        results = list()
        for rsi in rs:
            rec = rsi.fetch_record(self.session)
            # check the record titles
            titleData = self.titleSel.process_record(self.session, rec)
            # checking out the proximity attributes
            elems = self.anywhereSel.process_record(self.session, rec)
            doc_dict = self.proxExtractor.process_xpathResult(
                self.session, elems).values()[0]
            concordance = self.__highlight(doc_dict['text'], q, 20)
            # (a leftover pdb.set_trace() here halted every query; removed)
            # extracts document name key
            fn_key = os.path.basename(titleData[3][0])
            # append highlighted concordance to the dictionary
            db_md_dict[fn_key][u'highlight'] = " ".join(concordance)
            results.append(db_md_dict[fn_key])
        return results
elif (o == '--addsuperuser'): return addSuperUser() except UsageError as err: sys.stderr.write(str(err) + '\n') sys.stderr.write("for help use --help\n") sys.stderr.flush() return 2 except Error as e: lgr.log_lvl(session, 40, str(e)) if debug: raise return 1 # Build environment... session = Session() serv = SimpleServer( session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml' ) ) session.database = 'db_hubedit' db = serv.get_object(session, 'db_hubedit') lgr = db.get_path(session, 'defaultLogger') authStore = db.get_object(session, 'hubAuthStore') superAuthStore = db.get_object(session, 'hubSuperAuthStore') xmlp = db.get_object(session, 'LxmlParser')
import getpass
import os
import sys
import traceback

from lxml import etree
from crypt import crypt

import cheshire3
from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.internal import cheshire3Root
from cheshire3.document import StringDocument

# Module-level Cheshire3 environment for the 'db_dickens' database
session = Session()
serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
serv = SimpleServer(session, serverConfig)
db = serv.get_object(session, 'db_dickens')
# The session is bound to the database only after it has been loaded
session.database = 'db_dickens'
# Objects fetched from the database for use by the rest of the script
qf = db.get_object(session, 'defaultQueryFactory')
df = db.get_object(session, 'SimpleDocumentFactory')
concStore = db.get_object(session, 'concordanceStore')
authStore = db.get_object(session, 'authStore')
recStore = db.get_object(session, 'recordStore')