Exemple #1
0
def main(argv=None):
    """Serve the SRU and OAI-PMH WSGI applications over HTTP.

    Command line options are parsed from ``argv`` when it is given, otherwise
    from the arguments of the running process. The documentation and both API
    applications are mounted on a URL map which is then served with
    ``paste.httpserver``.
    """
    global argparser, c3_session, c3_server
    global sru_app, oaipmh_app  # WSGI Apps
    if argv is not None:
        args = argparser.parse_args(argv)
    else:
        args = argparser.parse_args()
    c3_session = Session()
    c3_server = SimpleServer(c3_session, args.serverconfig)
    # SRU application
    sru_configs = get_configsFromServer(c3_session, c3_server)
    sru_app = SRUWsgiHandler(c3_session, sru_configs)
    # OAI-PMH application
    dbs, oaipmh_configs = get_databasesAndConfigs(c3_session, c3_server)
    oaipmh_app = OAIPMHWsgiApplication(c3_session, oaipmh_configs, dbs)
    # Mount the static documentation and the applications
    urlmap = URLMap()
    for mount_point, app in (
            ('/docs',
             make_pkg_resources(None, 'cheshire3', 'docs/build/html')),
            ('/api/sru', sru_app),
            ('/api/oaipmh/2.0', oaipmh_app),
    ):
        urlmap[mount_point] = app
    url = "http://{0}:{1}/".format(args.hostname, args.port)
    if args.browser:
        webbrowser.open(url)
        print("Hopefully a new browser window/tab should have opened "
              "displaying the application.")
    paste.httpserver.serve(urlmap, host=args.hostname, port=args.port)
Exemple #2
0
 def __init__(self):
     """Open a Cheshire3 session on the 'db_dickens' database.

     Builds the server from ``configs/serverConfig.xml`` under
     ``cheshire3Root``, then fetches the database object and its default
     query factory.
     """
     config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     self.session = Session()
     self.session.database = 'db_dickens'
     self.serv = SimpleServer(self.session, config_path)
     self.db = self.serv.get_object(self.session, self.session.database)
     self.qf = self.db.get_object(self.session, 'defaultQueryFactory')
Exemple #3
0
 def setUp(self):
     """Create the session and server used by the tests."""
     config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     self.session = baseObjects.Session()
     self.server = SimpleServer(self.session, config_path)
     # Disable stdout logging by raising the default logger's minimum level
     logger = self.server.get_path(self.session, 'defaultLogger')
     logger.minLevel = 60
Exemple #4
0
def main(argv=None):
    """Unregister a Database from the Cheshire3 Server.

    Options are parsed from ``argv`` when given, otherwise from the arguments
    of the running process. Returns 0.
    """
    global argparser, session, server
    args = (argparser.parse_args() if argv is None
            else argparser.parse_args(argv))
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    # Ask the server to drop the Database identified on the command line
    server.unregister_databaseConfig(session, args.identifier)
    return 0
Exemple #5
0
def main(argv=None):
    """Load data into a Cheshire3 database based on parameters in argv.

    Args:
        argv: optional list of command line arguments; when None the
            arguments of the running process are parsed instead.

    Returns:
        1 if no database identifier was given and none could be identified
        from the current working directory, 2 if the identified database
        does not exist, otherwise None once every data argument has been
        loaded and indexed.

    Raises:
        MissingDependencyException: if loading a data argument needs a
            dependency that is not available.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e), not e.message: the ``message`` attribute is deprecated,
            # is empty for errno/strerror style errors and is absent on
            # Python 3
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        # Allow for multiple data arguments
        docFac = db.get_object(session, 'defaultDocumentFactory')
        for dataArg in args.data:
            try:
                docFac.load(session,
                            dataArg,
                            args.cache,
                            args.format,
                            args.tagname,
                            args.codec
                            )
            except MissingDependencyException as e:
                # Log the reason, then re-raise attributed to this script
                server.log_critical(session, e.reason)
                missingDependencies = e.dependencies
                raise MissingDependencyException('cheshire3-load script',
                                                 missingDependencies
                                                 )
            # Index what was just loaded before moving to the next argument
            wf = db.get_object(session, 'buildIndexWorkflow')
            wf.process(session, docFac)
 def setUp(self):
     """Create the session, server and the object that will be tested."""
     self.session = Session()
     config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     self.server = SimpleServer(self.session, config_path)
     # Make every dependency config known to the server under its id
     for dep_config in self._get_dependencyConfigs():
         self.server.subConfigs[dep_config.get('id')] = dep_config
     # Disable stdout logging by raising the default logger's minimum level
     logger = self.server.get_path(self.session, 'defaultLogger')
     logger.minLevel = 60
     # Build the object under test from its own config
     self.testObj = makeObjectFromDom(self.session,
                                      self._get_config(),
                                      self.server)
Exemple #7
0
 def __init__(self):
     """Set up the connection with Cheshire3.

     Opens a session on the 'db_dickens' database and fetches its default
     query factory, result set store and index store.
     """
     config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     self.session = Session()
     self.session.database = 'db_dickens'
     self.serv = SimpleServer(self.session, config_path)
     self.db = self.serv.get_object(self.session, self.session.database)
     fetch = self.db.get_object
     self.qf = fetch(self.session, 'defaultQueryFactory')
     self.resultSetStore = fetch(self.session, 'resultSetStore')
     self.idxStore = fetch(self.session, 'indexStore')
Exemple #8
0
def main(argv=None):
    """Register a Database configuration file with the Cheshire3 Server.

    Options are parsed from ``argv`` when given, otherwise from the arguments
    of the running process. Returns 0.
    """
    global argparser, session, server
    args = (argparser.parse_args() if argv is None
            else argparser.parse_args(argv))
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    # Expand '~' and resolve the config file to an absolute path
    expanded = os.path.expanduser(args.configfile)
    args.configfile = os.path.abspath(expanded)
    # Hand the config file over to the server for registration
    server.register_databaseConfigFile(session, args.configfile)
    return 0
Exemple #9
0
    def __init__(self):
        """Set up a Cheshire3 session/connection to the database.

        This initialisation does not handle the actual search term
        (cf. build_and_run_query); it only opens the 'db_dickens' database
        and fetches its query factory, result set store and index store.
        """
        config_path = os.path.join(cheshire3Root, 'configs',
                                   'serverConfig.xml')
        self.session = Session()
        self.session.database = 'db_dickens'
        self.serv = SimpleServer(self.session, config_path)
        self.db = self.serv.get_object(self.session, self.session.database)
        fetch = self.db.get_object
        self.qf = fetch(self.session, 'defaultQueryFactory')
        self.resultSetStore = fetch(self.session, 'resultSetStore')
        self.idxStore = fetch(self.session, 'indexStore')
Exemple #10
0
def main(argv=None):
    """Start up a simple app server to serve the SRU application.

    Args:
        argv: optional list of command line arguments; when None the
            arguments of the running process are parsed instead.

    Returns:
        None. When the server starts successfully this call blocks in
        ``serve_forever()``.
    """
    global argparser, session, server
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    application = SRUWsgiHandler()
    try:
        httpd = make_server(args.hostname, args.port, application)
    except socket.error as e:
        # Say why the server could not be started instead of printing an
        # empty line and exiting silently
        print("Unable to start server at http://{0}:{1} - {2}".format(
            args.hostname, args.port, e))
    else:
        # Single-argument print(...) behaves the same on Python 2 and 3
        print("""You will be able to access the application at:
    http://{0}:{1}""".format(args.hostname, args.port))
        httpd.serve_forever()
Exemple #11
0
def main(argv=None):
    """Search a Cheshire3 database based on query in argv.

    Args:
        argv: optional list of command line arguments; when None the
            arguments of the running process are parsed instead.

    Returns:
        1 if no database identifier was given and none could be identified
        from the current working directory, 2 if the identified database
        does not exist, otherwise whatever ``_format_resultSet`` returns for
        the result set of the search.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e), not e.message: the ``message`` attribute is deprecated,
            # is empty for errno/strerror style errors and is absent on
            # Python 3
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        # Build the query with the database's default factory, then search
        qFac = db.get_object(session, 'defaultQueryFactory')
        query = qFac.get_query(session, args.query, format=args.format)
        resultSet = db.search(session, query)
        return _format_resultSet(resultSet,
                                 maximumRecords=args.maxRecs,
                                 startRecord=args.startRec)
# Apache Config:
#<Directory /usr/local/apache2/htdocs/srw>
#  SetHandler mod_python
#  PythonDebug On
#  PythonPath "['/home/cheshire/c3/code', '/usr/local/lib/python2.3/lib-dynload']+sys.path"
#  PythonHandler srwApacheHandler
#</Directory>

# NB. SetHandler, not AddHandler.

# Root of the Cheshire3 installation; overridable through $C3HOME
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')

# Session and server are created once, when the module is imported
session = Session()
session.environment = "apache"
serv = SimpleServer(
    session,
    os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))

# configs maps the database URL of every SRW-enabled database to a dict of
# its SRW and SRW update protocol maps, keyed by protocol namespace
configs = {}
serv._cacheDatabases(session)
for db in serv.databases.values():
    if db.get_setting(session, 'SRW') or db.get_setting(session, 'srw'):
        db._cacheProtocolMaps(session)
        # Named srwMap/updateMap so that the builtin map() is not shadowed
        # for the rest of the module
        srwMap = db.protocolMaps.get('http://www.loc.gov/zing/srw/', None)
        if srwMap is None:
            # Flagged for SRW but without an SRW protocol map: there is no
            # database URL to publish it under, so skip it rather than fail
            # the whole module import with an AttributeError
            continue
        updateMap = db.protocolMaps.get(
            'http://www.loc.gov/zing/srw/update/', None)
        configs[srwMap.databaseUrl] = {
            'http://www.loc.gov/zing/srw/': srwMap,
            'http://www.loc.gov/zing/srw/update/': updateMap
        }

Exemple #13
0
# Module-level random number generator instance
rand = random.Random()

from PyZ3950 import CQLParser

# Register the SQL and CQL query OIDs with the asn1 module.
# NOTE(review): presumably this lets queries of these types be handled when
# they arrive over Z39.50 -- confirm against PyZ3950's asn1.register_oid
asn1.register_oid(Z3950_QUERY_SQL, SQLQuery)
asn1.register_oid(Z3950_QUERY_CQL, asn1.GeneralString)

from cheshire3.baseObjects import Session, Database, Transformer, Workflow
from cheshire3.server import SimpleServer
from cheshire3 import internal
from cheshire3 import cqlParser

# Session and server are created once, when the module is imported
session = Session()
session.environment = "apache"
server = SimpleServer(
    session,
    os.path.join(internal.cheshire3Root, 'configs', 'serverConfig.xml')
)
# configs: protocol map's databaseName -> protocol map
# dbmap: Cheshire3 database id -> protocol map's databaseName
configs = {}
dbmap = {}
server._cacheDatabases(session)
for db in server.databases.values():
    if not db.get_setting(session, "z3950"):
        # Database is not enabled for Z39.50
        continue
    db._cacheProtocolMaps(session)
    map1 = db.protocolMaps.get('http://www.loc.gov/z3950/', None)
    if not map1:
        # No Z39.50 protocol map available for this database
        continue
    configs[map1.databaseName] = map1
    dbmap[db.id] = map1.databaseName

# Attach the shared store, logger and configs to the session
session.resultSetStore = server.get_path(session, 'resultSetStore')
session.logger = server.get_path(session, 'z3950Logger')
session.configs = configs
Exemple #14
0
def main(argv=None):
    """Initialize a Cheshire 3 database based on parameters in argv.

    Creates a ``.cheshire3`` directory (with ``stores``, ``indexes`` and
    ``logs`` sub-directories) inside ``args.directory``, writes the default
    ZeeRex, database, selector, index and workflow configuration files into
    it and registers the database config file with the server.

    Args:
        argv: optional list of command line arguments; when None the
            arguments of the running process are parsed instead.

    Returns:
        0 on success.

    Raises:
        ValueError: if a database with the chosen identifier already exists.
        OSError: if one of the directories cannot be created for a reason
            other than it already existing as a directory.
    """
    global argparser, session, server, db
    import errno

    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        if args.directory.endswith(os.path.sep):
            args.directory = args.directory[:-1]
        # Find local database name to use as basis of database id
        dbid = "db_{0}".format(os.path.basename(args.directory))
        server.log_debug(session,
                         ("database identifier not specified, defaulting to: "
                          "{0}".format(dbid)))
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        # Doesn't exists, so OK to init it
        pass
    else:
        # TODO: check for --force ?
        msg = """database with id '{0}' has already been init'd. \
Please specify a different id using the --database option.""".format(dbid)
        server.log_critical(session, msg)
        raise ValueError(msg)

    # Create a .cheshire3 directory and populate it
    c3_dir = os.path.join(os.path.abspath(args.directory), '.cheshire3')
    for dir_path in [
            c3_dir,
            os.path.join(c3_dir, 'stores'),
            os.path.join(c3_dir, 'indexes'),
            os.path.join(c3_dir, 'logs')
    ]:
        try:
            os.makedirs(dir_path)
        except OSError as e:
            # Only an already existing directory is harmless; any other
            # failure (permissions, a file in the way, ...) must not be
            # reported as "already exists" and then silently ignored
            if e.errno != errno.EEXIST or not os.path.isdir(dir_path):
                raise
            server.log_warning(session,
                               "directory already exists {0}".format(dir_path))

    # Generate config file(s)
    xmlFilesToWrite = {}

    # Generate Protocol Map(s) (ZeeRex)
    zrx = create_defaultZeerex(dbid, args)
    zrxPath = os.path.join(c3_dir, 'zeerex_sru.xml')
    # Set before create_defaultConfig is called, since args is passed to it
    args.zeerexPath = zrxPath
    xmlFilesToWrite[zrxPath] = zrx

    # Generate generic database config
    dbConfig = create_defaultConfig(dbid, args)
    dbConfigPath = os.path.join(c3_dir, 'config.xml')

    # Generate config for generic selectors
    selectorConfig = create_defaultConfigSelectors()
    path = os.path.join(c3_dir, 'configSelectors.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = selectorConfig

    # Generate config for generic indexes
    indexConfig = create_defaultConfigIndexes()
    path = os.path.join(c3_dir, 'configIndexes.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = indexConfig

    # Generate config for default Workflows
    workflowConfig = create_defaultConfigWorkflows()
    path = os.path.join(c3_dir, 'configWorkflows.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = workflowConfig

    # Queue the database config only now, so that the node which gets
    # written is the one returned by the last include_configByPath call
    # NOTE(review): harmless if include_configByPath modifies in place
    xmlFilesToWrite[dbConfigPath] = dbConfig

    # Write configs to files
    for path, node in xmlFilesToWrite.items():
        # etree.tostring with an explicit encoding yields bytes, so the
        # file is opened in binary mode
        with open(path, 'wb') as conffh:
            conffh.write(
                etree.tostring(node, pretty_print=True, encoding="utf-8"))

    # Tell the server to register the config file
    server.register_databaseConfigFile(session, dbConfigPath)
    return 0
Exemple #15
0
def _get_rodsEnvSetting(env, getterName, attrName):
    """Return one setting of an iRODS environment object.

    Calls the accessor method named ``getterName``; if that raises
    AttributeError the plain attribute named ``attrName`` is read instead.
    """
    try:
        return getattr(env, getterName)()
    except AttributeError:
        return getattr(env, attrName)


def main(argv=None):
    """Load data into a Cheshire3 database based on parameters in argv.

    Data arguments that do not start with ``irods://`` are turned into iRODS
    URLs, using the user, host, port and working directory of the current
    iRODS environment.

    Args:
        argv: optional list of command line arguments; when None the
            arguments of the running process are parsed instead.

    Returns:
        1 if no database identifier was given and none could be identified
        from the current working directory, 2 if the identified database
        does not exist, otherwise None once every data argument has been
        loaded and indexed.

    Raises:
        MissingDependencyException: if the ``irods`` module is unavailable,
            or if loading a data argument needs a missing dependency.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    if irods is None:
        raise MissingDependencyException('icheshire3-load script',
                                         'irods (PyRods)'
                                         )
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e), not e.message: the ``message`` attribute is deprecated,
            # is empty for errno/strerror style errors and is absent on
            # Python 3
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid))
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        # Allow for multiple data arguments
        docFac = db.get_object(session, 'defaultDocumentFactory')
        for dataArg in args.data:
            if not dataArg.startswith('irods://'):
                # Not a full iRODS URL: build one from the current iRODS
                # environment
                status, myEnv = irods.getRodsEnv()
                host = _get_rodsEnvSetting(myEnv,
                                           'getRodsHost', 'rodsHost')
                port = _get_rodsEnvSetting(myEnv,
                                           'getRodsPort', 'rodsPort')
                username = _get_rodsEnvSetting(myEnv,
                                               'getRodsUserName',
                                               'rodsUserName')
                cwd = _get_rodsEnvSetting(myEnv,
                                          'getRodsCwd', 'rodsCwd')
                netloc = '{0}@{1}:{2}'.format(username, host, port)
                path = '/'.join([cwd, dataArg])
                parsed = SplitResult('irods', netloc, path, None, None)
                dataArg = urlunsplit(parsed)
            server.log_debug(session, dataArg)
            # Any format that is unset or does not start with 'i' is
            # replaced by 'irods'
            if args.format is None or not args.format.startswith('i'):
                fmt = 'irods'
            else:
                fmt = args.format
            server.log_debug(session, fmt)
            try:
                docFac.load(session, dataArg,
                            args.cache, fmt, args.tagname, args.codec)
            except MissingDependencyException as e:
                # Log the reason, then re-raise attributed to this script
                server.log_critical(session, e.reason)
                missingDependencies = e.dependencies
                raise MissingDependencyException('cheshire3-load script',
                                                 missingDependencies)
            # Index what was just loaded before moving to the next argument
            wf = db.get_object(session, 'buildIndexWorkflow')
            wf.process(session, docFac)
class Cheshire3Engine(BaseEngine):
    """Search engine implementation that indexes and queries via Cheshire3.

    Indexing is delegated to the ``cheshire3-init`` and ``cheshire3-load``
    command line tools. Per-document attributes (path, href, title, cfiBase,
    spinePos) are kept in a JSON file inside the database directory.

    NOTE(review): ``database_name`` and ``database_path`` are read but never
    set in this class -- presumably provided by BaseEngine; confirm.
    """
    #schema = Schema(title=TEXT(stored=True), path=TEXT(stored=True), href=ID(stored=True), cfiBase=TEXT(stored=True), spinePos=TEXT(stored=True), content=TEXT)
    #database = 'db_tdo_simple_sru'
    # Sub-directory (appended to database_path) holding the JSON metadata
    cheshire_metadata_dir = '/cheshire3-metadata'
    # Session and server are class attributes, created once at class
    # definition time and shared by every instance
    session = Session()
    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    server = SimpleServer(session, serverConfig)
    # Cheshire3 objects, fetched lazily by open() and query()
    queryFactory = None
    db = None
    titleSel = None
    anywhereSel = None
    proxExtractor = None

    def __initializeTitleSelector(self):
        """Fetch the title selector, trying two known object identifiers."""
        try:
            self.titleSel = self.db.get_object(self.session,
                                               'titleXPathSelector')
        except ObjectDoesNotExistException:
            try:
                self.titleSel = self.db.get_object(self.session,
                                                   'titleSelector')
            except ObjectDoesNotExistException as e:
                logging.error(e)

    def __initializeAnywhereSelector(self):
        """Fetch the 'anywhereXPathSelector' object, logging any failure."""
        try:
            self.anywhereSel = self.db.get_object(self.session,
                                                  'anywhereXPathSelector')
        except ObjectDoesNotExistException as e:
            logging.error(e)

    def __initializeProximityExtractor(self):
        """Fetch the 'ProxExtractor' object, logging any failure."""
        try:
            self.proxExtractor = self.db.get_object(self.session,
                                                    'ProxExtractor')
        except ObjectDoesNotExistException as e:
            logging.error(e)

    def __highlight(self, text, term, n):
        """Return a concordance snippet for each occurrence of term in text.

        Each snippet is made of up to ``n`` characters of context before the
        occurrence, the term wrapped in ``<b class="match term0">`` markup
        and up to ``n`` characters of context after it.

        Returns an empty list when ``term`` is empty or does not occur.
        """
        if not term:
            # An empty pattern would match at every position of the text
            return []
        term_len = len(term)
        term_concordance = []
        # The term is matched literally, so regex metacharacters are escaped
        for match in re.finditer(re.escape(term), text):
            idx = match.start()
            # Clamp at 0: a negative start would slice from the end of text
            start = max(0, idx - n)
            # Trailing context starts after the term so it is not repeated;
            # slicing past the end of text is safe
            after = idx + term_len
            term_concordance.append(text[start:idx] +
                                    '<b class="match term0">' + term + '</b>' +
                                    text[after:after + n])

        return term_concordance

    def open(self):
        """Fetch the database object and select it on the session.

        The Cheshire get_object line should throw an exception if it can't
        open the passed db; any exception is logged rather than propagated.
        """
        try:
            self.db = self.server.get_object(self.session, self.database_name)
            self.session.database = self.database_name
        except Exception as e:
            logging.error(e)
            logging.error("openning database {} failed".format(
                self.database_name))

    def create(self):
        """Create the database directory, metadata file and Cheshire3 db."""
        if not os.path.exists(self.database_path):
            os.makedirs(self.database_path)

        # create cheshire metadata directory if needed, then initialize the
        # metadata file with an empty JSON object
        metadata_path = self.database_path + self.cheshire_metadata_dir
        if not os.path.exists(metadata_path):
            os.makedirs(metadata_path)
        with open(metadata_path + '/' + self.database_name, 'w') as f:
            json.dump({}, f)

        try:
            logging.info("openning database {} to create".format(
                self.database_path))
            # NOTE(review): the command line is built by string concatenation
            # and run through the shell; paths or names with spaces or shell
            # metacharacters will break or be interpreted by the shell
            os.system("cheshire3-init " + self.database_path + " --database=" +
                      self.database_name)
        except Exception as e:
            logging.error(e)

    def add(self, path='', href='', title='', cfiBase='', spinePos=''):
        """Index the document at path and record its attributes under href."""
        # first, index the document in cheshire3 using unix commands
        # NOTE(review): same shell string concatenation caveat as in create()
        os.system("cheshire3-load --database=" + self.database_name + ' ' +
                  path)

        doc_md = dict()
        doc_md[href] = {
            'path': path,
            'href': href,
            'title': title,
            'cfiBase': cfiBase,
            'spinePos': spinePos
        }
        # Merge the new entry into the stored metadata and write it back
        metadata_path = self.database_path + self.cheshire_metadata_dir
        with open(metadata_path + '/' + self.database_name) as f_in:
            md_dict = json.load(f_in)

        md_dict.update(doc_md)

        with open(metadata_path + '/' + self.database_name, 'w') as f_out:
            json.dump(md_dict, f_out)

    def finished(self):
        """ In Cheshire, there are no cleanup commands that are needed.  The add command
            will index specified documents fully and end, so a finished command is not required.
        """
        pass

    def query(self, q, limit=None):
        """Search the database for q and return the matching metadata entries.

        In Cheshire3, you have to specify an index and query, else it
        defaults to the all index, which utilizes simple extraction.

        Args:
            q: query string handed to the default query factory; it is also
                the literal term highlighted in the concordance.
            limit: accepted but currently not used.

        Returns:
            A list with one stored metadata dict per result, each extended
            with a 'highlight' entry holding the joined concordance snippets.
        """

        if self.queryFactory is None:
            self.queryFactory = self.db.get_object(self.session,
                                                   'defaultQueryFactory')

        if self.titleSel is None:
            self.__initializeTitleSelector()

        if self.anywhereSel is None:
            self.__initializeAnywhereSelector()

        if self.proxExtractor is None:
            self.__initializeProximityExtractor()

        c3Query = self.queryFactory.get_query(self.session, q)
        rs = self.db.search(self.session, c3Query)

        # open up the json file with reader specific attributes
        metadata_path = self.database_path + self.cheshire_metadata_dir
        with open(metadata_path + '/' + self.database_name) as f:
            db_md_dict = json.load(f)

        # loop through recordset, create new results list with dictionary of found values
        results = list()
        for rsi in rs:
            rec = rsi.fetch_record(self.session)
            # check the record titles
            titleData = self.titleSel.process_record(self.session, rec)
            # checking out the proximity attributes
            elems = self.anywhereSel.process_record(self.session, rec)
            # list(...) so that this works whether values() returns a list
            # or a view
            doc_dict = list(self.proxExtractor.process_xpathResult(
                self.session, elems).values())[0]
            concordance = self.__highlight(doc_dict['text'], q, 20)
            # extracts document name key
            # NOTE(review): assumes titleData[3][0] holds the document's
            # file name -- confirm against the title selector's output
            fn_key = os.path.basename(titleData[3][0])
            # append highlighted concordance to the dictionary
            db_md_dict[fn_key][u'highlight'] = "  ".join(concordance)
            results.append(db_md_dict[fn_key])
        return results
class Cheshire3Engine(BaseEngine):
    """Search engine implementation that indexes and queries via Cheshire3.

    NOTE(review): ``databaseName`` and ``databasePath`` are read but never
    set in this class -- presumably provided by BaseEngine; confirm.
    """
    #schema = Schema(title=TEXT(stored=True), path=TEXT(stored=True), href=ID(stored=True), cfiBase=TEXT(stored=True), spinePos=TEXT(stored=True), content=TEXT)
    #database = 'db_tdo_simple_sru'
    # Sub-directory (appended to databasePath) holding the JSON metadata
    cheshire_metadata_dir = '/cheshire3-metadata'
    # Session and server are class attributes, created once at class
    # definition time and shared by every instance
    session = Session()
    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    server = SimpleServer(session, serverConfig)
    # Cheshire3 objects, fetched lazily
    queryFactory = None
    db = None
    titleSel = None
    anywhereSel = None
    proxExtractor = None

    def __initializeTitleSelector(self):
        """Fetch the title selector, trying two known object identifiers."""
        try:
            self.titleSel = self.db.get_object(self.session,
                                               'titleXPathSelector')
        except ObjectDoesNotExistException:
            try:
                self.titleSel = self.db.get_object(self.session,
                                                   'titleSelector')
            except ObjectDoesNotExistException as e:
                print(e)

    def __initializeAnywhereSelector(self):
        """Fetch the 'anywhereXPathSelector' object, printing any failure."""
        try:
            self.anywhereSel = self.db.get_object(self.session,
                                                  'anywhereXPathSelector')
        except ObjectDoesNotExistException as e:
            print(e)

    def __initializeProximityExtractor(self):
        """Fetch the 'ProxExtractor' object, printing any failure."""
        try:
            self.proxExtractor = self.db.get_object(self.session,
                                                    'ProxExtractor')
        except ObjectDoesNotExistException as e:
            print(e)

    def __highlight(self, text, term, n):
        """Return a concordance snippet for each occurrence of term in text.

        Each snippet is made of up to ``n`` characters of context before the
        occurrence, the term wrapped in ``<b class="match term0">`` markup
        and up to ``n`` characters of context after it.

        Returns an empty list when ``term`` is empty or does not occur.
        """
        if not term:
            # An empty pattern would match at every position of the text
            return []
        term_len = len(term)
        term_concordance = []
        # The term is matched literally, so regex metacharacters are escaped
        for match in re.finditer(re.escape(term), text):
            idx = match.start()
            # Clamp at 0: a negative start would slice from the end of text
            start = max(0, idx - n)
            # Trailing context starts after the term so it is not repeated;
            # slicing past the end of text is safe
            after = idx + term_len
            term_concordance.append(text[start:idx] +
                                    '<b class="match term0">' + term + '</b>' +
                                    text[after:after + n])

        return term_concordance

    def open(self):
        """Fetch the database object and select it on the session.

        The Cheshire get_object line should throw an exception if it can't
        open the passed db; any exception is printed rather than propagated.
        """
        try:
            self.db = self.server.get_object(self.session, self.databaseName)
            self.session.database = self.databaseName
        except Exception as e:
            # Single-argument print(...) behaves the same on Python 2 and 3
            print(e)
            print("openning database {} failed".format(self.databaseName))

    def create(self):
        """Create the database directory, metadata file and Cheshire3 db."""
        if not os.path.exists(self.databasePath):
            os.makedirs(self.databasePath)

        # create cheshire metadata directory if needed, then initialize the
        # metadata file with an empty JSON object
        metadata_path = self.databasePath + self.cheshire_metadata_dir
        if not os.path.exists(metadata_path):
            os.makedirs(metadata_path)
        with open(metadata_path + '/' + self.databaseName, 'w') as f:
            json.dump({}, f)

        try:
            print("openning database {} to create".format(self.databasePath))
            # NOTE(review): the command line is built by string concatenation
            # and run through the shell; paths or names with spaces or shell
            # metacharacters will break or be interpreted by the shell
            os.system("cheshire3-init " + self.databasePath + " --database=" +
                      self.databaseName)
        except Exception as e:
            print(e)
Exemple #18
0
#!/bin/env python

import os, sys

import cheshire3
from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.internal import cheshire3Root
from cheshire3.document import StringDocument

from lxml import etree

# Launch a Cheshire session: build the server from serverConfig.xml in the
# 'configs' directory under cheshire3Root
session = Session()
serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
serv = SimpleServer(session, serverConfig)

# Grab our objects: the 'db_dickens' database, plus the 'LxmlParser' and
# 'quoteExcludeSpanSelector' objects configured on it
db = serv.get_object(session, 'db_dickens')
xmlp = db.get_object(session, 'LxmlParser')
excl = db.get_object(session, 'quoteExcludeSpanSelector')

if '--exclude' in sys.argv:
    
    data = """<div>
<p type="speech" id="BH.c6.p114">
            <s id="BH.c6.s340">
                          <qs/>
                                    "My dear Miss Summerson,"
                          <qe/>
                          said Richard in a whisper,