Esempio n. 1
0
def build_architecture(data=None):
    """(Re)build the module-level Cheshire3 objects for the 'db_ead' database.

    A new Session replaces the global one; the user attached to the previous
    session is carried over when that session is truthy, otherwise the new
    session's user is None.

    :param data: unused; accepted so the function can also be run as a
        clean-up callback, in which case it is always None.
    """
    global session, serv, db, dbPath, docParser
    global fullTxr, fullSplitTxr
    global ppFlow
    global rebuild

    # Session: remember the current user before the session is replaced
    previous_user = session.user if session else None
    session = Session()
    session.database = 'db_ead'
    session.environment = 'apache'
    session.user = previous_user

    # Server, database and parser
    server_config = os.path.join(cheshirePath, 'cheshire3', 'configs',
                                 'serverConfig.xml')
    serv = SimpleServer(session, server_config)
    db = serv.get_object(session, 'db_ead')
    dbPath = db.get_path(session, 'defaultPath')
    docParser = db.get_object(session, 'LxmlParser')

    # Transformers
    fullTxr = db.get_object(session, 'htmlFullTxr')
    fullSplitTxr = db.get_object(session, 'htmlFullSplitTxr')

    # Workflows
    ppFlow = db.get_object(session, 'preParserWorkflow')
    ppFlow.load_cache(session, db)

    rebuild = False
    
def build_architecture(data=None):
    """Build the module-level Cheshire3 objects for the 'db_hubedit' database.

    Creates a new Session and SimpleServer, then looks up the database, its
    default path, the stores and the XML parser, publishing each of them as a
    module global.  ``rebuild`` is reset to False at the end.

    :param data: not used by this function.
    """
    global rebuild, session, serv, db, dbPath
    global editStore, authStore, instStore, userStore, xmlp
    global docStoreConfigStore

    database_id = 'db_hubedit'

    session = Session()
    session.database = database_id
    session.environment = 'apache'
    # session.user is not assigned here

    config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    serv = SimpleServer(session, config_path)
    db = serv.get_object(session, database_id)
    dbPath = db.get_path(session, 'defaultPath')

    # Stores
    editStore = db.get_object(session, 'editingStore')
    userStore = db.get_object(session, 'hubAuthStore')
    instStore = db.get_object(session, 'institutionStore')
    docStoreConfigStore = db.get_object(session, 'documentStoreConfigStore')
    authStore = db.get_object(session, 'adminAuthStore')

    # Parser
    xmlp = db.get_object(session, 'LxmlParser')

    rebuild = False
Esempio n. 3
0
 def __init__(self):
     """Open a Cheshire3 session on the 'db_dickens' database.

     Stores the session, the server, the database and the database's
     default query factory on the instance.
     """
     self.session = session = Session()
     session.database = 'db_dickens'
     config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     self.serv = SimpleServer(session, config_path)
     self.db = self.serv.get_object(session, session.database)
     self.qf = self.db.get_object(session, 'defaultQueryFactory')
Esempio n. 4
0
 def setUp(self):
     """Create the Session and SimpleServer used by each test."""
     self.session = baseObjects.Session()
     self.server = SimpleServer(
         self.session,
         os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     )
     # Raise the default logger's minimum level to disable stdout logging
     default_logger = self.server.get_path(self.session, 'defaultLogger')
     default_logger.minLevel = 60
Esempio n. 5
0
 def __init__(self, session, logger):
     """Attach to the 'db_dickens' database using the caller's session.

     Note that ``session.database`` is overwritten with 'db_dickens'.

     :param session: session object used for every Cheshire3 call.
     :param logger: object stored on the instance as ``self.logger``.
     """
     self.session = session
     session.database = 'db_dickens'
     config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     server = SimpleServer(session, config_path)
     database = server.get_object(session, session.database)
     self.db = database
     self.concStore = database.get_object(session, 'concordanceStore')
     self.collStore = database.get_object(session, 'collocateStore')
     self.idxStore = database.get_object(session, 'indexStore')
     self.logger = logger
Esempio n. 6
0
def main(argv=None):
    """Unregister a Database from the Cheshire3 Server.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    :returns: 0
    """
    global argparser, session, server
    args = (argparser.parse_args() if argv is None
            else argparser.parse_args(argv))
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    # Ask the server to drop the Database with the given identifier
    server.unregister_databaseConfig(session, args.identifier)
    return 0
Esempio n. 7
0
def main(argv=None):
    """Register a Database configuration file with the Cheshire3 Server.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    :returns: 0
    """
    global argparser, session, server
    args = (argparser.parse_args() if argv is None
            else argparser.parse_args(argv))
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    # Expand "~" and make the config file path absolute before registering
    expanded = os.path.expanduser(args.configfile)
    args.configfile = os.path.abspath(expanded)
    server.register_databaseConfigFile(session, args.configfile)
    return 0
Esempio n. 8
0
def main(argv=None):
    """Register a Database configuration file with the Cheshire3 Server.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    :returns: 0
    """
    global argparser, session, server
    args = (argparser.parse_args() if argv is None
            else argparser.parse_args(argv))
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    # Expand "~" and make the config file path absolute before registering
    expanded = os.path.expanduser(args.configfile)
    args.configfile = os.path.abspath(expanded)
    server.register_databaseConfigFile(session, args.configfile)
    return 0
Esempio n. 9
0
 def __init__(self):
     '''
     Set up the connection with Cheshire3.

     Stores the session, server and 'db_dickens' database on the instance,
     together with the database's default query factory, result set store
     and index store.
     '''
     self.session = session = Session()
     session.database = 'db_dickens'
     config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     self.serv = SimpleServer(session, config_path)
     self.db = database = self.serv.get_object(session, session.database)
     self.qf = database.get_object(session, 'defaultQueryFactory')
     self.resultSetStore = database.get_object(session, 'resultSetStore')
     self.idxStore = database.get_object(session, 'indexStore')
Esempio n. 10
0
 def setUp(self):
     """Create the Session, the SimpleServer and the object under test."""
     self.session = Session()
     self.server = SimpleServer(
         self.session,
         os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     )
     # Register the configs of every object the tested object depends on
     for dependency in self._get_dependencyConfigs():
         self.server.subConfigs[dependency.get('id')] = dependency
     # Raise the default logger's minimum level to disable stdout logging
     default_logger = self.server.get_path(self.session, 'defaultLogger')
     default_logger.minLevel = 60
     # Build the object that will be tested from its config
     self.testObj = makeObjectFromDom(self.session,
                                      self._get_config(),
                                      self.server)
Esempio n. 11
0
def main(argv=None):
    """Set up the Cheshire3 environment and dispatch to ``args.func``.

    Creates the global Session and Server, resolves the Database (from the
    ``--database`` option or, when that is absent, by identifying it from the
    current working directory) and stores the database's default logger in
    the global ``lgr``.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    :returns: 1 if no database could be identified, 2 if the identified
        database does not exist, otherwise the result of ``args.func(args)``.
    """
    global argparser, lockfilepath, lgr
    global session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e), not e.message: the ``message`` attribute does not exist
            # on Python 3 exceptions
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid))
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        lgr = db.get_path(session, 'defaultLogger')
    return args.func(args)
Esempio n. 12
0
def main(argv=None):
    """Search a Cheshire3 database based on query in argv.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    :returns: 1 if no database could be identified from the current working
        directory, 2 if the identified database does not exist, otherwise
        the result of ``_format_resultSet`` for the search.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e), not e.message: the ``message`` attribute does not exist
            # on Python 3 exceptions
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(
            dbid
        )
        server.log_critical(session, msg)
        return 2
    else:
        qFac = db.get_object(session, "defaultQueryFactory")
        query = qFac.get_query(session, args.query, format=args.format)
        resultSet = db.search(session, query)
        return _format_resultSet(
            resultSet,
            maximumRecords=args.maxRecs,
            startRecord=args.startRec
        )
Esempio n. 13
0
    def __init__(self):
        '''
        Open a cheshire3 session/connection to the 'db_dickens' database.

        Only the connection objects are created here; the actual search term
        is handled elsewhere (cf. build_and_run_query).
        '''
        self.session = session = Session()
        session.database = 'db_dickens'
        config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
        self.serv = SimpleServer(session, config_path)
        self.db = database = self.serv.get_object(session, session.database)
        self.qf = database.get_object(session, 'defaultQueryFactory')
        self.resultSetStore = database.get_object(session, 'resultSetStore')
        self.idxStore = database.get_object(session, 'indexStore')
Esempio n. 14
0
def build_architecture(data=None):
    """Build the module-level Cheshire3 objects for 'db_' + databaseName.

    Publishes the session, server, database, query factory, XML parser,
    record store and the two transformers as module globals.

    :param data: not used by this function.
    """
    global session, serv, db, qf, xmlp
    global recordStore, sentenceStore, paragraphStore, resultSetStore
    global articleTransformer, kwicTransformer

    session = Session()
    session.environment = 'apache'
    session.user = None

    config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    serv = SimpleServer(session, config_path)

    # The database identifier is attached to the session after the server
    # has been created
    session.database = 'db_' + databaseName
    db = serv.get_object(session, session.database)

    qf = db.get_object(session, 'defaultQueryFactory')
    xmlp = db.get_object(session, 'LxmlParser')
    recordStore = db.get_object(session, 'recordStore')
    articleTransformer = db.get_object(session, 'article-Txr')
    kwicTransformer = db.get_object(session, 'kwic-Txr')
Esempio n. 15
0
def main(argv=None):
    """Serve the SRU and OAI-PMH applications with ``paste.httpserver``.

    The documentation is mounted at ``/docs``, SRU at ``/api/sru`` and
    OAI-PMH at ``/api/oaipmh/2.0``.  When ``args.browser`` is set, a web
    browser is pointed at the server's root URL before serving starts.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    """
    global argparser, c3_session, c3_server
    global sru_app, oaipmh_app  # WSGI Apps
    args = (argparser.parse_args() if argv is None
            else argparser.parse_args(argv))

    c3_session = Session()
    c3_server = SimpleServer(c3_session, args.serverconfig)

    # SRU application
    sru_app = SRUWsgiHandler(
        c3_session,
        get_configsFromServer(c3_session, c3_server)
    )

    # OAI-PMH application
    dbs, oaipmh_configs = get_databasesAndConfigs(c3_session, c3_server)
    oaipmh_app = OAIPMHWsgiApplication(c3_session, oaipmh_configs, dbs)

    # Mount the applications and the static documentation
    urlmap = URLMap()
    urlmap['/docs'] = make_pkg_resources(None, 'cheshire3', 'docs/build/html')
    urlmap['/api/sru'] = sru_app
    urlmap['/api/oaipmh/2.0'] = oaipmh_app

    if args.browser:
        webbrowser.open("http://{0}:{1}/".format(args.hostname, args.port))
        print("Hopefully a new browser window/tab should have opened "
              "displaying the application.")

    paste.httpserver.serve(urlmap, host=args.hostname, port=args.port)
Esempio n. 16
0
 def setUp(self):
     """Create the Session and SimpleServer used by each test."""
     self.session = baseObjects.Session()
     self.server = SimpleServer(
         self.session,
         os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     )
     # Raise the default logger's minimum level to disable stdout logging
     default_logger = self.server.get_path(self.session, 'defaultLogger')
     default_logger.minLevel = 60
Esempio n. 17
0
def build_architecture(data=None):
    """Build the module-level Cheshire3 objects for 'db_' + databaseName.

    Publishes the session, server, database, query factory, XML parser,
    record/result-set/index stores, the two extractors and the two
    transformers as module globals.

    :param data: not used by this function.
    """
    global session, serv, db, qf, xmlp
    global recordStore, resultSetStore, idxStore
    global articleTransformer, kwicTransformer
    global proxExtractor, simpleExtractor
    global adf, fimi2, rule, arm, vecTxr, vectorStore, armTableTxr

    session = Session()
    session.environment = 'apache'
    session.user = None

    config_path = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    serv = SimpleServer(session, config_path)

    # The database identifier is attached to the session after the server
    # has been created
    session.database = 'db_' + databaseName
    db = serv.get_object(session, session.database)

    qf = db.get_object(session, 'defaultQueryFactory')
    xmlp = db.get_object(session, 'LxmlParser')
    recordStore = db.get_object(session, 'recordStore')
    resultSetStore = db.get_object(session, 'resultSetStore')

    # Extractors
    simpleExtractor = db.get_object(session, 'SimpleExtractor')
    proxExtractor = db.get_object(session, 'ProxExtractor')

    # Transformers
    articleTransformer = db.get_object(session, 'article-Txr')
    kwicTransformer = db.get_object(session, 'kwic-Txr')

    idxStore = db.get_object(session, 'indexStore')
Esempio n. 18
0
class Cheshire3ObjectTestCase(unittest.TestCase):
    u"""Abstract Base Class for Cheshire3 Test Cases.

    Nearly every Cheshire3 object needs a Session, and a server to act as
    its parent, so both are created in setUp before each test.
    """

    @classmethod
    def _get_class(cls):
        """Return the class of the object to test."""
        return C3Object

    def _get_config(self):
        """Return a parsed <subConfig> for the object to be tested."""
        target = self._get_class()
        template = '''
        <subConfig id="{0.__name__}">
            <objectType>{0.__module__}.{0.__name__}</objectType>
        </subConfig>
        '''
        return etree.XML(template.format(target))

    def _get_dependencyConfigs(self):
        """Generate configs for objects on which the tested object depends.

        e.g. an Index may depend on an IndexStore for storage, and on
        Selectors, Extractors etc.  This base implementation is a generator
        that yields nothing.
        """
        return
        yield

    def setUp(self):
        """Create the Session, the SimpleServer and the object under test."""
        self.session = Session()
        self.server = SimpleServer(
            self.session,
            os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
        )
        # Register the configs of every object the tested object depends on
        for dependency in self._get_dependencyConfigs():
            self.server.subConfigs[dependency.get('id')] = dependency
        # Raise the default logger's minimum level to disable stdout logging
        default_logger = self.server.get_path(self.session, 'defaultLogger')
        default_logger.minLevel = 60
        # Build the object that will be tested from its config
        self.testObj = makeObjectFromDom(self.session,
                                         self._get_config(),
                                         self.server)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_serverInstance(self):
        "Check test case's SimpleServer instance."
        self.assertIsInstance(self.server, SimpleServer)

    def test_instance(self):
        "Check that the test object is an instance of the expected class."
        expected = self._get_class()
        self.assertIsInstance(self.testObj, expected)
Esempio n. 19
0
class Cheshire3ObjectTestCase(unittest.TestCase):
    u"""Abstract Base Class for Cheshire3 Test Cases.

    Nearly every Cheshire3 object needs a Session, and a server to act as
    its parent, so both are created in setUp before each test.
    """

    @classmethod
    def _get_class(cls):
        """Return the class of the object to test."""
        return C3Object

    def _get_config(self):
        """Return a parsed <subConfig> for the object to be tested."""
        target = self._get_class()
        template = '''
        <subConfig id="{0.__name__}">
            <objectType>{0.__module__}.{0.__name__}</objectType>
        </subConfig>
        '''
        return etree.XML(template.format(target))

    def _get_dependencyConfigs(self):
        """Generate configs for objects on which the tested object depends.

        e.g. an Index may depend on an IndexStore for storage, and on
        Selectors, Extractors etc.  This base implementation is a generator
        that yields nothing.
        """
        return
        yield

    def setUp(self):
        """Create the Session, the SimpleServer and the object under test."""
        self.session = Session()
        self.server = SimpleServer(
            self.session,
            os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
        )
        # Register the configs of every object the tested object depends on
        for dependency in self._get_dependencyConfigs():
            self.server.subConfigs[dependency.get('id')] = dependency
        # Raise the default logger's minimum level to disable stdout logging
        default_logger = self.server.get_path(self.session, 'defaultLogger')
        default_logger.minLevel = 60
        # Build the object that will be tested from its config
        self.testObj = makeObjectFromDom(self.session,
                                         self._get_config(),
                                         self.server)

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_serverInstance(self):
        "Check test case's SimpleServer instance."
        self.assertIsInstance(self.server, SimpleServer)

    def test_instance(self):
        "Check that the test object is an instance of the expected class."
        expected = self._get_class()
        self.assertIsInstance(self.testObj, expected)
Esempio n. 20
0
 def setUp(self):
     """Create the Session, the SimpleServer and the object under test."""
     self.session = Session()
     self.server = SimpleServer(
         self.session,
         os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
     )
     # Register the configs of every object the tested object depends on
     for dependency in self._get_dependencyConfigs():
         self.server.subConfigs[dependency.get('id')] = dependency
     # Raise the default logger's minimum level to disable stdout logging
     default_logger = self.server.get_path(self.session, 'defaultLogger')
     default_logger.minLevel = 60
     # Build the object that will be tested from its config
     self.testObj = makeObjectFromDom(self.session,
                                      self._get_config(),
                                      self.server)
Esempio n. 21
0
def main(argv=None):
    """Start up a simple app server to serve the SRU application.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    :returns: 1 if the HTTP server could not be created because of a socket
        error; otherwise the call blocks in ``serve_forever()``.
    """
    global argparser, session, server
    import sys

    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    application = SRUWsgiHandler()
    try:
        httpd = make_server(args.hostname, args.port, application)
    except socket.error as e:
        # Report why the server could not start instead of printing an
        # empty line, and signal the failure to the caller
        sys.stderr.write(
            "Unable to start server on {0}:{1}: {2}\n".format(
                args.hostname, args.port, e
            )
        )
        return 1
    else:
        # Call-form print with a single argument is valid on both Python 2
        # and Python 3
        print("""You will be able to access the application at:
    http://{0}:{1}""".format(args.hostname, args.port))
        httpd.serve_forever()
Esempio n. 22
0
def main(argv=None):
    """Search a Cheshire3 database based on query in argv.

    :param argv: argument list to parse; when None, ``parse_args()`` is
        called without arguments.
    :returns: 1 if no database could be identified from the current working
        directory, 2 if the identified database does not exist, otherwise
        the result of ``_format_resultSet`` for the search.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e), not e.message: the ``message`` attribute does not exist
            # on Python 3 exceptions
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        qFac = db.get_object(session, 'defaultQueryFactory')
        query = qFac.get_query(session, args.query, format=args.format)
        resultSet = db.search(session, query)
        return _format_resultSet(resultSet,
                                 maximumRecords=args.maxRecs,
                                 startRecord=args.startRec)
Esempio n. 23
0
def getCheshire3Env(args):
    """Init and return Cheshire3 Session, Server and Database.

    Initialize Cheshire3 Session, Server and Database objects based on
    ``args``.  The database's default logger is attached to the session as
    ``session.logger``.

    :param args: object providing ``serverconfig`` and ``database``
        attributes; when ``database`` is None the database is identified
        from the current working directory.
    :returns: tuple ``(session, server, db)``.
    :raises EnvironmentError: re-raised, after logging, when no database
        can be identified.
    :raises ObjectDoesNotExistException: re-raised, after logging, when the
        identified database does not exist.
    """
    # Create a Session
    session = Session()
    # Get the Server based on given serverConfig file
    server = SimpleServer(session, args.serverconfig)
    # Try to get the Database
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e), not e.message: the ``message`` attribute does not exist
            # on Python 3 exceptions
            server.log_critical(session, str(e))
            raise
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database
    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        raise
    else:
        # Attach a default Logger to the Session
        session.logger = db.get_path(session, 'defaultLogger')

    return session, server, db
Esempio n. 24
0
class Concordance(object):
    '''
    Build concordance lines from a Cheshire3 database.

    Takes terms, an index name, a selection of books/subcorpora and a search
    type, and returns a list holding, for every hit, the words to the left
    of the search term, the term itself, the words to the right (up to ten
    on each side), and location information.

    This can be used in an ajax api.
    '''

    def __init__(self):
        '''
        Set up a cheshire3 session/connection to the database. This
        initialisation does not handle the actual search term
        (cf. build_and_run_query).
        '''
        self.session = Session()
        self.session.database = 'db_dickens'
        self.serv = SimpleServer(
            self.session,
            os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
        )
        self.db = self.serv.get_object(self.session, self.session.database)
        self.qf = self.db.get_object(self.session, 'defaultQueryFactory')
        self.resultSetStore = self.db.get_object(self.session, 'resultSetStore')
        self.idxStore = self.db.get_object(self.session, 'indexStore')

    def build_and_run_query(self, terms, idxName, Materials, selectWords):
        '''
        Build a cheshire query and run it.

        :param terms: search string; treated as one phrase when
            ``selectWords`` is "whole", otherwise split on whitespace into
            separate terms.
        :param idxName: name of the index the terms are searched in.
        :param Materials: iterable of book or subcorpus identifiers;
            'dickens' and 'ntc' are searched through 'subCorpus-idx', every
            other value through 'book-idx'.
        :param selectWords: "whole" for a phrase search; anything else
            searches the individual words.
        :returns: tuple of the result set and the number of search terms
            (words in the phrase for a "whole" search, otherwise 1).
        '''
        subcorpus = []
        for corpus in Materials:
            # ntc is 19th century?
            if corpus in ['dickens', 'ntc']:
                material_index = 'subCorpus-idx'
            else:
                material_index = 'book-idx'
            subcorpus.append('c3.{0} = "{1}"'.format(material_index, corpus))

        # search whole phrase or individual words?
        if selectWords == "whole":
            # for historic purposes: number_of_search_terms was originally
            # nodeLength
            number_of_search_terms = len(terms.split())
            terms = [terms]
        else:
            # FIXME is this correct in case of an AND search?
            number_of_search_terms = 1
            terms = terms.split()

        term_clauses = ['c3.{0} = "{1}"'.format(idxName, term)
                        for term in terms]

        # conduct database search
        # note: /proxInfo needed to search individual books
        # NOTE(review): the two 'or' groups are joined without parentheses;
        # confirm that the CQL parser's operator precedence gives the
        # intended grouping when there are several terms.
        query = self.qf.get_query(
            self.session,
            ' or '.join(subcorpus) + ' and/proxInfo ' + ' or '.join(term_clauses)
        )
        result_set = self.db.search(self.session, query)

        return result_set, number_of_search_terms

    def _context_token(self, rec, position):
        '''
        Return the text of the <w> node at 1-based ``position`` in ``rec``,
        wrapped in the text of its nearest preceding and following <n>
        siblings (an empty string is used when such a sibling lookup fails).
        '''
        word_path = '/div/descendant::w[%d]' % position
        try:
            before = rec.process_xpath(
                self.session, word_path + '/preceding-sibling::n[1]')[0].text
        except Exception:
            # lookup failed, e.g. [0] on a result without any match
            before = ''
        word = rec.process_xpath(self.session, word_path)[0].text
        try:
            after = rec.process_xpath(
                self.session, word_path + '/following-sibling::n[1]')[0].text
        except Exception:
            after = ''
        return before + word + after

    def create_concordance(self, terms, idxName, Materials, selectWords):
        """
        main concordance method

        Create a list whose first item is the total number of hits, followed
        by one entry per concordance line (at most 1000).  Each entry is a
        list of three contexts, left - node - right, each a list of words,
        plus two lists of location information:
        [
        [left context - word 1, word 2, etc.],
        [node - word 1, word 2, etc],
        [right context - word 1, etc],
        [chapter metadata],
        [book metadata]
        ],
        etc.

        The parameters are passed on unchanged to build_and_run_query.
        The module-level ``booklist`` is read for book titles and counts.
        """
        # TODO change the variable names of the function itself
        # (Materials -> selection, etc.)

        conc_lines = []    # concordance lines to return
        word_window = 10   # words shown on either side of the node
        max_hits = 1000    # current search limit
        result_set, number_of_search_terms = self.build_and_run_query(
            terms, idxName, Materials, selectWords)

        # get total number of hits (not yet used in interface)
        # FIXME What does cheshire return if there are no results?
        total_count = 0
        if len(result_set) > 0:
            for result in result_set:
                total_count = total_count + len(result.proxInfo)

        # search through each record (chapter) and identify location of
        # search term(s)
        if len(result_set) > 0:
            count = 0
            for result in result_set:
                if count > max_hits:
                    # limit already reached: no need to fetch more records
                    break

                # get xml record
                rec = result.fetch_record(self.session)
                # number of words in this record; the same for every match
                ch_words = len(rec.process_xpath(self.session,
                                                 '/div/descendant::w'))

                # Each match of a search term in a document is described by
                # a proxInfo entry, which takes the form of nested lists:
                #
                # [[[0, 169, 1033, 15292]],
                #  [[0, 171, 1045, 15292]], etc. ]
                #
                # It is insufficiently clear what proxInfo is.  We currently
                # assume the following values:
                #
                # * the first item of the deepest list is the id of the root
                #   element from which to start counting to find the word
                #   node: 0 for a chapter view (the chapter is the root
                #   element), but e.g. 151 for a search in quotes text
                # * the second item (169, 171) is the id of the <w> (word)
                #   node
                # * the third item is the exact character at which the
                #   search term starts (spaces and punctuation are stored in
                #   <n> (non-word) nodes)
                # * the fourth item is the total amount of characters in
                #   the document?
                for match in result.proxInfo:
                    count += 1
                    if count > max_hits:
                        break

                    # NOTE(review): for any other idxName, word_id is not
                    # assigned by the branches below.
                    if idxName in ['chapter-idx']:
                        word_id = match[0][1]

                    elif idxName in ['quote-idx', 'non-quote-idx',
                                     'longsus-idx', 'shortsus-idx']:
                        eid, word_id = match[0][0], match[0][1]
                        anchor = '//*[@eid="%d"]/following::w[%d+1]' % (eid, word_id)

                        # locate search term in xml
                        search_term = rec.process_xpath(self.session, anchor)

                        # get xml of sentence and chapter
                        sentence_tree = rec.process_xpath(
                            self.session, anchor + '/ancestor-or-self::s')
                        chapter_tree = rec.process_xpath(
                            self.session, anchor + '/ancestor-or-self::div')
                        sid = sentence_tree[0].get('sid')

                        # count words preceding sentence
                        prec_s_wcount = len(chapter_tree[0].xpath(
                            '/div/p/s[@sid="%s"]/preceding::s/descendant::w' % sid))

                        # count words within sentence, up to the search term
                        count_s = 0
                        for word in chapter_tree[0].xpath(
                                '/div/p/s[@sid="%s"]/descendant::w' % sid):
                            if not word.get('o') == search_term[0].get('o'):
                                count_s += 1
                            else:
                                break

                        # word number within chapter: words in preceding
                        # sentences plus words before the term in its own
                        # sentence
                        word_id = prec_s_wcount + count_s

                    # We operate with word position, not list position
                    # (word 1 = 0 position in list)
                    leftOnset = max(1, word_id - word_window + 1)
                    nodeOnset = word_id + 1
                    nodeOffset = word_id + number_of_search_terms
                    rightOnset = nodeOffset + 1
                    rightOffset = min(rightOnset + word_window,
                                      rightOnset + (ch_words - rightOnset) + 1)

                    left_text = [self._context_token(rec, position)
                                 for position in range(leftOnset, nodeOnset)]
                    node_text = [self._context_token(rec, position)
                                 for position in range(nodeOnset, rightOnset)]
                    right_text = [self._context_token(rec, position)
                                  for position in range(rightOnset, rightOffset)]

                    # location of the hit within its chapter
                    book = rec.process_xpath(self.session, '/div')[0].get('book')
                    chapter = rec.process_xpath(self.session, '/div')[0].get('num')
                    para_chap = rec.process_xpath(
                        self.session,
                        '/div/descendant::w[%d+1]/ancestor-or-self::p' % word_id
                    )[0].get('pid')
                    sent_chap = rec.process_xpath(
                        self.session,
                        '/div/descendant::w[%d+1]/ancestor-or-self::s' % word_id
                    )[0].get('sid')
                    word_chap = word_id

                    # count paragraph, sentence and word in whole book
                    count_para = 0
                    count_sent = 0
                    count_word = 0
                    booktitle = []
                    total_word = []
                    for b in booklist:
                        if b[0][0] == book:

                            booktitle.append(b[0][1])
                            total_word.append(b[1][0][2])

                            for j, c in enumerate(b[2]):
                                # add up the chapters before the current one
                                if j + 1 < int(chapter):
                                    count_para = count_para + int(c[0])
                                    count_sent = count_sent + int(c[1])
                                    count_word = count_word + int(c[2])
                                    j += 1

                                # total word in chapter
                                if j + 1 == int(chapter):
                                    chapWordCount = b[2][j][2]

                    book_title = booktitle[0]   # get book title
                    total_word = total_word[0]
                    para_book = count_para + int(para_chap)
                    sent_book = count_sent + int(sent_chap)
                    word_book = count_word + int(word_chap)

                    conc_line = [
                        left_text, node_text, right_text,
                        [book, book_title, chapter, para_chap, sent_chap,
                         str(word_chap), str(chapWordCount)],
                        [str(para_book), str(sent_book), str(word_book),
                         str(total_word)]
                    ]

                    conc_lines.append(conc_line)

        conc_lines.insert(0, total_count)
        return conc_lines
# Apache Config:
#<Directory /usr/local/apache2/htdocs/srw>
#  SetHandler mod_python
#  PythonDebug On
#  PythonPath "['/home/cheshire/c3/code', '/usr/local/lib/python2.3/lib-dynload']+sys.path"
#  PythonHandler srwApacheHandler
#</Directory>

# NB. SetHandler, not AddHandler.

# Base directory used to locate the Cheshire3 server configuration: taken
# from the C3HOME environment variable, defaulting to /home/cheshire when
# the variable is not set.
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')

# Module-level session and server, created once when this module is imported.
session = Session()
session.environment = "apache"
serv = SimpleServer(
    session,
    os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))

# configs maps a database URL (the SRW protocol map's databaseUrl) to a dict
# holding that database's SRW and SRW-update protocol maps, keyed by
# protocol namespace URI.
configs = {}
serv._cacheDatabases(session)
for db in serv.databases.values():
    # Only databases with an 'SRW' (or 'srw') setting are registered
    if db.get_setting(session, 'SRW') or db.get_setting(session, 'srw'):
        db._cacheProtocolMaps(session)
        # NOTE(review): `map` shadows the builtin, and is None when the
        # database has no SRW protocol map, in which case map.databaseUrl
        # below raises AttributeError - confirm every SRW-enabled database
        # defines one.
        map = db.protocolMaps.get('http://www.loc.gov/zing/srw/', None)
        # map2 is None when the database has no SRW update protocol map
        map2 = db.protocolMaps.get('http://www.loc.gov/zing/srw/update/', None)
        configs[map.databaseUrl] = {
            'http://www.loc.gov/zing/srw/': map,
            'http://www.loc.gov/zing/srw/update/': map2
        }

Esempio n. 26
0
## count words in books, and list titles
## used to create booklist

import os
import re
from lxml import etree
import json

from cheshire3.document import StringDocument
from cheshire3.internal import cheshire3Root
from cheshire3.server import SimpleServer
from cheshire3.baseObjects import Session

# Build the Cheshire3 environment: a Session bound to the 'db_dickens'
# database, the server loaded from the stock serverConfig.xml, and the
# database objects used by the rest of this script.
session = Session()
session.database = 'db_dickens'
serv = SimpleServer(session,
                    os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
db = serv.get_object(session, session.database)
qf = db.get_object(session, 'defaultQueryFactory')
resultSetStore = db.get_object(session, 'resultSetStore')
idxStore = db.get_object(session, 'indexStore')

# Short book identifiers (e.g. 'BH').
# NOTE(review): presumably these are the books whose words are counted and
# the keys of `titles` below - confirm against the rest of the script.
list_books = [
    'BH', 'BR', 'DC', 'DS', 'ED', 'GE', 'HT', 'LD', 'MC', 'NN', 'OCS', 'OMF',
    'OT', 'PP', 'TTC', 'AgnesG', 'Antoni', 'arma', 'cran', 'Deronda',
    'dracula', 'emma', 'frank', 'jane', 'Jude', 'LadyAud', 'mary', 'NorthS',
    'persuasion', 'pride', 'sybil', 'Tess', 'basker', 'Pomp', 'mill', 'dorian',
    'Prof', 'native', 'alli', 'Jekyll', 'wwhite', 'vanity', 'VivianG', 'wh'
]

titles = {
    'BH': 'Bleak House',
Esempio n. 27
0
# -*-coding: utf-8 -*-
# Copyright (C) 2015 by luo xing
# License: GPL

import sys
import os
from cheshire3.baseObjects import Session

from cheshire3.internal import cheshire3Root
from cheshire3.server import SimpleServer

if __name__ == '__main__':
    # Build the Cheshire3 environment from the stock server configuration.
    session = Session()
    servConfig = os.path.join(cheshire3Root, 'configs',
                              'serverConfig.xml')
    serv = SimpleServer(session, servConfig)

    # Objects needed to load TEI data into the 'db_tei' database.
    db = serv.get_object(session, 'db_tei')
    docFac = db.get_object(session, 'defaultDocumentFactory')
    docParser = db.get_object(session, 'TeiParser')
    recStore = db.get_object(session, 'TeiRecordStore')

    # Split the source file into one document per <page> element.
    docFac.load(
        session, "/home/luoxing/data/test.xml", cache=2, tagName='page')
    db.begin_indexing(session)
    recStore.begin_storing(session)
    for doc in docFac:
        try:
            rec = docParser.process_document(session, doc)
        except Exception:
            # Dump the raw text of any document that fails to parse.  Catch
            # Exception rather than everything so that Ctrl-C and
            # SystemExit still stop the run.
            print(doc.get_raw(session))
# Apache Config:
#<Directory /usr/local/apache2/htdocs/srw>
#  SetHandler mod_python
#  PythonDebug On
#  PythonPath "['/home/cheshire/c3/code', '/usr/local/lib/python2.3/lib-dynload']+sys.path"
#  PythonHandler srwApacheHandler
#</Directory>

# NB. SetHandler, not AddHandler.

# Root of the Cheshire3 installation; overridable through the C3HOME
# environment variable.
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')

session = Session()
session.environment = "apache"
serv = SimpleServer(session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))

# Maps each SRW-enabled database's URL to its protocol maps, keyed by the
# protocol namespace (search/retrieve and update).
configs = {}
serv._cacheDatabases(session)
for db in serv.databases.values():
    if db.get_setting(session, 'SRW') or db.get_setting(session, 'srw'):
        db._cacheProtocolMaps(session)
        # NOTE: `map` shadows the builtin; the name is kept because it is a
        # module-level variable that other code may rely on.
        map = db.protocolMaps.get('http://www.loc.gov/zing/srw/', None)
        map2 = db.protocolMaps.get('http://www.loc.gov/zing/srw/update/', None)
        if map is None:
            # The database is flagged for SRW but has no SRW protocol map,
            # so there is no databaseUrl to register it under.  Skip it
            # rather than failing the whole module import.
            continue
        configs[map.databaseUrl] = {'http://www.loc.gov/zing/srw/' : map,
                                    'http://www.loc.gov/zing/srw/update/' : map2}


class reqHandler:
    def send_xml(self, text, req, code=200):
        req.content_type = 'text/xml'
Esempio n. 29
0
import random
# Module-level random number generator instance.
rand = random.Random()

from PyZ3950 import CQLParser
# Register the ASN.1 types used for the SQL and CQL query OIDs.
# NOTE(review): asn1, Z3950_QUERY_SQL, Z3950_QUERY_CQL and SQLQuery are not
# imported in the lines visible here - presumably they come from earlier
# PyZ3950 imports; confirm.
asn1.register_oid(Z3950_QUERY_SQL, SQLQuery)
asn1.register_oid(Z3950_QUERY_CQL, asn1.GeneralString)

from cheshire3.baseObjects import Session, Database, Transformer, Workflow
from cheshire3.server import SimpleServer
from cheshire3 import internal
from cheshire3 import cqlParser

session = Session()
session.environment = "apache"
server = SimpleServer(session, os.path.join(internal.cheshire3Root, 'configs', 'serverConfig.xml'))
# configs: Z39.50 database name -> protocol map
# dbmap:   Cheshire3 database id -> Z39.50 database name
configs = {}
dbmap = {}
server._cacheDatabases(session)
for db in server.databases.values():
    if db.get_setting(session, "z3950"):
        db._cacheProtocolMaps(session)
        # Indentation below is spaces only: the previous tab/space mix is a
        # TabError on Python 3 and under `python -tt`.
        map1 = db.protocolMaps.get('http://www.loc.gov/z3950/', None)
        if map1:
            configs[map1.databaseName] = map1
            dbmap[db.id] = map1.databaseName

# Attach the shared objects to the session for use while handling requests.
session.resultSetStore = server.get_path(session, 'resultSetStore')
session.logger = server.get_path(session, 'z3950Logger')
session.configs = configs
Esempio n. 30
0
        raise NoSetHierarchyError()

    # End Cheshire3OaiServer ------------------------------------------------


def get_databasesAndConfigs(session, serv):
    """Collect the OAI-PMH enabled databases of a Server.

    Caches the server's databases, then for every database with a truthy
    'oai-pmh' setting caches its protocol maps and looks up the OAI-PMH map.
    Databases without such a map are left out.

    Returns a ``(dbs, configs)`` tuple of dictionaries, both keyed by the
    protocol map's ``databaseName``: ``dbs`` holds the database objects and
    ``configs`` the protocol maps.
    """
    oai_namespace = 'http://www.openarchives.org/OAI/2.0/OAI-PMH'
    databases_by_name = {}
    maps_by_name = {}
    serv._cacheDatabases(session)
    for database in serv.databases.values():
        if not database.get_setting(session, 'oai-pmh'):
            continue
        database._cacheProtocolMaps(session)
        protocol_map = database.protocolMaps.get(oai_namespace, None)
        if protocol_map is None:
            # Nothing can be requested from this handler without a map
            continue
        maps_by_name[protocol_map.databaseName] = protocol_map
        databases_by_name[protocol_map.databaseName] = database
    return databases_by_name, maps_by_name


# Cheshire3 architecture
# Module-level objects built once at import time: the session, the server
# loaded from the stock serverConfig.xml, the server's LxmlParser, and the
# OAI-PMH database / protocol-map dictionaries.
session = Session()
serv = SimpleServer(session,
                    os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
lxmlParser = serv.get_object(session, 'LxmlParser')
dbs, configs = get_databasesAndConfigs(session, serv)
# Starts empty.
# NOTE(review): presumably filled elsewhere with per-database OAI server
# objects - confirm against the code that uses it.
c3OaiServers = {}
Esempio n. 31
0
def main(argv=None):
    """Initialize a Cheshire 3 database based on parameters in argv.

    Creates a ``.cheshire3`` directory (with ``stores``, ``indexes`` and
    ``logs`` sub-directories) inside ``args.directory``, writes the default
    ZeeRex, database, selector, index and workflow configuration files into
    it, and registers the database configuration file with the server.

    Args:
        argv: Optional list of command line arguments.  When None,
            ``argparser.parse_args()`` is called without arguments.

    Returns:
        0 on success.

    Raises:
        ValueError: A database with the chosen identifier already exists.
        OSError: A directory could not be created for a reason other than
            already existing.
    """
    global argparser, session, server, db
    import errno

    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        if args.directory.endswith(os.path.sep):
            args.directory = args.directory[:-1]
        # Find local database name to use as basis of database id
        dbid = "db_{0}".format(os.path.basename(args.directory))
        server.log_debug(session,
                         ("database identifier not specified, defaulting to: "
                          "{0}".format(dbid)))
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        # Doesn't exists, so OK to init it
        pass
    else:
        # TODO: check for --force ?
        msg = """database with id '{0}' has already been init'd. \
Please specify a different id using the --database option.""".format(dbid)
        server.log_critical(session, msg)
        raise ValueError(msg)

    # Create a .cheshire3 directory and populate it
    c3_dir = os.path.join(os.path.abspath(args.directory), '.cheshire3')
    for dir_path in [c3_dir,
                     os.path.join(c3_dir, 'stores'),
                     os.path.join(c3_dir, 'indexes'),
                     os.path.join(c3_dir, 'logs')]:
        try:
            os.makedirs(dir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                # Permission denied, read-only file system, etc. - do not
                # misreport these as "already exists"
                raise
            server.log_warning(
                session,
                "directory already exists {0}".format(dir_path)
            )

    # Generate config file(s); maps destination path -> XML node
    xmlFilesToWrite = {}

    # Generate Protocol Map(s) (ZeeRex)
    zrx = create_defaultZeerex(dbid, args)
    zrxPath = os.path.join(c3_dir, 'zeerex_sru.xml')
    args.zeerexPath = zrxPath
    xmlFilesToWrite[zrxPath] = zrx

    # Generate generic database config
    dbConfig = create_defaultConfig(dbid, args)
    dbConfigPath = os.path.join(c3_dir, 'config.xml')
    xmlFilesToWrite[dbConfigPath] = dbConfig

    # Generate config for generic selectors
    selectorConfig = create_defaultConfigSelectors()
    path = os.path.join(c3_dir, 'configSelectors.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = selectorConfig

    # Generate config for generic indexes
    indexConfig = create_defaultConfigIndexes()
    path = os.path.join(c3_dir, 'configIndexes.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = indexConfig

    # Generate config for default Workflows
    workflowConfig = create_defaultConfigWorkflows()
    path = os.path.join(c3_dir, 'configWorkflows.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = workflowConfig

    # Write configs to files.  etree.tostring() with an explicit encoding
    # yields encoded bytes, so the files are opened in binary mode.
    for path, node in xmlFilesToWrite.items():
        with open(path, 'wb') as conffh:
            conffh.write(etree.tostring(node,
                                        pretty_print=True,
                                        encoding="utf-8"
                                        )
                         )

    # Tell the server to register the config file
    server.register_databaseConfigFile(session, dbConfigPath)
    return 0
Esempio n. 32
0
#!/bin/env python

import os, sys

import cheshire3
from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.internal import cheshire3Root
from cheshire3.document import StringDocument

from lxml import etree

# Launch a Cheshire session
# Session and server built from the stock serverConfig.xml.
session = Session()
serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
serv = SimpleServer(session, serverConfig)

# Grab our objects: the 'db_dickens' database, its LxmlParser and the
# 'quoteExcludeSpanSelector' selector.
db = serv.get_object(session, 'db_dickens')
xmlp = db.get_object(session, 'LxmlParser')
excl = db.get_object(session, 'quoteExcludeSpanSelector')

if '--exclude' in sys.argv:
    
    data = """<div>
<p type="speech" id="BH.c6.p114">
            <s id="BH.c6.s340">
                          <qs/>
                                    "My dear Miss Summerson,"
                          <qe/>
                          said Richard in a whisper,
#!/usr/bin/python
import sys
import os
from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.internal import cheshire3Root

# Build environment...
session = Session()  # a Session - used to store
# print() with a single argument produces the same output on Python 2 and
# Python 3, unlike the print statement.
print(cheshire3Root)

serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
session.logger = serv.get_path(session, 'defaultLogger')  # a logger
db = serv.get_object(session, 'db_tdo_index')  # the Database
session.database = db.id

#qf = db.get_object(session, 'defaultQueryFactory')

def testVec():
    """Fetch record 1 and look up its vector in the 'idx-topic' index.

    NOTE(review): `vec` is neither used nor returned in the lines visible
    here, so the function may be truncated - confirm against the full
    source.
    """
    recordStore = db.get_object(session, 'recordStore')
    rec = recordStore.fetch_record(session, 1)
    idx= db.get_object(session, 'idx-topic')
    vec = idx.fetch_vector(session, rec)
Esempio n. 34
0
#!/bin/env python

import os
import sys

import cheshire3
from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.internal import cheshire3Root

# Launch a Cheshire session
session = Session()
serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
serv = SimpleServer(session, serverConfig)


# Grab our objects
db = serv.get_object(session, 'db_test_pgsql')
recStore = db.get_object(session, 'recordStore')
rssStore = db.get_object(session, 'resultSetStore')
qfac = db.get_object(session, 'defaultQueryFactory')

# Prove that we have some records in postgres

rec = recStore.fetch_record(session, 0)
if not rec:
    # print() call and space indentation keep this valid on Python 2 and 3
    print("Could not retrieve records, have you run cheshire3-load?")
    sys.exit()

# Make a query, and store results in resultSetStore
try:
Esempio n. 35
0
class ChapterRepository(object):
    '''
    Responsible for providing access to chapter resources within Cheshire.
    '''
    def __init__(self):
        '''
        Opens a Cheshire3 session on the 'db_dickens' database and looks up
        its default query factory.
        '''
        self.session = Session()
        self.session.database = 'db_dickens'
        self.serv = SimpleServer(
            self.session,
            os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
        self.db = self.serv.get_object(self.session, self.session.database)
        self.qf = self.db.get_object(self.session, 'defaultQueryFactory')

    def _fetch_chapter_record(self, chapter_number, book):
        '''
        Searches the book index and returns the record of one chapter.

        chapter_number -- integer - 1-based position in the result set
        book -- string - the book id/accronym e.g. BH
        '''
        # NOTE(review): `book` is interpolated into the CQL query unescaped;
        # confirm callers never pass text containing double quotes.
        query = self.qf.get_query(self.session, 'c3.book-idx = "%s"' % book)
        result_set = self.db.search(self.session, query)
        chapter_ptr = result_set[chapter_number - 1]
        return chapter_ptr.fetch_record(self.session)

    def get_book_title(self, book):
        '''
        Gets the title of a book from the json file booklist.json

        book -- string - the book id/accronym e.g. BH

        Returns the title, or None when no entry of the book list matches
        (previously this case failed with an UnboundLocalError).
        '''
        book_title = None
        for b in booklist:
            # Keep scanning: as before, the last matching entry wins.
            if b[0][0] == book:
                book_title = b[0][1]

        return book_title

    def get_chapter(self, chapter_number, book):
        '''
        Returns transformed XML for given chapter & book

        chapter_number -- integer
        book -- string - the book id/accronym e.g. BH

        Returns a (formatted_chapter, book_title) tuple.
        '''

        chapter = self._fetch_chapter_record(chapter_number, book)
        transformer = self.db.get_object(self.session, 'chapterView-Txr')
        formatted_chapter = transformer.process_record(
            self.session, chapter).get_raw(self.session)

        book_title = self.get_book_title(book)

        return formatted_chapter, book_title

    def get_raw_chapter(self, chapter_number, book):
        '''
        Returns raw chapter XML for given chapter & book

        chapter_number -- integer
        book -- string - the book id/accronym e.g. BH
        '''

        chapter = self._fetch_chapter_record(chapter_number, book)
        return chapter.get_dom(self.session)

    def get_chapter_with_highlighted_search_term(self, chapter_number, book,
                                                 wid, search_term):
        '''
        Returns transformed XML for given chapter & book with the search
        highlighted.

        We create the transformer directly so that we can pass extra parameters
        to it at runtime. In this case the search term.

        chapter_number -- integer
        book -- string - the book id/accronym e.g. BH
        wid -- integer - word index
        search_term -- string - term to highlight

        Returns a (serialized_chapter, book_title) tuple.
        '''

        raw_chapter = self.get_raw_chapter(chapter_number, book)
        # load our chapter xslt directly as a transformer
        path_to_xsl = CLIC_DIR + "/dbs/dickens/xsl/chapterView.xsl"
        xslt_doc = etree.parse(path_to_xsl)
        transformer = etree.XSLT(xslt_doc)

        terms = search_term.split(' ')

        # pass the word index and the number of search terms into our
        # transformer; wid is wrapped in quotes so that XSLT receives it as
        # a string literal
        transformed_chapter = transformer(raw_chapter,
                                          wid="'%s'" % wid,
                                          numberOfSearchTerms="%s" %
                                          len(terms))
        book_title = self.get_book_title(book)

        # return transformed html
        return etree.tostring(transformed_chapter), book_title
Esempio n. 36
0
def main(argv=None):
    """Load data into a Cheshire3 database based on parameters in argv.

    For every entry in ``args.data`` the data is loaded into the database's
    default document factory and then passed through the database's
    'buildIndexWorkflow'.

    Args:
        argv: Optional list of command line arguments.  When None,
            ``argparser.parse_args()`` is called without arguments.

    Returns:
        1 when no database identifier was given and none could be identified
        from the current working directory; 2 when the identified database
        does not exist; otherwise None.

    Raises:
        MissingDependencyException: Loading the data needs a dependency that
            is not available; re-raised on behalf of the script.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # str(e) instead of e.message: the latter is empty for
            # (errno, strerror) style errors on Python 2 and does not exist
            # on Python 3.
            server.log_critical(session, str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid)
        )
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2

    # Allow for multiple data arguments
    docFac = db.get_object(session, 'defaultDocumentFactory')
    for dataArg in args.data:
        try:
            docFac.load(session,
                        dataArg,
                        args.cache,
                        args.format,
                        args.tagname,
                        args.codec
                        )
        except MissingDependencyException as e:
            server.log_critical(session, e.reason)
            missingDependencies = e.dependencies
            # Re-raise naming this script as the object that needs the
            # missing dependencies
            raise MissingDependencyException('cheshire3-load script',
                                             missingDependencies
                                             )
        wf = db.get_object(session, 'buildIndexWorkflow')
        wf.process(session, docFac)
Esempio n. 37
0
class DynamicTestCase(unittest.TestCase):
    """Shared base for the cheshire3.dynamic test cases."""

    def _get_objectTypes(self):
        """Return a new list of the fully qualified object type names."""
        object_types = []
        # Databases, document handling and extraction
        object_types += [
            'cheshire3.database.SimpleDatabase',
            'cheshire3.database.OptimisingDatabase',
            'cheshire3.documentFactory.SimpleDocumentFactory',
            'cheshire3.documentFactory.ComponentDocumentFactory',
            'cheshire3.documentStore.SimpleDocumentStore',
            'cheshire3.extractor.SimpleExtractor',
        ]
        # Indexes and their store
        object_types += [
            'cheshire3.index.SimpleIndex',
            'cheshire3.index.ProximityIndex',
            'cheshire3.index.RangeIndex',
            'cheshire3.index.BitmapIndex',
            'cheshire3.indexStore.BdbIndexStore',
        ]
        # Loggers
        object_types += [
            'cheshire3.logger.SimpleLogger',
            'cheshire3.logger.FunctionLogger',
            'cheshire3.logger.LoggingLogger',
            'cheshire3.logger.DateTimeFileLogger',
            'cheshire3.logger.MultipleLogger',
        ]
        # Normalizers and the object store
        object_types += [
            'cheshire3.normalizer.SimpleNormalizer',
            'cheshire3.normalizer.CaseNormalizer',
            'cheshire3.normalizer.SpaceNormalizer',
            'cheshire3.normalizer.RangeNormalizer',
            'cheshire3.objectStore.BdbObjectStore',
        ]
        # Parsers and pre-parsers
        object_types += [
            'cheshire3.parser.MinidomParser',
            'cheshire3.parser.SaxParser',
            'cheshire3.parser.LxmlParser',
            'cheshire3.parser.LxmlHtmlParser',
            'cheshire3.parser.MarcParser',
            'cheshire3.preParser.UnicodeDecodePreParser',
            'cheshire3.preParser.HtmlTidyPreParser',
            'cheshire3.preParser.SgmlPreParser',
            'cheshire3.preParser.CharacterEntityPreParser',
        ]
        # Query factory and the query / record / result set stores
        object_types += [
            'cheshire3.queryFactory.SimpleQueryFactory',
            'cheshire3.queryStore.SimpleQueryStore',
            'cheshire3.recordStore.BdbRecordStore',
            'cheshire3.resultSetStore.BdbResultSetStore',
        ]
        # Selectors, tokenizers and token mergers
        object_types += [
            'cheshire3.selector.XPathSelector',
            'cheshire3.selector.SpanXPathSelector',
            'cheshire3.selector.MetadataSelector',
            'cheshire3.tokenizer.RegexpSubTokenizer',
            'cheshire3.tokenizer.SentenceTokenizer',
            'cheshire3.tokenizer.DateTokenizer',
            'cheshire3.tokenizer.PythonTokenizer',
            'cheshire3.tokenMerger.SimpleTokenMerger',
            'cheshire3.tokenMerger.ProximityTokenMerger',
            'cheshire3.tokenMerger.RangeTokenMerger',
            'cheshire3.tokenMerger.NGramTokenMerger',
        ]
        # Transformers and workflows
        object_types += [
            'cheshire3.transformer.XmlTransformer',
            'cheshire3.transformer.LxmlXsltTransformer',
            'cheshire3.transformer.MarcTransformer',
            'cheshire3.workflow.SimpleWorkflow',
            'cheshire3.workflow.CachingWorkflow',
        ]
        return object_types

    def _get_configFromObjectType(self, objectType):
        """Build a minimal <subConfig> element for the given object type.

        The object type serves both as the id of the config and as its
        <objectType> value.
        """
        template = '''\
        <subConfig id="{0}">
          <objectType>{0}</objectType>
        </subConfig>'''
        return etree.XML(template.format(objectType))

    def setUp(self):
        """Create the Session and Server used by each test."""
        session = baseObjects.Session()
        self.session = session
        config_path = os.path.join(
            cheshire3Root, 'configs', 'serverConfig.xml')
        self.server = SimpleServer(session, config_path)
        # Disable stdout logging by raising the default logger's threshold
        default_logger = self.server.get_path(session, 'defaultLogger')
        default_logger.minLevel = 60
Esempio n. 38
0
# Create the parser for the "remove" command
# The "remove" sub-command takes zero or more usernames (nargs='*') and is
# dispatched to remove_user through the `func` default.
parser_remove = subparsers.add_parser('remove',
                                      help='Remove an existing user')

parser_remove.add_argument('username',
                           type=str,
                           nargs='*',
                           help='Username of the user(s) to remove')
parser_remove.set_defaults(func=remove_user)


# Build environment...
# Session and server from the stock serverConfig.xml, then the 'db_ead'
# database and the stores this script works with.
session = Session()
serv = SimpleServer(
    session,
    os.path.join(cheshire3Root,
                 'configs',
                 'serverConfig.xml'
                 )
)
session.database = 'db_ead'
db = serv.get_object(session, 'db_ead')
xmlp = db.get_object(session, 'LxmlParser')
authStore = db.get_object(session, 'hubAuthStore')          # Editors
superAuthStore = db.get_object(session, 'adminAuthStore')   # Hub Staff
instStore = db.get_object(session, 'institutionStore')      # Institutions


# Run main() when executed as a script and use its return value as the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())
from xml.sax.saxutils import escape
from lxml import etree
from lxml.builder import ElementMaker

from cheshire3.server import SimpleServer
from cheshire3.baseObjects import Session
from cheshire3.utils import flattenTexts
from cheshire3 import cqlParser
from cheshire3 import internal
from cheshire3 import exceptions as c3errors

# Root of the Cheshire3 installation; overridable through the C3HOME
# environment variable.
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')

session = Session()
session.environment = "apache"
serv = SimpleServer(session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))

configs = {}

# determine the root URL of this handler

# Take the DocumentRoot value from the Apache config tree (surrounding
# quotes stripped) and remove it from the handler's root to get the URL.
# NOTE(review): docRoot is only bound when a "DocumentRoot" entry is found;
# without one the replace() call below raises NameError - confirm that the
# directive is always present.
for configitem in apache.config_tree():
    if configitem[0] == "DocumentRoot":
        docRoot = configitem[1].strip("\"'")
handlerUrl = apache.get_handler_root().replace(docRoot, "")


if len(serv.databaseConfigs) < 25:
    # relatively few dbs - we can safely cache them
    serv._cacheDatabases(session)
    for db in serv.databases.itervalues():
Esempio n. 40
0
class DynamicTestCase(unittest.TestCase):
    """Shared base for the cheshire3.dynamic test cases."""

    def _get_objectTypes(self):
        """Return a new list of the fully qualified object type names."""
        storage_and_indexing = (
            'cheshire3.database.SimpleDatabase',
            'cheshire3.database.OptimisingDatabase',
            'cheshire3.documentFactory.SimpleDocumentFactory',
            'cheshire3.documentFactory.ComponentDocumentFactory',
            'cheshire3.documentStore.SimpleDocumentStore',
            'cheshire3.extractor.SimpleExtractor',
            'cheshire3.index.SimpleIndex',
            'cheshire3.index.ProximityIndex',
            'cheshire3.index.RangeIndex',
            'cheshire3.index.BitmapIndex',
            'cheshire3.indexStore.BdbIndexStore',
        )
        logging_and_normalizing = (
            'cheshire3.logger.SimpleLogger',
            'cheshire3.logger.FunctionLogger',
            'cheshire3.logger.LoggingLogger',
            'cheshire3.logger.DateTimeFileLogger',
            'cheshire3.logger.MultipleLogger',
            'cheshire3.normalizer.SimpleNormalizer',
            'cheshire3.normalizer.CaseNormalizer',
            'cheshire3.normalizer.SpaceNormalizer',
            'cheshire3.normalizer.RangeNormalizer',
            'cheshire3.objectStore.BdbObjectStore',
        )
        parsing = (
            'cheshire3.parser.MinidomParser',
            'cheshire3.parser.SaxParser',
            'cheshire3.parser.LxmlParser',
            'cheshire3.parser.LxmlHtmlParser',
            'cheshire3.parser.MarcParser',
            'cheshire3.preParser.UnicodeDecodePreParser',
            'cheshire3.preParser.HtmlTidyPreParser',
            'cheshire3.preParser.SgmlPreParser',
            'cheshire3.preParser.CharacterEntityPreParser',
        )
        querying = (
            'cheshire3.queryFactory.SimpleQueryFactory',
            'cheshire3.queryStore.SimpleQueryStore',
            'cheshire3.recordStore.BdbRecordStore',
            'cheshire3.resultSetStore.BdbResultSetStore',
        )
        text_processing = (
            'cheshire3.selector.XPathSelector',
            'cheshire3.selector.SpanXPathSelector',
            'cheshire3.selector.MetadataSelector',
            'cheshire3.tokenizer.RegexpSubTokenizer',
            'cheshire3.tokenizer.SentenceTokenizer',
            'cheshire3.tokenizer.DateTokenizer',
            'cheshire3.tokenizer.PythonTokenizer',
            'cheshire3.tokenMerger.SimpleTokenMerger',
            'cheshire3.tokenMerger.ProximityTokenMerger',
            'cheshire3.tokenMerger.RangeTokenMerger',
            'cheshire3.tokenMerger.NGramTokenMerger',
        )
        output_and_workflow = (
            'cheshire3.transformer.XmlTransformer',
            'cheshire3.transformer.LxmlXsltTransformer',
            'cheshire3.transformer.MarcTransformer',
            'cheshire3.workflow.SimpleWorkflow',
            'cheshire3.workflow.CachingWorkflow',
        )
        # Concatenation order fixes the order of the returned list
        return list(storage_and_indexing + logging_and_normalizing +
                    parsing + querying + text_processing +
                    output_and_workflow)

    def _get_configFromObjectType(self, objectType):
        """Build a minimal <subConfig> element for the given object type.

        The object type serves both as the id of the config and as its
        <objectType> value.
        """
        xml_text = '''\
        <subConfig id="{0}">
          <objectType>{0}</objectType>
        </subConfig>'''.format(objectType)
        return etree.XML(xml_text)

    def setUp(self):
        """Create the Session and Server used by each test."""
        self.session = baseObjects.Session()
        self.server = SimpleServer(
            self.session,
            os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
        # Disable stdout logging by raising the default logger's threshold
        self.server.get_path(self.session, 'defaultLogger').minLevel = 60
Esempio n. 41
0
from PyZ3950 import CQLParser
# Register the ASN.1 types used for the SQL and CQL query OIDs.
# NOTE(review): asn1, Z3950_QUERY_SQL, Z3950_QUERY_CQL and SQLQuery are not
# imported in the lines visible here - presumably they come from earlier
# PyZ3950 imports; confirm.
asn1.register_oid(Z3950_QUERY_SQL, SQLQuery)
asn1.register_oid(Z3950_QUERY_CQL, asn1.GeneralString)

from cheshire3.baseObjects import Session, Database, Transformer, Workflow
from cheshire3.server import SimpleServer
from cheshire3 import internal
from cheshire3 import cqlParser

# Cheshire3 installation root, taken from C3HOME when it is set
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')

session = Session()
session.environment = "apache"
server = SimpleServer(
    session,
    os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml')
)
# configs: Z39.50 database name -> protocol map
# dbmap:   Cheshire3 database id -> Z39.50 database name
configs = {}
dbmap = {}
server._cacheDatabases(session)
for db in server.databases.values():
    if not db.get_setting(session, "z3950"):
        # Database is not exposed over Z39.50
        continue
    db._cacheProtocolMaps(session)
    map1 = db.protocolMaps.get('http://www.loc.gov/z3950/', None)
    if not map1:
        continue
    configs[map1.databaseName] = map1
    dbmap[db.id] = map1.databaseName

# Attach the shared objects to the session for use while handling requests
session.resultSetStore = server.get_path(session, 'resultSetStore')
session.logger = server.get_path(session, 'z3950Logger')
session.configs = configs
Esempio n. 42
0
cheshirePath = os.environ.get('C3HOME', '/home/cheshire/')
sys.path.insert(1, os.path.join(cheshirePath, 'cheshire3', 'code'))

from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.document import StringDocument
from cheshire3 import exceptions as c3errors

from cheshire3.web.www_utils import read_file

# import customisable variables
#from localConfig import *

# Build environment...
# Session and server loaded from the serverConfig.xml under cheshirePath,
# working against the 'db_ead' database.
session = Session()
serv = SimpleServer(session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))
session.database = 'db_ead'

# Objects of the main EAD database
db = serv.get_object(session, 'db_ead')
lgr = db.get_path(session, 'defaultLogger')
recordStore = db.get_object(session, 'recordStore')
authStore = db.get_object(session, 'eadAuthStore')
compStore = db.get_object(session, 'componentStore')
clusDocFac = db.get_object(session, 'clusterDocumentFactory')

# Separate cluster database and its record store
clusDb = serv.get_object(session, 'db_ead_cluster')
clusRecordStore = clusDb.get_object(session, 'eadClusterStore')

xmlp = db.get_object(session, 'LxmlParser')

# Path of the 'indexing.lock' file inside the database's default path.
# NOTE(review): presumably used to signal that indexing is in progress -
# confirm against the code that creates and removes it.
lockfilepath = db.get_path(session, 'defaultPath') + '/indexing.lock'
Esempio n. 43
0

def get_databasesAndConfigs(session, serv):
    """Collect the OAI-PMH enabled databases of a Server.

    Caches the server's databases, then for every database with a truthy
    'oai-pmh' setting caches its protocol maps and looks up the OAI-PMH map.
    Databases without such a map are left out.

    Returns a ``(dbs, configs)`` tuple of dictionaries, both keyed by the
    protocol map's ``databaseName``: ``dbs`` holds the database objects and
    ``configs`` the protocol maps.
    """
    oai_namespace = 'http://www.openarchives.org/OAI/2.0/OAI-PMH'
    databases_by_name = {}
    maps_by_name = {}
    serv._cacheDatabases(session)
    for database in serv.databases.values():
        if not database.get_setting(session, 'oai-pmh'):
            continue
        database._cacheProtocolMaps(session)
        protocol_map = database.protocolMaps.get(oai_namespace, None)
        if protocol_map is None:
            # Nothing can be requested from this handler without a map
            continue
        maps_by_name[protocol_map.databaseName] = protocol_map
        databases_by_name[protocol_map.databaseName] = database
    return databases_by_name, maps_by_name


# Cheshire3 architecture
# Module-level objects built once at import time: the session, the server
# loaded from the stock serverConfig.xml, the server's LxmlParser, and the
# OAI-PMH database / protocol-map dictionaries.
session = Session()
serv = SimpleServer(session,
                    os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
                    )
lxmlParser = serv.get_object(session, 'LxmlParser')
dbs, configs = get_databasesAndConfigs(session, serv)
# Starts empty.
# NOTE(review): presumably filled elsewhere with per-database OAI server
# objects - confirm against the code that uses it.
c3OaiServers = {}
Esempio n. 44
0
class Cheshire3Engine(BaseEngine):
    """Search-engine adapter that indexes and queries through Cheshire3.

    A single Session and SimpleServer are created when the class body is
    executed and are shared by every instance.  Reader-specific metadata for
    each indexed document is kept in a JSON file at
    ``<database_path>/cheshire3-metadata/<database_name>``.

    NOTE(review): ``database_name`` and ``database_path`` are read from
    ``self`` but never assigned in this class - presumably they are provided
    by BaseEngine; confirm.
    """
    #schema = Schema(title=TEXT(stored=True), path=TEXT(stored=True), href=ID(stored=True), cfiBase=TEXT(stored=True), spinePos=TEXT(stored=True), content=TEXT)
    #database = 'db_tdo_simple_sru'
    cheshire_metadata_dir = '/cheshire3-metadata'
    session = Session()
    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    server = SimpleServer(session, serverConfig)
    # Cheshire3 objects below are resolved lazily (see open() and query())
    queryFactory = None
    db = None
    titleSel = None
    anywhereSel = None
    proxExtractor = None

    def __initializeTitleSelector(self):
        """Fetch the title selector, trying both known identifiers.

        Failure is logged, leaving ``titleSel`` as None.
        """
        try:
            self.titleSel = self.db.get_object(self.session,
                                               'titleXPathSelector')
        except ObjectDoesNotExistException:
            try:
                self.titleSel = self.db.get_object(self.session,
                                                   'titleSelector')
            except ObjectDoesNotExistException as e:
                logging.error(e)

    def __initializeAnywhereSelector(self):
        """Fetch the 'anywhereXPathSelector' object; log if it is missing."""
        try:
            self.anywhereSel = self.db.get_object(self.session,
                                                  'anywhereXPathSelector')
        except ObjectDoesNotExistException as e:
            logging.error(e)

    def __initializeProximityExtractor(self):
        """Fetch the 'ProxExtractor' object; log if it is missing."""
        try:
            self.proxExtractor = self.db.get_object(self.session,
                                                    'ProxExtractor')
        except ObjectDoesNotExistException as e:
            logging.error(e)

    def __highlight(self, text, term, n):
        """Return one concordance snippet per occurrence of term in text.

        Each snippet consists of up to ``n`` characters of context before the
        match, the term wrapped in ``<b class="match term0">``, and up to
        ``n`` characters of context after it.  The term is matched literally
        (regex metacharacters are escaped).  An empty term yields no snippets.
        """
        if not term:
            return []
        term_len = len(term)
        text_len = len(text)
        term_concordance = []
        for match in re.finditer(re.escape(term), text):
            idx = match.start()
            # Clamp so a match near the start does not produce a negative
            # (wrapping) slice index
            start = max(idx - n, 0)
            after = idx + term_len
            end = min(after + n, text_len)
            # Trailing context starts after the term so that the term is not
            # repeated following its highlighted copy
            term_concordance.append(text[start:idx] +
                                    '<b class="match term0">' + term + '</b>' +
                                    text[after:end])
        return term_concordance

    def open(self):
        """Open the database named ``self.database_name``.

        Any exception raised while fetching the database is logged rather
        than propagated; ``self.db`` is then left unchanged.
        """
        try:
            self.db = self.server.get_object(self.session, self.database_name)
            self.session.database = self.database_name
        except Exception as e:
            logging.error(e)
            logging.error("openning database {} failed".format(
                self.database_name))

    def create(self):
        """Create the database directory, metadata file and Cheshire3 db.

        The metadata file is (re)initialized to an empty JSON object, then
        the ``cheshire3-init`` command is run for the database.
        """
        if not os.path.exists(self.database_path):
            os.makedirs(self.database_path)

        # create cheshire metadata directory if needed, then initialize with
        # an empty mapping
        metadata_path = self.database_path + self.cheshire_metadata_dir
        if not os.path.exists(metadata_path):
            os.makedirs(metadata_path)
        with open(metadata_path + '/' + self.database_name, 'w') as f:
            json.dump({}, f)

        logging.info("openning database {} to create".format(
            self.database_path))
        try:
            status = os.system("cheshire3-init " + self.database_path +
                               " --database=" + self.database_name)
        except Exception as e:
            logging.error(e)
        else:
            # os.system reports failure through its return value, not by
            # raising, so surface a non-zero status in the log
            if status != 0:
                logging.error("cheshire3-init exited with status {}".format(
                    status))

    def add(self, path='', href='', title='', cfiBase='', spinePos=''):
        """Index the document at ``path`` and record its reader metadata.

        The document is loaded with the ``cheshire3-load`` command, then an
        entry keyed by ``href`` is merged into the JSON metadata file.
        """
        # first, index the document in cheshire3 using unix commands
        status = os.system("cheshire3-load --database=" +
                           self.database_name + ' ' + path)
        if status != 0:
            logging.error("cheshire3-load exited with status {}".format(
                status))

        doc_md = {
            href: {
                'path': path,
                'href': href,
                'title': title,
                'cfiBase': cfiBase,
                'spinePos': spinePos
            }
        }
        metadata_path = self.database_path + self.cheshire_metadata_dir
        with open(metadata_path + '/' + self.database_name) as f_in:
            md_dict = json.load(f_in)

        md_dict.update(doc_md)

        with open(metadata_path + '/' + self.database_name, 'w') as f_out:
            json.dump(md_dict, f_out)

    def finished(self):
        """No-op: add() indexes each document fully, so there is nothing to
        clean up or commit afterwards.
        """
        pass

    def query(self, q, limit=None):
        """Search the open database for ``q`` and return result metadata.

        ``q`` is handed to the database's default query factory.  Each
        result is the stored metadata dictionary of a matching document with
        an added ``'highlight'`` entry holding the joined concordance
        snippets.  When ``limit`` is not None, at most ``limit`` results are
        returned.
        """
        if self.queryFactory is None:
            self.queryFactory = self.db.get_object(self.session,
                                                   'defaultQueryFactory')

        if self.titleSel is None:
            self.__initializeTitleSelector()

        if self.anywhereSel is None:
            self.__initializeAnywhereSelector()

        if self.proxExtractor is None:
            self.__initializeProximityExtractor()

        c3Query = self.queryFactory.get_query(self.session, q)
        rs = self.db.search(self.session, c3Query)

        # open up the json file with reader specific attributes
        metadata_path = self.database_path + self.cheshire_metadata_dir
        with open(metadata_path + '/' + self.database_name) as f:
            db_md_dict = json.load(f)

        # loop through recordset, create new results list with dictionary of
        # found values
        results = []
        for rsi in rs:
            if limit is not None and len(results) >= limit:
                break
            rec = rsi.fetch_record(self.session)
            # check the record titles
            titleData = self.titleSel.process_record(self.session, rec)
            # checking out the proximity attributes
            elems = self.anywhereSel.process_record(self.session, rec)
            extracted = self.proxExtractor.process_xpathResult(
                self.session, elems)
            # list(...) keeps this working where values() is a view
            doc_dict = list(extracted.values())[0]
            concordance = self.__highlight(doc_dict['text'], q, 20)
            # extracts document name key
            fn_key = os.path.basename(titleData[3][0])
            # append highlighted concordance to the dictionary
            db_md_dict[fn_key][u'highlight'] = "  ".join(concordance)
            results.append(db_md_dict[fn_key])
        return results
Esempio n. 45
0
                        configs[pmap.databaseUrl][1].update(
                            {'http://www.loc.gov/zing/srw/update/': pmap2.id})
            # Remove cached db object
            try:
                del serv.objects[dbid]
            except KeyError:
                pass

        del dbid, db, pmap, pmap2
    return configs


# Cheshire3 architecture
# Module-level session (flagged as running under "apache") and the server
# loaded from the serverConfig.xml under cheshire3Root.
session = Session()
session.environment = "apache"
serv = SimpleServer(session,
                    os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))

# Short prefix -> namespace URI for the SRU protocol and its diagnostics.
protocolMap = {
    'sru': 'http://www.loc.gov/zing/srw/',
    'diag': 'http://www.loc.gov/zing/srw/diagnostic/'
}

recordMap = {
    'dc': 'info:srw/schema/1/dc-v1.1',
    'diag': 'info:srw/schema/1/diagnostic-v1.1',
    'mods': 'info:srw/schema/1/mods-v3.0',
    'onix': 'info:srw/schema/1/onix-v2.0',
    'marcxml': 'info:srw/schema/1/marcxml-v1.1',
    'ead': 'info:srw/schema/1/ead-2002',
    'ccg': 'http://srw.o-r-g.org/schemas/ccg/1.0/',
    'marcsgml': 'http://srw.o-r-g.org/schemas/marcsgml/12.0/',
Esempio n. 46
0
def main(argv=None):
    """Load data into a Cheshire3 database based on parameters in argv.

    Each data argument that is not already an ``irods://`` URL is turned
    into one using the user, host, port and working collection of the
    current iRODS environment.  Every argument is then loaded through the
    database's 'defaultDocumentFactory' and processed by its
    'buildIndexWorkflow'.

    Returns 1 when no database could be identified from the working
    directory and 2 when the requested database does not exist; otherwise
    falls through and returns None.

    Raises MissingDependencyException when PyRods is unavailable or when the
    document factory reports missing dependencies.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    if irods is None:
        raise MissingDependencyException('icheshire3-load script',
                                         'irods (PyRods)'
                                         )
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        try:
            dbid = identify_database(session, os.getcwd())
        except EnvironmentError as e:
            # e.message is not always populated (or present); fall back to
            # the string form of the exception
            server.log_critical(session,
                                getattr(e, 'message', None) or str(e))
            return 1
        server.log_debug(
            session,
            "database identifier not specified, discovered: {0}".format(dbid))
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        msg = """Cheshire3 database {0} does not exist.
Please provide a different database identifier using the --database option.
""".format(dbid)
        server.log_critical(session, msg)
        return 2
    else:
        # Allow for multiple data arguments
        docFac = db.get_object(session, 'defaultDocumentFactory')
        for dataArg in args.data:
            if not dataArg.startswith('irods://'):
                # Examine current environment.  Each value is read through
                # its getter when available, else through the plain
                # attribute (the two APIs tried by the original code).
                status, myEnv = irods.getRodsEnv()
                try:
                    host = myEnv.getRodsHost()
                except AttributeError:
                    host = myEnv.rodsHost
                # Port
                try:
                    port = myEnv.getRodsPort()
                except AttributeError:
                    port = myEnv.rodsPort
                # User
                try:
                    username = myEnv.getRodsUserName()
                except AttributeError:
                    username = myEnv.rodsUserName
                netloc = '{0}@{1}:{2}'.format(username, host, port)
                # Current working collection
                try:
                    cwd = myEnv.getRodsCwd()
                except AttributeError:
                    cwd = myEnv.rodsCwd
                path = '/'.join([cwd, dataArg])
                parsed = SplitResult('irods', netloc, path, None, None)
                dataArg = urlunsplit(parsed)
            server.log_debug(session, dataArg)
            # Any format not starting with 'i' is replaced by 'irods'
            if args.format is None or not args.format.startswith('i'):
                fmt = 'irods'
            else:
                fmt = args.format
            server.log_debug(session, fmt)
            try:
                docFac.load(session, dataArg,
                            args.cache, fmt, args.tagname, args.codec)
            except MissingDependencyException as e:
                server.log_critical(session, e.reason)
                missingDependencies = e.dependencies
                raise MissingDependencyException('cheshire3-load script',
                                                 missingDependencies)
            wf = db.get_object(session, 'buildIndexWorkflow')
            wf.process(session, docFac)
Esempio n. 47
0
def groupDist(dist):
    """Print a grouped summary of an occurrence distribution.

    ``dist`` maps an occurrence count (integer key) to the number of
    articles with that many occurrences.  One tab-separated line is printed
    for each of the counts 1, 2 and 3 and one for the combined "4+" bucket,
    each giving the number of articles and its percentage of all articles,
    followed by the total numbers of occurrences and articles.

    Missing buckets are reported as 0 and an empty distribution yields
    percentages of 0.00.
    """
    hits = sum(dist.values())
    occs = sum(int(n) * int(dist[n]) for n in dist)

    def percent(count):
        # Guard against an empty distribution (no articles at all)
        return float(count) / float(hits) * 100.0 if hits else 0.0

    for i in (1, 2, 3):
        count = dist.get(i, 0)
        print("%s\t%s\t%0.2f" % (i, count, percent(count)))

    # Every key from 4 upwards, *including* the largest one
    fourPlus = sum(dist[n] for n in dist if n >= 4)
    print("4+\t%s\t%0.2f" % (fourPlus, percent(fourPlus)))

    print("\n%i occurrences in %i articles" % (occs, hits))

# Module-level Cheshire3 objects for the 'db_news' database.  The server
# config path is relative, so it is resolved against the process's current
# working directory.
session = Session()
serv = SimpleServer(session, "../../configs/serverConfig.xml")
db = serv.get_object(session, 'db_news')
session.database = 'db_news'

# Index and record stores looked up by identifier on the database.
idxStore = db.get_object(session, 'indexStore')
recStore = db.get_object(session, 'recordStore')



Esempio n. 48
0
def main(argv=None):
    """Initialize a Cheshire 3 database based on parameters in argv.

    Creates a ``.cheshire3`` directory (with ``stores``, ``indexes`` and
    ``logs`` sub-directories) inside ``args.directory``, writes the generated
    ZeeRex, database, selector, index and workflow configuration files into
    it, and registers the database config file with the server.

    Returns 0 on success.

    Raises ValueError if a database with the chosen identifier already
    exists, and OSError if one of the directories can neither be created nor
    found.
    """
    global argparser, session, server, db
    if argv is None:
        args = argparser.parse_args()
    else:
        args = argparser.parse_args(argv)
    session = Session()
    server = SimpleServer(session, args.serverconfig)
    if args.database is None:
        if args.directory.endswith(os.path.sep):
            args.directory = args.directory[:-1]
        # Find local database name to use as basis of database id
        dbid = "db_{0}".format(os.path.basename(args.directory))
        server.log_debug(session,
                         ("database identifier not specified, defaulting to: "
                          "{0}".format(dbid)))
    else:
        dbid = args.database

    try:
        db = server.get_object(session, dbid)
    except ObjectDoesNotExistException:
        # Doesn't exists, so OK to init it
        pass
    else:
        # TODO: check for --force ?
        msg = """database with id '{0}' has already been init'd. \
Please specify a different id using the --database option.""".format(dbid)
        server.log_critical(session, msg)
        raise ValueError(msg)

    # Create a .cheshire3 directory and populate it
    c3_dir = os.path.join(os.path.abspath(args.directory), '.cheshire3')
    for dir_path in [
            c3_dir,
            os.path.join(c3_dir, 'stores'),
            os.path.join(c3_dir, 'indexes'),
            os.path.join(c3_dir, 'logs')
    ]:
        try:
            os.makedirs(dir_path)
        except OSError:
            if not os.path.isdir(dir_path):
                # A genuine failure (e.g. permissions), not a pre-existing
                # directory - do not misreport it
                raise
            server.log_warning(session,
                               "directory already exists {0}".format(dir_path))

    # Generate config file(s)
    xmlFilesToWrite = {}

    # Generate Protocol Map(s) (ZeeRex)
    zrx = create_defaultZeerex(dbid, args)
    zrxPath = os.path.join(c3_dir, 'zeerex_sru.xml')
    args.zeerexPath = zrxPath
    xmlFilesToWrite[zrxPath] = zrx

    # Generate generic database config
    dbConfig = create_defaultConfig(dbid, args)
    dbConfigPath = os.path.join(c3_dir, 'config.xml')

    # Generate config for generic selectors
    selectorConfig = create_defaultConfigSelectors()
    path = os.path.join(c3_dir, 'configSelectors.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = selectorConfig

    # Generate config for generic indexes
    indexConfig = create_defaultConfigIndexes()
    path = os.path.join(c3_dir, 'configIndexes.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = indexConfig

    # Generate config for default Workflows
    workflowConfig = create_defaultConfigWorkflows()
    path = os.path.join(c3_dir, 'configWorkflows.xml')
    dbConfig = include_configByPath(dbConfig, path)
    xmlFilesToWrite[path] = workflowConfig

    # Register the database config only once every include has been added:
    # dbConfig is re-bound by each include_configByPath call, so this is the
    # node that carries all of the includes whether or not that helper
    # modifies its argument in place.
    xmlFilesToWrite[dbConfigPath] = dbConfig

    # Write configs to files.  Binary mode because the documents are
    # serialized to UTF-8 encoded bytes.
    for path, node in xmlFilesToWrite.items():
        with open(path, 'wb') as conffh:
            conffh.write(
                etree.tostring(node, pretty_print=True, encoding="utf-8"))

    # Tell the server to register the config file
    server.register_databaseConfigFile(session, dbConfigPath)
    return 0
Esempio n. 49
0
class Chapter_view(object):
    """Render the chapters of a book from 'db_dickens' as HTML pages."""

    def __init__(self):
        """Open a session on 'db_dickens' and fetch the objects used here."""
        self.session = Session()
        self.session.database = 'db_dickens'
        self.serv = SimpleServer(
            self.session,
            os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
        self.db = self.serv.get_object(self.session, self.session.database)
        self.qf = self.db.get_object(self.session, 'defaultQueryFactory')
        self.resultSetStore = self.db.get_object(self.session,
                                                 'resultSetStore')
        self.idxStore = self.db.get_object(self.session, 'indexStore')

    def search_book(self, book):
        """Return the result set of searching 'c3.book-idx' for ``book``."""
        book_query = self.qf.get_query(self.session,
                                       'c3.book-idx = "%s"' % book)
        return self.db.search(self.session, book_query)

    def create_chapterXhtml(self, book_results):
        """Build one HTML page per chapter of a book.

        ``book_results`` is passed straight to search_book(), i.e. despite
        its name it is the book identifier to search for.  Every ``w``
        element of every paragraph becomes a ``<span>`` whose id is the
        running word count within the chapter.

        Returns a dictionary mapping each complete HTML page (string) to the
        chapter's ``num`` attribute.  The book and chapter number of each
        processed chapter are also printed.
        """
        session = self.session

        chapter_dict = {}
        for ch in self.search_book(book_results):
            rec = ch.fetch_record(session)
            tree = rec.get_dom(session).getroottree()
            title = tree.xpath('//div//title')[0].text  ## for html page
            ch_number = tree.xpath('//div')[0].get('num')  ## for filename

            countwords = 0
            paralist = []
            for para in tree.xpath('//div//p'):
                paralist.append('<p>')

                spanlist = []
                for w in para.xpath('./descendant::w'):
                    countwords += 1
                    ## only print n if not empty (as we add space outside
                    ## the spans - see *)
                    # NOTE(review): this pattern looks uncompilable (the
                    # character range "^--" is reversed), in which case
                    # re.match always raises and the handler below always
                    # leaves preceding_n empty - confirm the intended
                    # pattern.  Behaviour is deliberately left unchanged.
                    try:  ## only if there is preceding n
                        preceding_text = w.xpath(
                            './preceding-sibling::n[1]')[0].text
                        if not re.match(r'[^\s$]|[\W|^--$]', preceding_text):
                            preceding_n = preceding_text
                        else:
                            preceding_n = ''
                    except Exception:
                        preceding_n = ''
                    try:  ## only if there is following n
                        following_n = w.xpath(
                            './following-sibling::n[1]')[0].text
                    except Exception:
                        following_n = ''
                    # A lone space is dropped (spaces are added between the
                    # spans instead); an element with no text counts as empty
                    if following_n is None or following_n == ' ':
                        following_n = ''
                    word = preceding_n + (w.text or '') + following_n

                    spanlist.append('<span id="%s">%s</span>' %
                                    (countwords, word))

                spans = ' '.join(spanlist)  ## *
                spans = re.sub('--', ' --', spans)

                paralist.append(spans)
                paralist.append('</p>')

            paras = ''.join(paralist)
            chapter = ''.join([
                '<!DOCTYPE html>\n',
                '<html lang="en">\n',
                '<head>\n',
                '<meta charset="utf-8">\n',
                '<title>', title, '</title>\n',
                '</head>\n\n',
                '<body>\n\n', paras, '\n\n',
                '</body>\n\n',
                '</html>',
            ])

            chapter_dict[chapter] = ch_number
            print('%s %s' % (tree.xpath('//div')[0].get('book'), ch_number))

        return chapter_dict
from oaipmh.error import *

# Cheshire3 architecture
# Root of the Cheshire3 installation: $C3HOME if set, else '/home/cheshire'.
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')

session = Session()

# mod_python is optional: only when it imports cleanly is the session
# flagged as running in the "apache" environment.
try:
    from mod_python import apache
    from mod_python.util import FieldStorage
except ImportError:
    pass
else:
    session.environment = "apache"
    
serv = SimpleServer(session, os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))
lxmlParser = serv.get_object(session, 'LxmlParser')

# Both dictionaries are keyed by the protocol map's databaseName:
# configs holds the OAI-PMH protocol map, dbs the database object.
configs = {}
dbs = {}

serv._cacheDatabases(session)        
for db in serv.databases.values():
    # Only databases with a truthy 'oai-pmh' setting are considered
    if db.get_setting(session, 'oai-pmh'):
        db._cacheProtocolMaps(session)
        # NOTE: `map` shadows the builtin of the same name at module level
        map = db.protocolMaps.get('http://www.openarchives.org/OAI/2.0/OAI-PMH', None)
        # check that there's a path and that it can actually be requested from this handler
        if (map is not None):
            configs[map.databaseName] = map
            dbs[map.databaseName] = db
Esempio n. 51
0
class Cheshire3WordList(object):
    '''
    Main class used to build Cheshire3 word lists. These
    can be of individual tokens or of clusters (also
    called n-grams or phrases).

    Call build_wordlist() first; the resulting dataframe is then available
    as the ``wordlist`` attribute and the summed counts as ``total``.
    '''
    def __init__(self):
        '''
        Sets up the connection with Cheshire3 (database 'db_dickens') and
        fetches the query factory, result set store and index store.
        '''
        self.session = Session()
        self.session.database = 'db_dickens'
        self.serv = SimpleServer(
            self.session,
            os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
        self.db = self.serv.get_object(self.session, self.session.database)
        self.qf = self.db.get_object(self.session, 'defaultQueryFactory')
        self.resultSetStore = self.db.get_object(self.session,
                                                 'resultSetStore')
        self.idxStore = self.db.get_object(self.session, 'indexStore')

    def build_subcorpus_clauses(self, subcorpora):
        '''
        Takes a list of subcorpora and turns it into a list of
        CQL clauses that Cheshire3 can process.

        'dickens' and 'ntc' are searched in the subCorpus index, any other
        value in the book index.

        Raises IOError if subcorpora is not a list.
        '''
        if not isinstance(subcorpora, list):
            # Call form of raise: valid on both Python 2 and Python 3
            raise IOError('subcorpora should be a list')

        clauses = []
        for subcorpus in subcorpora:
            if subcorpus in ['dickens', 'ntc']:
                idx = 'subCorpus-idx'
            else:
                idx = 'book-idx'
            clauses.append('c3.{0} = "{1}"'.format(idx, subcorpus))
        return clauses

    def get_facets(self, index_name, subcorpora):
        '''
        Get the actual word counts ('facets') using the
        index and the list of subcorpora.

        The subcorpus clauses are combined with ' or ' into a single query;
        the facets are computed by the index object named index_name over
        that query's results.
        '''
        clauses = self.build_subcorpus_clauses(subcorpora)
        query = self.qf.get_query(self.session, ' or '.join(clauses))
        results = self.db.search(self.session, query)
        idx = self.db.get_object(self.session, index_name)
        facets = idx.facets(self.session, results)
        return facets

    def facets_to_df(self, facets):
        '''
        Converts the facets into a dataframe that can be manipulated
        more easily.

        The dataframe has the columns 'Type', 'Raw facet', 'Count',
        'Percentage' (of the summed counts, rounded to 2 decimals) and an
        empty-string 'Empty' column, sorted by descending count.  The summed
        count is also stored on self.total.
        '''
        def select_third_value(value):
            '''
            Facets come in the following format:
            [(u'a', (38, 879, 84372)),
             (u'all', (1067, 879, 15104)),

            This function returns the third values, respectively 84372 and 15104
            in the example above.
            '''
            return value[2]

        dataframe = pd.DataFrame(facets, columns=['Type', 'Raw facet'])
        # Number the rows from 1 rather than 0
        dataframe.index += 1

        dataframe['Count'] = dataframe['Raw facet'].apply(select_third_value)
        self.total = dataframe.Count.sum()
        dataframe['Percentage'] = dataframe.Count / self.total * 100
        dataframe['Percentage'] = dataframe['Percentage'].round(decimals=2)
        dataframe.sort_values(by='Count', ascending=False, inplace=True)
        dataframe['Empty'] = ''
        return dataframe

    def wordlist_to_json(self):
        '''
        Returns a json string that is
        adapted to the CLiC API: one
        [Empty, Type, Count, Percentage] row per entry.

        Requires build_wordlist() to have been called.
        '''

        # do not work on the original
        wordlist = copy.deepcopy(self.wordlist)
        del wordlist['Raw facet']
        wordlist = wordlist[['Empty', 'Type', 'Count', 'Percentage']]
        return wordlist.to_json(orient='values')

    def build_wordlist(self, index_name, subcorpora):
        '''
        The core method that needs to be called in order to
        actually generate the keyword list. Once this method is called
        the .wordlist attribute will return the wordlist.
        '''
        facets = self.get_facets(index_name, subcorpora)
        self.wordlist = self.facets_to_df(facets)
Esempio n. 52
0
#!/bin/env python

import os
import sys

import cheshire3
from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3.internal import cheshire3Root

# Launch a Cheshire session
session = Session()
serverConfig = os.path.join(cheshire3Root, "configs", "serverConfig.xml")
serv = SimpleServer(session, serverConfig)


# Grab our objects
db = serv.get_object(session, "db_test_query_highlight")
recStore = db.get_object(session, "recordStore")
qfac = db.get_object(session, "defaultQueryFactory")

# Search for the phrase "e f g" anywhere in the records
q = qfac.get_query(session, 'cql.anywhere = "e f g"')
rs = db.search(session, q)

# rs[0].proxInfo = [[[1, 4, 8, 7], [1, 5, 10, 8], [1, 6, 12, 9]]]
# being element 1,1,1 /  wordoffset 4,5,6  / byteoffset 8,10,12 / termid 7,8,9

# Only the first hit is used; rs[0] raises if the search matched nothing.
rec = rs[0].fetch_record(session)

# Run the first matching record through the 'LOQTHTransformer' object.
# NOTE(review): presumably this is the transformer that marks up the query
# terms located via proxInfo - confirm against the database configuration.
loqth = db.get_object(session, "LOQTHTransformer")
doc = loqth.process_record(session, rec)
Esempio n. 53
0
from xml.dom.minidom import parseString as domParseString, Document as DomDocument
import time, os
import cStringIO as StringIO

from cheshire3.server import SimpleServer
from cheshire3.utils import elementType
from cheshire3.baseObjects import Session
from cheshire3 import document
from cheshire3.workflow import SimpleWorkflow, CachingWorkflow
from cheshire3 import dynamic
from cheshire3.exceptions import *
from cheshire3.internal import cheshire3Root

# Module-level Cheshire3 objects built once at import time.
session = Session()
session.environment = "apache"
serv = SimpleServer(session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))
mdp = serv.get_object(session, 'defaultParser')

# Collect the workflow protocol map of every database that has a truthy
# 'remoteWorkflow' setting, keyed by the map's databaseUrl.
configs = {}
serv._cacheDatabases(session)
for db in serv.databases.values():
    if db.get_setting(session, 'remoteWorkflow'):
        db._cacheProtocolMaps(session)
        # NOTE: `map` shadows the builtin of the same name; the name is kept
        # because it is a module-level global.
        map = db.protocolMaps.get('http://www.cheshire3.org/protocols/workflow/1.0/', None)
        # A database may enable the setting without providing a protocol
        # map; skip it rather than fail on None.databaseUrl at import time
        if map is not None:
            configs[map.databaseUrl] = {'http://www.cheshire3.org/protocols/workflow/1.0/' : map}
        

class reqHandler:
Esempio n. 54
0
# Cheshire3 architecture
# Root of the Cheshire3 installation: $C3HOME if set, else '/home/cheshire'.
cheshirePath = os.environ.get('C3HOME', '/home/cheshire')

session = Session()

# mod_python is optional: only when it imports cleanly is the session
# flagged as running in the "apache" environment.
try:
    from mod_python import apache
    from mod_python.util import FieldStorage
except ImportError:
    pass
else:
    session.environment = "apache"

serv = SimpleServer(
    session,
    os.path.join(cheshirePath, 'cheshire3', 'configs', 'serverConfig.xml'))
lxmlParser = serv.get_object(session, 'LxmlParser')

# configs maps a protocol map's databaseName to the OAI-PMH protocol map.
# NOTE(review): dbs is created but never filled in the visible code - confirm
# whether a `dbs[map.databaseName] = db` line is missing from this loop.
configs = {}
dbs = {}

serv._cacheDatabases(session)
for db in serv.databases.values():
    # Only databases with a truthy 'oai-pmh' setting are considered
    if db.get_setting(session, 'oai-pmh'):
        db._cacheProtocolMaps(session)
        # NOTE: `map` shadows the builtin of the same name at module level
        map = db.protocolMaps.get(
            'http://www.openarchives.org/OAI/2.0/OAI-PMH', None)
        # check that there's a path and that it can actually be requested from this handler
        if (map is not None):
            configs[map.databaseName] = map
class Cheshire3Engine(BaseEngine):
    """Search-engine adapter that creates and opens Cheshire3 databases.

    A single Session and SimpleServer are created when the class body is
    executed and are shared by every instance.

    NOTE(review): ``databaseName`` and ``databasePath`` are read from
    ``self`` but never assigned in this class - presumably they are provided
    by BaseEngine; confirm.
    """
    #schema = Schema(title=TEXT(stored=True), path=TEXT(stored=True), href=ID(stored=True), cfiBase=TEXT(stored=True), spinePos=TEXT(stored=True), content=TEXT)
    #database = 'db_tdo_simple_sru'
    cheshire_metadata_dir = '/cheshire3-metadata'
    session = Session()
    serverConfig = os.path.join(cheshire3Root, 'configs', 'serverConfig.xml')
    server = SimpleServer(session, serverConfig)
    # Cheshire3 objects below are resolved lazily
    queryFactory = None
    db = None
    titleSel = None
    anywhereSel = None
    proxExtractor = None

    def __initializeTitleSelector(self):
        """Fetch the title selector, trying both known identifiers.

        Failure is printed, leaving ``titleSel`` as None.
        """
        try:
            self.titleSel = self.db.get_object(self.session,
                                               'titleXPathSelector')
        except ObjectDoesNotExistException:
            try:
                self.titleSel = self.db.get_object(self.session,
                                                   'titleSelector')
            except ObjectDoesNotExistException as e:
                print(e)

    def __initializeAnywhereSelector(self):
        """Fetch the 'anywhereXPathSelector' object; print if missing."""
        try:
            self.anywhereSel = self.db.get_object(self.session,
                                                  'anywhereXPathSelector')
        except ObjectDoesNotExistException as e:
            print(e)

    def __initializeProximityExtractor(self):
        """Fetch the 'ProxExtractor' object; print if missing."""
        try:
            self.proxExtractor = self.db.get_object(self.session,
                                                    'ProxExtractor')
        except ObjectDoesNotExistException as e:
            print(e)

    def __highlight(self, text, term, n):
        """Return one concordance snippet per occurrence of term in text.

        Each snippet consists of up to ``n`` characters of context before the
        match, the term wrapped in ``<b class="match term0">``, and up to
        ``n`` characters of context after it.  The term is matched literally
        (regex metacharacters are escaped).  An empty term yields no snippets.
        """
        if not term:
            return []
        term_len = len(term)
        text_len = len(text)
        term_concordance = []
        for match in re.finditer(re.escape(term), text):
            idx = match.start()
            # Clamp so a match near the start does not produce a negative
            # (wrapping) slice index
            start = max(idx - n, 0)
            after = idx + term_len
            end = min(after + n, text_len)
            # Trailing context starts after the term so that the term is not
            # repeated following its highlighted copy
            term_concordance.append(text[start:idx] +
                                    '<b class="match term0">' + term + '</b>' +
                                    text[after:end])
        return term_concordance

    def open(self):
        """Open the database named ``self.databaseName``.

        Any exception raised while fetching the database is printed rather
        than propagated; ``self.db`` is then left unchanged.
        """
        try:
            self.db = self.server.get_object(self.session, self.databaseName)
            self.session.database = self.databaseName
        except Exception as e:
            print(e)
            print("openning database {} failed".format(self.databaseName))

    def create(self):
        """Create the database directory, metadata file and Cheshire3 db.

        The metadata file is (re)initialized to an empty JSON object, then
        the ``cheshire3-init`` command is run for the database.
        """
        if not os.path.exists(self.databasePath):
            os.makedirs(self.databasePath)

        # create cheshire metadata directory if needed, then initialize with
        # an empty mapping
        metadata_path = self.databasePath + self.cheshire_metadata_dir
        if not os.path.exists(metadata_path):
            os.makedirs(metadata_path)
        with open(metadata_path + '/' + self.databaseName, 'w') as f:
            json.dump({}, f)

        try:
            print("openning database {} to create".format(self.databasePath))
            os.system("cheshire3-init " + self.databasePath + " --database=" +
                      self.databaseName)
        except Exception as e:
            print(e)
Esempio n. 56
0
        sys.stderr.write("for help use --help\n")
        sys.stderr.flush()
        return 2
    except Error as e:
        lgr.log_lvl(session, 40, str(e))
        if debug:
            raise
        return 1


# Build environment...
# Module-level Cheshire3 objects shared by the functions of this script.
session = Session()
serv = SimpleServer(
    session,
    os.path.join(cheshire3Root,
                 'configs',
                 'serverConfig.xml'
                 )
)
# The session is pointed at the hub editing database only after the server
# exists.
session.database = 'db_hubedit'

db = serv.get_object(session, 'db_hubedit')
# Object registered as the database's 'defaultLogger' path.
lgr = db.get_path(session, 'defaultLogger')
authStore = db.get_object(session, 'hubAuthStore')
superAuthStore = db.get_object(session, 'hubSuperAuthStore')

xmlp = db.get_object(session, 'LxmlParser')

# Run main() when executed as a script, using its return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
Esempio n. 57
0
import sys
import os
import re

from lxml import etree
from lxml.builder import ElementMaker

from cheshire3.baseObjects import Session
from cheshire3.server import SimpleServer
from cheshire3 import cqlParser
from cheshire3.internal import cheshire3Version, cheshire3Root
from cheshire3 import exceptions as c3errors

# Module-level session (flagged as running under "apache") and the server
# loaded from the serverConfig.xml under cheshire3Root.
session = Session()
session.environment = "apache"
serv = SimpleServer(session,
                    os.path.join(cheshire3Root, 'configs', 'serverConfig.xml'))

# find configs for databases permitted to be served by SRU
configs = {}
if len(serv.databaseConfigs) < 25:
    # relatively few dbs - we can safely cache them
    serv._cacheDatabases(session)
    for db in serv.databases.itervalues():
        if (db.get_setting(session, 'SRW') or db.get_setting(session, 'srw')
                or db.get_setting(session, 'SRU')
                or db.get_setting(session, 'sru')):
            db._cacheProtocolMaps(session)
            map = db.protocolMaps.get('http://www.loc.gov/zing/srw/', None)
            # Check that there's a path and that it can actually be requested
            # from this handler
            if (map is not None):