Ejemplo n.º 1
0
 def tearDown(self):
     """Run the inherited teardown, then destroy the fulltext index and
     clear the triple store."""
     # Switching setupclass off makes sure the inherited tearDown deletes
     # all files.
     self.setupclass = False
     super(AdvancedAPI, self).tearDown()
     fulltext = FulltextIndex.connect(self.indextype, self.indexlocation,
                                      [DocumentRepository()])
     fulltext.destroy()
     triples = TripleStore.connect(self.storetype, self.storelocation,
                                   self.storerepository)
     triples.clear()
Ejemplo n.º 2
0
    def query(self, environ):
        """Run a fulltext query described by an environ mapping.

        :param environ: Mapping that must hold ``QUERY_STRING`` and
            ``PATH_INFO``; ``exclude_types`` and ``boost_types`` are
            optional and default to ``None``.
        :returns: dict with the keys ``startIndex``, ``itemsPerPage``,
            ``totalResults``, ``duration``, ``current`` and ``items``,
            plus ``statistics`` when the parsed parameters ask for it.
        """
        # The connection is needed here, although connecting ideally
        # should not have to call exists() (which costs one HTTP call).
        index = FulltextIndex.connect(self.config.indextype,
                                      self.config.indexlocation,
                                      self.repos)
        querystring = environ['QUERY_STRING']
        q, param, pagenum, pagelen, stats = self.parse_parameters(
            querystring, index)
        # Autocomplete requests are recognised by how the query string ends.
        is_autocomplete = querystring.endswith("_ac=true")
        hits, pager = index.query(
            q=q,
            pagenum=pagenum,
            pagelen=pagelen,
            ac_query=is_autocomplete,
            exclude_types=environ.get('exclude_types', None),
            boost_types=environ.get('boost_types', None),
            **param)
        items = self.mangle_results(hits, is_autocomplete)

        # 3.1 create container for results
        container = {
            "startIndex": pager['firstresult'] - 1,
            "itemsPerPage": int(param.get('_pageSize', '10')),
            "totalResults": pager['totalresults'],
            "duration": None,  # none
            "current": environ['PATH_INFO'] + "?" + querystring,
            "items": items,
        }

        # 4. add stats, maybe
        if stats:
            container["statistics"] = self.stats(items)
        return container
Ejemplo n.º 3
0
    def _search_run_query(self, queryparams, boost_repos=None):
        """Run a fulltext search built from request parameters.

        :param queryparams: Mapping of request parameters. ``q`` holds the
            query text and ``p`` the page number (defaulting to 1); all
            remaining entries are passed on to the index query as keyword
            arguments, with ``type`` renamed to ``repo``.
        :param boost_repos: Passed unchanged to the index query.
        :returns: The ``(res, pager)`` pair produced by the index query.
        """
        index = FulltextIndex.connect(self.config.indextype,
                                      self.config.indexlocation, self.repos)
        searchterm = queryparams.get('q')
        if isinstance(searchterm, bytes):  # happens on py26
            searchterm = searchterm.decode("utf-8")  # pragma: no cover
        # A trailing "*" is deliberately NOT appended to the query. The
        # default simple_query_string query might benefit from prefix
        # matching (eg "personuppgiftslag" matching a label field that
        # contains "personuppgiftslag (1998:204)"), but that seems to
        # conflict with stemming/indexing: "bulvanutredningen*" doesn't
        # match the indexed "bulvanutredningen", which has been stemmed
        # to "bulvanutredning".
        page = int(queryparams.get('p', '1'))
        extra = dict(queryparams)
        # Our internal API:s renamed the "type" parameter to "repo", since
        # ElasticSearch 7.x doesn't have types anymore (the corresponding
        # data is now stored in a "repo" field). The URL parameters have
        # not been changed (yet), so translate the external name here.
        if 'type' in extra:
            extra["repo"] = extra.pop("type")
        # 'q' and 'p' were consumed above and must not be sent twice.
        extra.pop('q', None)
        extra.pop('p', None)
        hits, paging = index.query(searchterm,
                                   pagenum=page,
                                   boost_repos=boost_repos,
                                   **extra)
        return hits, paging
Ejemplo n.º 4
0
    def _search_run_query(self, queryparams, boost_types=None):
        """Run a fulltext search built from request parameters.

        :param queryparams: Mapping of request parameters. ``q`` holds the
            query text and ``p`` the page number (defaulting to 1); all
            remaining entries are passed on to the index query as keyword
            arguments.
        :param boost_types: Passed unchanged to the index query.
        :returns: The ``(res, pager)`` pair produced by the index query.
        """
        index = FulltextIndex.connect(self.config.indextype,
                                      self.config.indexlocation, self.repos)
        searchterm = queryparams.get('q')
        if isinstance(searchterm, bytes):  # happens on py26
            searchterm = searchterm.decode("utf-8")  # pragma: no cover
        # A trailing "*" is deliberately NOT appended to the query. The
        # default simple_query_string query might benefit from prefix
        # matching (eg "personuppgiftslag" matching a label field that
        # contains "personuppgiftslag (1998:204)"), but that seems to
        # conflict with stemming/indexing: "bulvanutredningen*" doesn't
        # match the indexed "bulvanutredningen", which has been stemmed
        # to "bulvanutredning".
        page = int(queryparams.get('p', '1'))
        extra = dict(queryparams)
        # 'q' and 'p' were consumed above and must not be sent twice.
        extra.pop('q', None)
        extra.pop('p', None)
        hits, paging = index.query(searchterm,
                                   pagenum=page,
                                   boost_types=boost_types,
                                   **extra)
        return hits, paging
Ejemplo n.º 5
0
    def test_create(self):
        # Run the inherited, generic creation checks first.
        super(WhooshBasicIndex, self).test_create()

        # Low-level check 1: something has been written to the index
        # location.
        self.assertNotEqual(os.listdir(self.location), [])
        # Low-level check 2: whoosh itself recognises the location as an
        # index.
        self.assertTrue(whoosh.index.exists_in(self.location))

        # Low-level check 3: the schema of the live index uses the
        # expected whoosh field types.
        actual = self.index.index.schema
        fields = whoosh.fields
        expected = fields.Schema(
            basefile=fields.ID(stored=True),
            dcterms_identifier=fields.ID(field_boost=16, stored=True),
            dcterms_issued=fields.DATETIME(stored=True),
            dcterms_publisher=fields.IDLIST(stored=True),
            dcterms_title=fields.TEXT(field_boost=4, stored=True),
            # rdf_type corresponds to URI not Label
            rdf_type=fields.ID(stored=True, field_boost=1.1),
            repo=fields.ID(stored=True),
            text=fields.TEXT(stored=True),
            uri=fields.ID(unique=True, stored=True),
        )
        self.assertEqual(sorted(expected.names()), sorted(actual.names()))
        # Pairing each field with its name makes a failure message say
        # which field differed.
        for name in actual.names():
            self.assertEqual((name, expected[name]), (name, actual[name]))

        # Finally, connect once more: this time an existing index is
        # opened instead of a new one being created. A stand-in docrepo
        # is needed for the call.
        self.index = FulltextIndex.connect("WHOOSH", self.location,
                                           [DocumentRepository()])
Ejemplo n.º 6
0
    def test_create(self):
        # The basic, inherited tests go first.
        super(WhooshBasicIndex, self).test_create()

        # After that come the whoosh-specific, lower-level tests.
        # 1. files must have appeared at the configured location
        created_files = os.listdir(self.location)
        self.assertNotEqual(created_files, [])
        # 2. and whoosh must accept that location as one of its indexes
        self.assertTrue(whoosh.index.exists_in(self.location))

        # 3. the schema, expressed in whoosh's own field types, must be
        # exactly what we expect
        got = self.index.index.schema
        wf = whoosh.fields
        want = wf.Schema(
            basefile=wf.ID(stored=True),
            dcterms_identifier=wf.ID(field_boost=16, stored=True),
            dcterms_issued=wf.DATETIME(stored=True),
            dcterms_publisher=wf.IDLIST(stored=True),
            dcterms_title=wf.TEXT(field_boost=4, stored=True),
            # rdf_type corresponds to URI not Label
            rdf_type=wf.ID(stored=True, field_boost=1.1),
            repo=wf.ID(stored=True),
            text=wf.TEXT(stored=True),
            uri=wf.ID(unique=True, stored=True),
        )
        self.assertEqual(sorted(want.names()), sorted(got.names()))
        # Compare field by field; the field name is part of each compared
        # tuple so that it shows up in a failure message.
        for fieldname in got.names():
            self.assertEqual((fieldname, want[fieldname]),
                             (fieldname, got[fieldname]))

        # Last step: connecting again should open the existing index
        # rather than create one (a stand-in docrepo is needed).
        repos = [DocumentRepository()]
        self.index = FulltextIndex.connect("WHOOSH", self.location, repos)
Ejemplo n.º 7
0
    def query(self, environ):
        """Query the fulltext index using the parameters found in ``environ``.

        :param environ: Mapping providing ``QUERY_STRING`` and
            ``PATH_INFO`` (both required) and optionally ``exclude_types``
            and ``boost_types``.
        :returns: A dict holding paging information (``startIndex``,
            ``itemsPerPage``, ``totalResults``), ``duration`` (always
            ``None``), the ``current`` request path and query string, and
            the mangled ``items``. ``statistics`` is added when the parsed
            parameters request it.
        """
        # We do need the connection -- but connecting shouldn't
        # necessarily have to call exists() (one HTTP call).
        fulltext = FulltextIndex.connect(self.config.indextype,
                                         self.config.indexlocation,
                                         self.repos)
        raw_query = environ['QUERY_STRING']
        parsed = self.parse_parameters(raw_query, fulltext)
        q, param, pagenum, pagelen, stats = parsed
        ac_query = raw_query.endswith("_ac=true")
        excluded = environ.get('exclude_types', None)
        boosted = environ.get('boost_types', None)
        found, pager = fulltext.query(q=q,
                                      pagenum=pagenum,
                                      pagelen=pagelen,
                                      ac_query=ac_query,
                                      exclude_types=excluded,
                                      boost_types=boosted,
                                      **param)
        mangled = self.mangle_results(found, ac_query)

        # 3.1 create container for results
        start_index = pager['firstresult'] - 1
        items_per_page = int(param.get('_pageSize', '10'))
        total = pager['totalresults']
        current = environ['PATH_INFO'] + "?" + raw_query
        response = {
            "startIndex": start_index,
            "itemsPerPage": items_per_page,
            "totalResults": total,
            "duration": None,  # none
            "current": current,
            "items": mangled,
        }

        # 4. add stats, maybe
        if stats:
            response["statistics"] = self.stats(mangled)
        return response
Ejemplo n.º 8
0
    def setUp(self, mock_requests):
        """Install canned HTTP responses, then connect to the index.

        :param mock_requests: Stand-in for the HTTP client whose ``get``
            and ``put`` attributes get their ``side_effect`` replaced here
            (presumably injected by a patch decorator -- confirm against
            the class definition).
        """
        # GET is answered with a 404 taken from "exists-not.json".
        mock_requests.get.side_effect = canned((404, "exists-not.json"),
                                               create=CREATE_CANNED,
                                               method="get")

        # PUT is answered with a 200 taken from "create.json". The method
        # label must name the verb this canned response is installed on;
        # it used to say "post" even though it is attached to ``put``.
        mock_requests.put.side_effect = canned((200, "create.json"),
                                               create=CREATE_CANNED,
                                               method="put")
        self.location = "http://localhost:9200/ferenda/"
        self.index = FulltextIndex.connect("ELASTICSEARCH", self.location, [])
Ejemplo n.º 9
0
    def setUp(self, mock_requests):
        """Install canned HTTP responses, then connect to the index.

        :param mock_requests: Stand-in for the HTTP client whose ``get``
            and ``put`` attributes get their ``side_effect`` replaced here
            (presumably injected by a patch decorator -- confirm against
            the class definition).
        """
        # GET is answered with a 404 taken from "exists-not.json".
        mock_requests.get.side_effect = canned((404, "exists-not.json"),
                                               create=CREATE_CANNED,
                                               method="get")

        # PUT is answered with a 200 taken from "create.json".
        mock_requests.put.side_effect = canned((200, "create.json"),
                                               create=CREATE_CANNED,
                                               method="put")
        self.location = "http://localhost:9200/ferenda/"
        repos = [DocumentRepository()]
        self.index = FulltextIndex.connect("ELASTICSEARCH", self.location,
                                           repos)
Ejemplo n.º 10
0
    def queryindex(self, querystring):
        """Query the system fulltext index and print the matching documents.

        One line is printed per row, in the form
        ``identifier (about): text``.

        :param querystring: The query
        :type querystring: str
        """
        fulltext = FulltextIndex.connect(self.config.indextype,
                                         self.config.indexlocation)
        for hit in fulltext.query(querystring):
            summary = (hit['identifier'], hit['about'], hit['text'])
            print("%s (%s): %s" % summary)
Ejemplo n.º 11
0
    def query(self, request, options=None):
        """Query the fulltext index for the search described by ``request``.

        :param request: Request object; it is handed to
            ``parse_parameters`` and its ``path`` and ``query_string``
            (bytes, decoded as UTF-8) are echoed back in the result.
        :param options: Optional dict of preset options. It is updated in
            place with whatever ``parse_parameters`` returns.
        :returns: A dict with the keys ``startIndex``, ``itemsPerPage``,
            ``totalResults``, ``duration``, ``current`` and ``items``
            (plus ``statistics`` when ``options["stats"]`` is truthy), or
            only the items when ``options["autocomplete"]`` is truthy.
        """
        # The connection is needed -- but connecting shouldn't
        # necessarily have to call exists() (one HTTP call).
        index = FulltextIndex.connect(self.config.indextype,
                                      self.config.indexlocation, self.repos)
        # parse_parameters is expected to return something like:
        # {
        #  "q": "freetext",
        #  "fields": {"dcterms_publisher": ".../org/di",
        #             "dcterms_issued": "2018"}
        #  "pagenum": 1,
        #  "pagelen": 10,
        #  "autocomplete": False,
        #  "exclude_repos": ["mediawiki"],
        #  "boost_repos": [("sfs", 10)],
        #  "include_fragments": False
        # }
        if options is None:
            options = {}
        options.update(self.parse_parameters(request, index))
        autocomplete = options.get("autocomplete")
        hits, pager = index.query(
            q=options.get("q"),
            pagenum=options.get("pagenum"),
            pagelen=options.get("pagelen"),
            ac_query=autocomplete,
            exclude_repos=options.get("exclude_repos"),
            boost_repos=options.get("boost_repos"),
            include_fragments=options.get("include_fragments"),
            **options.get("fields"))
        items = self.mangle_results(hits, autocomplete)

        # 3.1 create container for results
        container = {
            "startIndex": pager['firstresult'] - 1,
            "itemsPerPage": options["pagelen"],
            "totalResults": pager['totalresults'],
            "duration": None,  # none
            "current":
            request.path + "?" + request.query_string.decode("utf-8"),
            "items": items,
        }

        # 4. add stats, maybe
        if options["stats"]:
            container["statistics"] = self.stats(items)

        # 5. possibly trim results for easier json consumption
        if options["autocomplete"]:
            return container["items"]
        return container
Ejemplo n.º 12
0
 def test_setup(self):
     # Connect a Whoosh index for two document repositories and check the
     # schema it reports. (Comments are used instead of a docstring so
     # that unittest's reported test description stays unchanged.)
     self.location = mkdtemp()
     try:
         self.index = FulltextIndex.connect("WHOOSH", self.location,
                                            [DocRepo1(), DocRepo2()])
         # Introspecting the schema (particularly if it's derived
         # directly from our definitions, not reverse-engineered from
         # a Whoosh index on-disk) is useful for eg creating dynamic
         # search forms.
         actual = self.index.schema()
         # 'category' used to be listed twice in this literal; a dict
         # keeps only one entry per key, so it is listed once here.
         expected = {
             'uri': Identifier(),
             'repo': Label(),
             'basefile': Label(),
             'title': Text(boost=4),
             'identifier': Label(boost=16),
             'text': Text(),
             'issued': Datetime(),
             'publisher': Label(),
             'abstract': Text(boost=2),
             'category': Keywords(),
             'secret': Boolean(),
             'references': URI(),
         }
         self.assertEqual(actual, expected)
     finally:
         # Remove the temporary directory even when connecting or the
         # assertion fails, so failed runs don't leave directories behind.
         shutil.rmtree(self.location)
Ejemplo n.º 13
0
    def _search_run_query(self, queryparams, boost_types=None):
        """Query the fulltext index with the given request parameters.

        :param queryparams: Mapping of request parameters; ``q`` is the
            query text, ``p`` the page number (1 when absent). Every other
            entry becomes a keyword argument of the index query.
        :param boost_types: Forwarded as-is to the index query.
        :returns: The ``(res, pager)`` pair from the index query.
        """
        idx = FulltextIndex.connect(self.config.indextype,
                                    self.config.indexlocation,
                                    self.repos)
        text = queryparams.get('q')
        if isinstance(text, bytes):  # happens on py26
            text = text.decode("utf-8")  # pragma: no cover
        # No "*" is appended to the query, on purpose. A prefix query
        # would let eg "personuppgiftslag" match a label field containing
        # "personuppgiftslag (1998:204)" with the default
        # simple_query_string query, but it seems to conflict with
        # stemming/indexing: "bulvanutredningen*" doesn't match the
        # indexed "bulvanutredningen" (which has been stemmed to
        # "bulvanutredning").
        pagenum = int(queryparams.get('p', '1'))
        # Everything except the two parameters consumed above is passed
        # through to the index query.
        passthrough = {key: value
                       for key, value in dict(queryparams).items()
                       if key not in ('q', 'p')}
        res, pager = idx.query(text,
                               pagenum=pagenum,
                               boost_types=boost_types,
                               **passthrough)
        return res, pager
Ejemplo n.º 14
0
 def setUp(self):
     """Connect a Whoosh fulltext index located in a directory obtained
     from ``mkdtemp()``."""
     tmpdir = mkdtemp()
     self.location = tmpdir
     self.index = FulltextIndex.connect("WHOOSH", tmpdir, self.repos)
Ejemplo n.º 15
0
 def setUp(self):
     """Connect to the ElasticSearch index at the fixed local URL."""
     # maxDiff = None lifts unittest's cap on the length of failure diffs.
     self.maxDiff = None
     url = "http://localhost:9200/ferenda/"
     self.location = url
     self.index = FulltextIndex.connect("ELASTICSEARCH", url, self.repos)
Ejemplo n.º 16
0
 def setUp(self):
     """Obtain a directory from ``mkdtemp()`` and connect a Whoosh index
     stored there."""
     index_dir = mkdtemp()
     self.location = index_dir
     self.index = FulltextIndex.connect("WHOOSH", index_dir, self.repos)
Ejemplo n.º 17
0
 def setUp(self):
     """Connect an ElasticSearch-backed index at the fixed local URL."""
     # With maxDiff set to None, unittest prints failure diffs in full.
     self.maxDiff = None
     endpoint = "http://localhost:9200/ferenda/"
     self.location = endpoint
     self.index = FulltextIndex.connect("ELASTICSEARCH", endpoint,
                                        self.repos)
Ejemplo n.º 18
0
 def tearDown(self):
     """Run the inherited teardown, then destroy the fulltext index."""
     # Switching setupclass off makes sure the inherited tearDown deletes
     # all files.
     self.setupclass = False
     super(BasicAPI, self).tearDown()
     fulltext = FulltextIndex.connect(self.indextype, self.indexlocation,
                                      [DocumentRepository()])
     fulltext.destroy()
Ejemplo n.º 19
0
 def setUp(self):
     """Connect a Whoosh index for two document repositories and load
     ``custom_dataset`` into it."""
     self.location = mkdtemp()
     repos = [DocRepo1(), DocRepo2()]
     self.index = FulltextIndex.connect("WHOOSH", self.location, repos)
     self.load(custom_dataset)