def testCreateFacet(self):
    """Check that drilldown fields are committed as FacetFields in the document.

    Fields registered as drilldown fields in the FieldRegistry are expected
    to show up as FacetField instances in the document passed to the
    observer; all other fields must remain regular (search) fields. A
    hierarchical drilldown field receives list values, each list becoming
    one facet path, while a '/'-containing string on a non-hierarchical
    field stays a single path element.
    """
    fields = {
        'field1': ['value1'],
        'sorted.field3': ['value3'],
        'untokenized.field4': ['value4'],
        'untokenized.field5': ['value5', 'value6'],
        'untokenized.field6': ['value5/value6'],
        'untokenized.field7': ['valuex'],
        'untokenized.field8': [['grandparent', 'parent', 'child'],
                               ['parent2', 'child']]
    }
    fields2LuceneDoc = Fields2LuceneDoc(
        'tsname',
        fieldRegistry=FieldRegistry(drilldownFields=[
            DrilldownField('untokenized.field4'),
            DrilldownField('untokenized.field5'),
            DrilldownField('untokenized.field6'),
            DrilldownField('untokenized.field8', hierarchical=True),
        ]))
    observer = CallTrace()
    fields2LuceneDoc.addObserver(observer)
    fields2LuceneDoc.ctx.tx = Transaction('tsname')
    fields2LuceneDoc.ctx.tx.locals['id'] = 'identifier'
    for field, values in fields.items():
        for value in values:
            fields2LuceneDoc.addField(field, value)

    consume(fields2LuceneDoc.commit('unused'))

    # The first call recorded by the observer carries the built document.
    document = observer.calledMethods[0].kwargs['document']
    searchFields = [
        f for f in document.getFields() if not FacetField.instance_(f)
    ]
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(['field1', 'sorted.field3', 'untokenized.field7'],
                     [f.name() for f in searchFields])

    facetsFields = [
        FacetField.cast_(f) for f in document.getFields()
        if FacetField.instance_(f)
    ]
    self.assertEqual(6, len(facetsFields))
    # NOTE(review): the expected order below is neither insertion order nor
    # sorted order; presumably it mirrors the iteration order of the
    # `fields` dict on the interpreter in use -- confirm before porting.
    self.assertEqual([
        ('untokenized.field8', ['grandparent', 'parent', 'child']),
        ('untokenized.field8', ['parent2', 'child']),
        ('untokenized.field6', ['value5/value6']),
        ('untokenized.field4', ['value4']),
        ('untokenized.field5', ['value5']),
        ('untokenized.field5', ['value6']),
    ], [(f.dim, list(f.path))
        for f in facetsFields])  # Note: a FacetField doesn't have a name
Esempio n. 2
0
 def testIsUntokenized(self):
     """Check which field names the registry reports as untokenized.

     Asserts that a registered drilldown field and an 'untokenized.'-named
     field are untokenized, that 'other.field' is not, and that explicitly
     registering a field type decides the answer for that field name
     (STRINGFIELD -> untokenized, TEXTFIELD -> tokenized).
     """
     fieldRegistry = FieldRegistry(
         drilldownFields=[DrilldownField('aDrilldownField')])

     for untokenizedName in ('aDrilldownField', 'untokenized.some.field'):
         self.assertTrue(fieldRegistry.isUntokenized(untokenizedName))
     self.assertFalse(fieldRegistry.isUntokenized('other.field'))

     # Registering the same name again with another type changes the outcome.
     registrations = (
         (STRINGFIELD, self.assertTrue),
         (TEXTFIELD, self.assertFalse),
     )
     for fieldType, check in registrations:
         fieldRegistry.register('fieldname', fieldType)
         check(fieldRegistry.isUntokenized('fieldname'))
    def testCreateFacet(self):
        """Check that drilldown fields are committed as facet field dicts.

        Fields registered as drilldown fields in the FieldRegistry are
        expected to appear in the observer's 'fields' kwarg as dicts with a
        "path" key; all other fields must not carry a "path". A hierarchical
        drilldown field receives list values, each list becoming one path,
        while a '/'-containing string on a non-hierarchical field stays a
        single path element.
        """
        inputFields = {
            'field1': ['value1'],
            'sorted.field3': ['value3'],
            'untokenized.field4': ['value4'],
            'untokenized.field5': ['value5', 'value6'],
            'untokenized.field6': ['value5/value6'],
            'untokenized.field7': ['valuex'],
            'untokenized.field8': [['grandparent', 'parent', 'child'], ['parent2', 'child']]
        }
        fields2LuceneDoc = Fields2LuceneDoc('tsname',
            fieldRegistry=FieldRegistry(drilldownFields=[
                DrilldownField('untokenized.field4'),
                DrilldownField('untokenized.field5'),
                DrilldownField('untokenized.field6'),
                DrilldownField('untokenized.field8', hierarchical=True),
            ])
        )
        observer = CallTrace()
        fields2LuceneDoc.addObserver(observer)
        fields2LuceneDoc.ctx.tx = Transaction('tsname')
        fields2LuceneDoc.ctx.tx.locals['id'] = 'identifier'
        for field, values in inputFields.items():
            for value in values:
                fields2LuceneDoc.addField(field, value)

        consume(fields2LuceneDoc.commit('unused'))

        # The first call recorded by the observer carries the produced fields;
        # kept under a separate name so the input dict is not shadowed.
        committedFields = observer.calledMethods[0].kwargs['fields']

        searchFields = [f for f in committedFields if "path" not in f]
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(['field1', 'sorted.field3', 'untokenized.field7'], [f['name'] for f in searchFields])

        facetsFields = [f for f in committedFields if "path" in f]
        self.assertEqual(6, len(facetsFields))
        # NOTE(review): the expected order below is neither insertion order
        # nor sorted order; presumably it mirrors the iteration order of the
        # input dict on the interpreter in use -- confirm before porting.
        self.assertEqual([
                ('untokenized.field8', ['grandparent', 'parent', 'child']),
                ('untokenized.field8', ['parent2', 'child']),
                ('untokenized.field6', ['value5/value6']),
                ('untokenized.field4', ['value4']),
                ('untokenized.field5', ['value5']),
                ('untokenized.field5', ['value6']),
            ], [(f['name'], f['path']) for f in facetsFields])
Esempio n. 4
0
    def testDrilldownFields(self):
        """Check drilldown bookkeeping of the registry and facet field creation.

        Two drilldown fields are passed to the constructor ('noot' being
        hierarchical) and a third ('mies', single-valued) is registered
        afterwards. The registry must answer the drilldown, hierarchical,
        multivalued and untokenized queries accordingly, and
        createFacetField must return the expected dict.
        """
        registry = FieldRegistry(drilldownFields=[
            DrilldownField(name='aap'),
            DrilldownField(name='noot', hierarchical=True),
        ])
        registry.registerDrilldownField(fieldname='mies', multiValued=False)

        # Every registered name is a drilldown field; an unknown name is not.
        for knownName in ('aap', 'noot', 'mies'):
            self.assertTrue(registry.isDrilldownField(knownName))
        self.assertFalse(registry.isDrilldownField('vuur'))

        # Only 'noot' was declared hierarchical.
        self.assertFalse(registry.isHierarchicalDrilldown('aap'))
        self.assertTrue(registry.isHierarchicalDrilldown('noot'))

        # Only 'mies' was registered with multiValued=False.
        for multiValuedName in ('aap', 'noot'):
            self.assertTrue(registry.isMultivaluedDrilldown(multiValuedName))
        self.assertFalse(registry.isMultivaluedDrilldown('mies'))
        self.assertTrue(registry.isUntokenized('mies'))

        expectedFacetField = {
            "type": "FacetField",
            "name": "name",
            "path": ["value"],
        }
        facetField = registry.createFacetField("name", ["value"])
        self.assertEqual(expectedFacetField, facetField)
Esempio n. 5
0
    def testDrilldownFields(self):
        """Check drilldown bookkeeping of the registry and its facets config.

        Two drilldown fields are passed to the constructor ('noot' being
        hierarchical) and a third ('mies', single-valued) is registered
        afterwards. Besides the registry's own answers, the dimension
        configs exposed through `facetsConfig` must reflect the same
        hierarchical / multi-valued settings.
        """
        drilldownFields = [
            DrilldownField(name='aap'),
            DrilldownField(name='noot', hierarchical=True)
        ]
        registry = FieldRegistry(drilldownFields=drilldownFields)
        registry.registerDrilldownField(fieldname='mies', multiValued=False)
        self.assertTrue(registry.isDrilldownField('aap'))
        self.assertTrue(registry.isDrilldownField('noot'))
        self.assertTrue(registry.isDrilldownField('mies'))
        self.assertFalse(registry.isDrilldownField('vuur'))
        self.assertFalse(registry.isHierarchicalDrilldown('aap'))
        self.assertTrue(registry.isHierarchicalDrilldown('noot'))

        facetsConfig = registry.facetsConfig
        dimConfigs = facetsConfig.getDimConfigs()
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(set(['aap', 'noot', 'mies']),
                         set(dimConfigs.keySet()))
        self.assertFalse(dimConfigs.get('aap').hierarchical)
        self.assertTrue(dimConfigs.get('noot').hierarchical)
        self.assertTrue(dimConfigs.get('noot').multiValued)
        self.assertFalse(dimConfigs.get('mies').multiValued)
 def testDrilldownFieldQuery(self):
     """Check conversion of queries on a hierarchical drilldown field.

     A plain value must convert to a DrillDown term with a one-element
     path; a quoted value containing '>' must convert to a path with one
     element per '>'-separated segment.
     """
     self.fieldRegistry = FieldRegistry(
         [DrilldownField('field', hierarchical=True)])
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(
         dict(type="TermQuery",
              term=dict(field="field", path=["value"], type="DrillDown")),
         self._convert("field = value"))
     self.assertEqual(
         dict(type="TermQuery",
              term=dict(field="field",
                        path=["value", "value1"],
                        type="DrillDown")),
         self._convert("field = \"value>value1\""))
 def testAddFacetField(self):
     """Check that addFacetField contributes exactly one facet field.

     One regular field and one facet field are added; after commit, the
     'fields' kwarg of the first observed call must contain exactly one
     entry with a "path" key.
     """
     fields2LuceneDoc = Fields2LuceneDoc('tsname',
         fieldRegistry=FieldRegistry(drilldownFields=[
             DrilldownField('untokenized.field'),
         ])
     )
     observer = CallTrace()
     fields2LuceneDoc.addObserver(observer)
     fields2LuceneDoc.ctx.tx = Transaction('tsname')
     fields2LuceneDoc.ctx.tx.locals['id'] = 'identifier'
     fields2LuceneDoc.addField('field', 'value')
     fields2LuceneDoc.addFacetField('untokenized.field', 'untokenized value')
     consume(fields2LuceneDoc.commit('unused'))
     fields = observer.calledMethods[0].kwargs['fields']
     facetsFields = [f for f in fields if "path" in f]
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(1, len(facetsFields))
 def testAddFacetField(self):
     """Check that addFacetField contributes exactly one FacetField.

     One regular field and one facet field are added; after commit, the
     document passed in the first observed call must contain exactly one
     FacetField instance.
     """
     fields2LuceneDoc = Fields2LuceneDoc(
         'tsname',
         fieldRegistry=FieldRegistry(drilldownFields=[
             DrilldownField('untokenized.field'),
         ]))
     observer = CallTrace()
     fields2LuceneDoc.addObserver(observer)
     fields2LuceneDoc.ctx.tx = Transaction('tsname')
     fields2LuceneDoc.ctx.tx.locals['id'] = 'identifier'
     fields2LuceneDoc.addField('field', 'value')
     fields2LuceneDoc.addFacetField('untokenized.field',
                                    'untokenized value')
     consume(fields2LuceneDoc.commit('unused'))
     document = observer.calledMethods[0].kwargs['document']
     facetsFields = [
         FacetField.cast_(f) for f in document.getFields()
         if FacetField.instance_(f)
     ]
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(1, len(facetsFields))
Esempio n. 9
0
    def testAdd(self):
        """Check the field dicts that `add` sends to `addDocument`.

        A list of (name, value) pairs is added through
        FieldsListToLuceneDocument. Exactly one `addDocument` call is
        expected, whose 'fields' kwarg holds TextField dicts for regular
        fields, FacetField dicts for the registered drilldown field (string
        values become one-element paths, list values are used as the path,
        and the empty list yields no field) and KeyField dicts for the
        '__key__.'-prefixed fields.
        """
        class Factory():
            def __init__(self, observable, untokenizedFieldnames):
                self.observable = observable
                self.untokenizedFieldnames = untokenizedFieldnames

            def fieldsFor(self, fieldname, value):
                # Generator that hands back its result through StopIteration;
                # the unreachable `yield` is what makes this a generator.
                # NOTE(review): on Python 3.7+ (PEP 479) raising StopIteration
                # inside a generator surfaces as RuntimeError -- confirm the
                # target interpreter before porting.
                raise StopIteration([(fieldname, value)])
                yield
        fieldFactory = Factory

        fieldRegistry = FieldRegistry(drilldownFields=[DrilldownField('drilldown.field')])
        index = FieldsListToLuceneDocument(fieldRegistry, untokenizedFieldnames=[], indexFieldFactory=fieldFactory)
        observer = CallTrace(emptyGeneratorMethods=['addDocument'])
        index.addObserver(observer)
        longSpecialCharacterValue = u'\u041c\u0438\u043d\u0438\u0441\u0442\u0435\u0440\u0441\u0442\u0432\u043e \u0420\u044b\u0431\u043d\u043e\u0439 \u041f\u0440\u043e\u043c\u044b\u0448\u043b\u0435\u043d\u043d\u043e\u0441\u0438 \u0421\u043e\u044e\u0437\u0430 \u0421\u0421\u0420, \u0422\u0438\u0445\u043e\u043e\u043a\u0435\u0430\u043d\u0438\u0441\u043a\u0438\u0439 \u041d\u0430\u0443\u0447\u043d\u043e-\u0418\u0441\u0441\u043b\u0435\u0434\u043e\u0432\u0430\u0442\u0435\u043b\u044c\u0441\u043a\u0438\u0439 \u0418\u043d\u0441\u0442\u0438\u0442\u0443\u0442 \u0420\u044b\u0431\u043d\u043e\u0433\u043e \u0425\u043e\u0437\u044f\u0439\u0441\u0442\u0432\u0430 \u0438 \u041e\u043a\u0435\u0430\u043d\u043e\u0433\u0440\u0430\u0444\u0438\u0438, \u0412\u043b\u0430\u0434\u0438\u0432\u043e\u0441\u0442\u043e\u043a'
        fieldslist = [
            ("field1", "value1"),
            ("field2", "value2"),
            ("drilldown.field", "a drilldown value"),
            ("drilldown.field", longSpecialCharacterValue),
            ("drilldown.field", ['a', 'b']),
            ("drilldown.field", []),
            ("__key__.field", "a key value"),
            ("__key__.field1", 2),
        ]
        consume(index.add(identifier="", fieldslist=fieldslist))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(['addDocument'], observer.calledMethodNames())
        # Kept under a separate name so the input list is not shadowed.
        addedFields = observer.calledMethods[0].kwargs['fields']
        self.assertEqual([
                {'name': 'field1', 'type': 'TextField', 'value': 'value1'},
                {'name': 'field2', 'type': 'TextField', 'value': 'value2'},
                {'name': 'drilldown.field', 'type': 'FacetField', 'path': ['a drilldown value']},
                {'name': 'drilldown.field', 'type': 'FacetField', 'path': [longSpecialCharacterValue]},
                {'name': 'drilldown.field', 'type': 'FacetField', 'path': ['a', 'b']},
                {'name': '__key__.field', 'type': 'KeyField', 'value': 'a key value'},
                {'name': '__key__.field1', 'type': 'KeyField', 'value': 2},
            ], addedFields)
Esempio n. 10
0
 def testIsIndexField(self):
     """Check which fields the registry reports as index fields.

     'field2' and 'field3' are drilldown fields, 'field1' and 'field2' are
     term vector fields. Only 'field3' (drilldown without term vector) is
     expected not to be an index field; the unregistered 'field4' is.
     """
     drilldownFields = [DrilldownField(name) for name in ['field2', 'field3']]
     fieldRegistry = FieldRegistry(
         drilldownFields=drilldownFields,
         termVectorFields=['field1', 'field2'],
     )
     for indexedName in ('field1', 'field2'):
         self.assertTrue(fieldRegistry.isIndexField(indexedName))
     self.assertFalse(fieldRegistry.isIndexField('field3'))
     self.assertTrue(fieldRegistry.isIndexField('field4'))
Esempio n. 11
0
 def testDrilldownFieldQuery(self):
     """Check that a query on a drilldown field converts to a drilldown TermQuery.

     The composer is rebuilt with a registry that knows 'field' as a
     drilldown field; "field = value" must then convert to a TermQuery on
     the DrillDownQuery term for ("$facets", "field", "value").
     """
     registry = FieldRegistry([DrilldownField('field')])
     settings = LuceneSettings(fieldRegistry=registry)
     self.composer = LuceneQueryComposer(
         unqualifiedTermFields=[("unqualified", 1.0)],
         luceneSettings=settings,
     )
     expectedQuery = TermQuery(DrillDownQuery.term("$facets", "field", "value"))
     self.assertConversion(expectedQuery, "field = value")
Esempio n. 12
0
    'long'   : 'http://www.knaw.nl/narcis/1.0/long/',
    'short'  : 'http://www.knaw.nl/narcis/1.0/short/',
    'mods'   : 'http://www.loc.gov/mods/v3',
    'didl'   : 'urn:mpeg:mpeg21:2002:02-DIDL-NS',
    'norm'   : 'http://dans.knaw.nl/narcis/normalized',
})


def untokenizedFieldname(fieldname):
    """Return `fieldname` with UNTOKENIZED_PREFIX prepended."""
    prefixedName = UNTOKENIZED_PREFIX + fieldname
    return prefixedName

# NOTE(review): presumably the (fieldname, boost) pairs searched for query
# terms that carry no explicit field -- confirm against the query composer.
UNQUALIFIED_TERM_FIELDS = [('__all__', 1.0)]

drilldownFields = [
    # def __init__(self, name, hierarchical=False, multiValued=True, indexFieldName=None):
    DrilldownField(untokenizedFieldname('meta_repositorygroupid')),
    DrilldownField(untokenizedFieldname('meta_repositoryid')),
    DrilldownField(untokenizedFieldname('meta_collection')),
    DrilldownField(untokenizedFieldname('genre')),
    DrilldownField(untokenizedFieldname('access')),
    DrilldownField(untokenizedFieldname('dd_year')),
    DrilldownField(untokenizedFieldname('status')),
    DrilldownField(untokenizedFieldname('dd_prices')),
    DrilldownField(untokenizedFieldname('dd_werkzaamheid')),
    DrilldownField(untokenizedFieldname('dd_position')),
    DrilldownField(untokenizedFieldname('dd_institute')),
    DrilldownField(untokenizedFieldname('dd_cat')),
    DrilldownField(untokenizedFieldname('dd_thesis')),
    DrilldownField(untokenizedFieldname('dd_penv')),
    DrilldownField(untokenizedFieldname('dd_os')),
    DrilldownField(untokenizedFieldname('dd_cre')),
Esempio n. 13
0
def main(reactor, port, databasePath):
    """Build and return the observer tree of the example Lucene server.

    Creates two Lucene cores ('main' and 'main2') that share one field
    registry, a third core ('empty-core') whose settings get no explicit
    field registry, a TermNumerator and a StorageComponent, all stored
    below `databasePath`. These are wired into a nested tuple rooted at an
    Observable, with an ObservableHttpServer on `port` and one PathFilter
    branch per URL prefix.

    :param reactor: reactor handed to the HTTP server, the Lucene cores and
        the other reactor-driven components.
    :param port: port for the HTTP server; also used by the LuceneRemote
        client and Autocomplete, which both address 'localhost'.
    :param databasePath: base directory for the 'lucene', 'lucene2',
        'lucene-empty', 'termNumerator' and 'storage' subdirectories.
    :return: the nested tuple describing the observer tree.
    """
    drilldownFields = [
        DrilldownField('untokenized.field2'),
        DrilldownField('untokenized.fieldHier', hierarchical=True)
    ]

    # Both 'main' and 'main2' use this registry; only their commit settings
    # and analyzer differ.
    fieldRegistry = FieldRegistry(drilldownFields)
    luceneSettings = LuceneSettings(fieldRegistry=fieldRegistry,
                                    commitCount=30,
                                    commitTimeout=1,
                                    analyzer=MerescoDutchStemmingAnalyzer())
    lucene = Lucene(path=join(databasePath, 'lucene'),
                    reactor=reactor,
                    name='main',
                    settings=luceneSettings)

    lucene2Settings = LuceneSettings(fieldRegistry=fieldRegistry,
                                     commitTimeout=0.1)
    lucene2 = Lucene(path=join(databasePath, 'lucene2'),
                     reactor=reactor,
                     name='main2',
                     settings=lucene2Settings)

    termNumerator = TermNumerator(path=join(databasePath, 'termNumerator'))

    # Sub-tree reused by both the '/sru' and the '/remote' branch below:
    # a MultiLucene in front of the three cores.
    emptyLuceneSettings = LuceneSettings(commitTimeout=1)
    multiLuceneHelix = (
        MultiLucene(defaultCore='main'),
        (Lucene(path=join(databasePath, 'lucene-empty'),
                reactor=reactor,
                name='empty-core',
                settings=emptyLuceneSettings), ),
        (lucene, ),
        (lucene2, ),
    )
    storageComponent = StorageComponent(
        directory=join(databasePath, 'storage'))

    return \
    (Observable(),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (ApacheLogger(outputStream=stdout),
                    # '/info' pages; the excluded paths have their own
                    # branches below (or, presumably, must not match here).
                    (PathFilter("/info", excluding=[
                            '/info/version',
                            '/info/name',
                            '/update',
                            '/sru',
                            '/remote',
                            '/via-remote-sru',
                        ]),
                        (DynamicHtml(
                                [dynamicPath],
                                reactor=reactor,
                                indexPage='/info',
                                additionalGlobals={
                                    'VERSION': version,
                                }
                            ),
                        )
                    ),
                    # Plain-text version and name strings.
                    (PathFilter("/info/version"),
                        (StringServer(version, ContentTypePlainText), )
                    ),
                    (PathFilter("/info/name"),
                        (StringServer('Meresco Lucene', ContentTypePlainText),)
                    ),
                    # Static files: the '/static' prefix is cut off the path
                    # before it reaches the FileServer.
                    (PathFilter("/static"),
                        (PathRename(lambda path: path[len('/static'):]),
                            (FileServer(staticPath),)
                        )
                    ),
                    # Upload branches, one per core; '/update_main2' is
                    # excluded from the first filter and handled by the second.
                    (PathFilter("/update_main", excluding=['/update_main2']),
                        uploadHelix(lucene, termNumerator, storageComponent, drilldownFields, fieldRegistry=luceneSettings.fieldRegistry),
                    ),
                    (PathFilter("/update_main2"),
                        uploadHelix(lucene2, termNumerator, storageComponent, drilldownFields, fieldRegistry=lucene2Settings.fieldRegistry),
                    ),
                    # SRU search served directly from the local cores.
                    (PathFilter('/sru'),
                        (SruParser(defaultRecordSchema='record'),
                            (SruHandler(),
                                (MultiCqlToLuceneQuery(
                                    defaultCore='main',
                                    coreToCqlLuceneQueries={
                                        "main": CqlToLuceneQuery([], luceneSettings=luceneSettings),
                                        "main2": CqlToLuceneQuery([], luceneSettings=lucene2Settings),
                                        "empty-core": CqlToLuceneQuery([], luceneSettings=emptyLuceneSettings),
                                    }),
                                    multiLuceneHelix,
                                ),
                                (SRUTermDrilldown(defaultFormat='xml'),),
                                (SruDuplicateCount(),),
                                (storageComponent,),
                            )
                        )
                    ),
                    # SRU search that goes through LuceneRemote, which is
                    # pointed at this server's own '/remote' path.
                    (PathFilter('/via-remote-sru'),
                        (SruParser(defaultRecordSchema='record'),
                            (SruHandler(),
                                (LuceneRemote(host='localhost', port=port, path='/remote'),),
                                (SRUTermDrilldown(defaultFormat='xml'),),
                                (SruDuplicateCount(),),
                                (storageComponent,),
                            )
                        )
                    ),
                    # Service side of the remote protocol, backed by the
                    # same query conversion and cores as '/sru'.
                    (PathFilter('/remote'),
                        (LuceneRemoteService(reactor=reactor),
                            (MultiCqlToLuceneQuery(
                                    defaultCore='main',
                                    coreToCqlLuceneQueries={
                                        "main": CqlToLuceneQuery([], luceneSettings=luceneSettings),
                                        "main2": CqlToLuceneQuery([], luceneSettings=lucene2Settings),
                                        "empty-core": CqlToLuceneQuery([], luceneSettings=emptyLuceneSettings),
                                    }),
                                multiLuceneHelix,
                            )
                        )
                    ),
                    # Autocomplete observes the 'main' core only.
                    (PathFilter('/autocomplete'),
                        (Autocomplete('localhost', port, '/autocomplete', '__all__', '?', 5, '?', '?'),
                            (lucene,),
                        )
                    )
                )
            )
        )
    )