Exemple #1
0
class ElasticProducer(AbstractBaseProducer):
    """Producer that sends every record of an Elasticsearch index.

    The index is created from the ``elastic_settings`` of the parsed
    configuration and is read with the options found under ``scroll``.
    """

    def __init__(self, config: str):
        """Parse ``config`` with ``ElasticProducerConfig`` and create the index.

        :param config: Handed unchanged to the base producer.
        """
        super().__init__(config, config_parser=ElasticProducerConfig)
        index_settings = self.configuration.elastic_settings
        self._index = ElasticIndex(**index_settings)

    def process(self):
        """Send all scrolled records, then flush and close the producer.

        Every record is sent with its ``_id`` as key and the JSON dump of its
        ``_source`` as value, both encoded as UTF-8.
        """
        batches = self._index.scroll(**self.configuration.scroll)
        # Flatten the scroll batches lazily so records are handled one by one
        # in the order the index returns them.
        documents = (document for batch in batches for document in batch)

        for document in documents:
            message_key: str = document['_id']
            message_value: str = json.dumps(document['_source'])
            self.send(
                message_key.encode('utf-8'),
                message_value.encode('utf-8'),
            )

        self.flush()
        self.close()
Exemple #2
0
class TestElasticIndex(object):
    """Tests for ``ElasticIndex`` that all run against one shared 'test' index.

    The tests build on each other: the document counts asserted in the search
    and count tests are the sum of the documents indexed by the tests defined
    before them, so they have to run in definition order.
    """

    def setup_class(self):
        """Create the index shared by all tests of this class."""
        self.index = ElasticIndex('test')

    def teardown_class(self):
        """Delete the shared index once all tests have run."""
        self.index.delete_index()

    def test_scroll(self):
        """Index four documents (ids 1-4) and check every scroll page is a list."""
        for doc_id, flag in enumerate((True, False, True, False), start=1):
            self.index.index_into({'test': flag}, doc_id)

        for page in self.index.scroll():
            assert isinstance(page, list)

    def test_index_into(self):
        """Check the result reported by ``index_into`` for three documents."""
        string_doc = {'test': True, 'object': "This is a string"}
        assert self.index.index_into(string_doc, 5)

        # The same field now carries an object instead of a string; indexing
        # is expected to report failure.
        nested_doc = {'test': True, 'object': {'sub-object': "another string"}}
        assert not self.index.index_into(nested_doc, 6)

        # Non-numeric identifier.
        assert self.index.index_into({'test': False}, 'HAN000827182')

    def test_search(self):
        """Bulk-index one more document and expect seven search results."""
        documents = [{'id': '1234', 'test': True}]
        self.index.bulk(data=documents, identifier_key='id')
        assert len(self.index.search()) == 7

    def test_search_not_unpack(self):
        """Expect the same seven results when ``unpack`` is disabled."""
        hits = self.index.search(unpack=False)
        assert len(hits) == 7

    def test_alias(self):
        """Add the index to an alias, remove it again and verify both steps."""
        indices = self.index.instance.indices

        self.index.add_to_alias('test1')
        assert indices.get_alias('test1')

        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            indices.get_alias('test1')

    def test_count(self):
        """Expect ``count`` to report the seven documents indexed so far."""
        assert self.index.count() == 7
Exemple #3
0
    # Output files for the analysis. They are opened here without a context
    # manager; NOTE(review): make sure they are closed further down.
    missing = open('missing.txt', 'w')
    find_all_result = open('find_all.txt', 'w')

    # Collectors filled while scanning the index.
    find_all = list()
    descriptive_coverage = list()
    desc_cov_add = list()
    vols_cov = list()
    words = list()

    image_words = set()
    partitur_words = set()

    find_all_w = set()
    find_all_c = set()
    # Number of records that were not skipped as periodicals/databases.
    count = 0
    for results in index.scroll(query=query):
        for record in results:

            # Newspapers and databases are only counted, not analysed.
            if record['c-format'] in ['Zeitung', 'Datenbank']:
                c['periodikum'] += 1
                continue

            count += 1
            coverage = record['extent']['coverage']

            # The patterns are raw strings so that the regex escapes (\-, \.,
            # \() reach the regex engine unchanged and do not trigger
            # Python's invalid-escape-sequence warning.
            #
            # match_desc: the whole value is "<number> <word>", the word
            #             optionally ending in a dot.
            # match_2:    same, followed by a parenthesised remark up to the
            #             end of the value.
            # match_3:    "<word> <number>", where the number may be followed
            #             by a second number joined with ',', '.', '/' or '-'.
            match_desc = re.fullmatch(r'(?P<number>[0-9]+) (?P<word>[A-Za-zöäü \-]+(\.)?)', coverage)
            match_2 = re.match(r'(?P<number>[0-9]+) (?P<word>[A-Za-zöäü \-]+(\.)?) \(.*\)$', coverage)
            match_3 = re.match(r'(?P<word>[A-Za-zäöü.]+) (?P<number>[0-9]+([,./\-][0-9]+)?)$', coverage)

            # Words only. Ignore.
Exemple #4
0
    query = {
        '_source': ['search_params.trackurl'],
        'query': {
            'exists': {
                'field': 'search_params.trackurl'
            }
        }
    }

    with open('common-urls.json', 'r') as fp:
        common_urls = json.load(fp)

    pool = Pool(processes=4)
    process_results = list()
    for results in index.scroll(query=query, size=10000):

        for item in results:
            url = item['search_params']['trackurl']
            if not isinstance(url, str):
                print('NO STRING: ', url)
                continue

            url_counter[url] += 1

            if url in common_urls:
                continue

            domain = re.match('http[s]://(www\.)?(.*\.[a-z]{2,3})/(.*)', url)

            if domain:
Exemple #5
0
def enrich_user_data(config):
    """Add the hits of all usage sources to every record and re-index it.

    For each entry of ``config['indexes']`` an ``ElasticIndex`` is created
    from the entry's ``'index'`` settings and scrolled with a ``match_all``
    query. Every record is enriched with the hits reported by swissbib, opac
    and aleph; records of the ``dsv05`` database are additionally enriched
    from e-rara, e-manuscripta and e-codices. The hits of each source are
    stored under ``item['hits'][<source>]``, their sum under
    ``item['hits']['total']``, and all error tags reported by the enrichers
    are merged into ``item['error_tags']`` (stored as a list). A DOI reported
    by e-codices is recorded under ``item['identifiers']['doi']``. Finally the
    record is written back to the index under its ``identifier``.

    The enrichers and their indexes (``swissbib``, ``opac``/``opac_index``,
    ``aleph``/``aleph_index``, ``e_rara``/``e_rara_index``,
    ``e_manuscripta``/``e_manuscripta_index`` and
    ``e_codices``/``e_codices_index``) are taken from module scope.

    :param config: Mapping with an ``'indexes'`` list; each entry holds the
        keyword arguments for ``ElasticIndex`` under ``'index'``.
    """
    for index in config['indexes']:
        instance = ElasticIndex(**index['index'])

        query = {'query': {'match_all': {}}}

        for results in instance.scroll(query=query):
            for item in results:
                identifier = item['identifier']
                database = item['database']
                identifiers = item['identifiers']
                sys_number = identifiers[database]

                # Collect tags in a set to avoid duplicates. Records without
                # an 'error_tags' field start empty; previously the first
                # reported tag raised a KeyError for such records.
                if 'error_tags' in item:
                    tags = set(item['error_tags'])
                else:
                    tags = set()

                # Make sure the container for the per-source hits exists.
                if 'hits' not in item:
                    item['hits'] = {}
                source_hits = item['hits']

                total = 0

                # swissbib
                hits, error_tags = swissbib.enrich(identifier)
                source_hits['swissbib'] = hits
                total += hits['total']
                tags.update(error_tags)

                # opac
                hits, error_tags = opac.enrich(opac_index, sys_number)
                source_hits['opac-access'] = hits
                total += hits['total']
                tags.update(error_tags)

                # aleph
                hits, error_tags = aleph.enrich(aleph_index, sys_number,
                                                database)
                source_hits['aleph'] = hits
                total += hits['loans']['total']
                tags.update(error_tags)

                if database == 'dsv05':
                    # e-rara
                    hits, error_tags = e_rara.enrich(e_rara_index, sys_number)
                    source_hits['e-rara'] = hits
                    total += hits['bau']['total']
                    tags.update(error_tags)

                    # e-manuscripta
                    hits, error_tags = e_manuscripta.enrich(
                        e_manuscripta_index, sys_number)
                    source_hits['e-manuscripta'] = hits
                    total += hits['bau']['total']
                    total += hits['swa']['total']
                    tags.update(error_tags)

                    # e-codices
                    hits, doi, error_tags = e_codices.enrich(
                        e_codices_index, sys_number)
                    source_hits['e-codices'] = hits
                    total += hits['total']
                    tags.update(error_tags)

                    if doi is not None:
                        if 'doi' not in identifiers:
                            # First DOI of this record; previously it was
                            # silently dropped.
                            identifiers['doi'] = doi
                        elif isinstance(identifiers['doi'], list):
                            # Do not record the same DOI twice when the
                            # enrichment is run repeatedly.
                            if doi not in identifiers['doi']:
                                identifiers['doi'].append(doi)
                        elif identifiers['doi'] != doi:
                            identifiers['doi'] = [identifiers['doi'], doi]

                # e-mails dsv05
                # TODO

                # Sets are not JSON serialisable; store the tags as a list.
                item['error_tags'] = list(tags)

                source_hits['total'] = total

                instance.index_into(item, identifier)