class ElasticProducer(AbstractBaseProducer):
    """Producer that emits every document of an Elasticsearch index.

    Each document is passed to the inherited ``send`` as a UTF-8 encoded
    key/value pair: the key is the document ``_id`` and the value is the
    JSON-serialised ``_source``.
    """

    def __init__(self, config: str):
        """Set up the producer and the index it reads from.

        :param config: Configuration handed to the base class, parsed with
            ``ElasticProducerConfig``. Its ``elastic_settings`` are used as
            keyword arguments for ``ElasticIndex``.
        """
        super().__init__(config, config_parser=ElasticProducerConfig)
        self._index = ElasticIndex(**self.configuration.elastic_settings)

    def process(self):
        """Scroll through the index, send every document, then flush and close.

        The scroll is parameterised by ``self.configuration.scroll``.
        """
        for batch in self._index.scroll(**self.configuration.scroll):
            for hit in batch:
                message_key: str = hit['_id']
                message_value: str = json.dumps(hit['_source'])
                encoded_key = message_key.encode('utf-8')
                encoded_value = message_value.encode('utf-8')
                self.send(encoded_key, encoded_value)
        # NOTE(review): flush/close are assumed to run once, after the whole
        # scroll has been consumed -- confirm against the original layout.
        self.flush()
        self.close()
class TestElasticIndex(object):
    """Integration tests for ``ElasticIndex`` against an index named ``test``.

    All tests share one index instance. The document counts asserted in the
    search/count tests presuppose the documents written by the tests defined
    before them, so the methods are kept in their original order.
    """

    def setup_class(self):
        """Create the shared index wrapper once for the whole class."""
        self.index = ElasticIndex('test')

    def teardown_class(self):
        """Delete the index after all tests in the class have run."""
        self.index.delete_index()

    def test_scroll(self):
        """Every page yielded by ``scroll`` is a list."""
        flags = [True, False, True, False]
        for doc_id, flag in enumerate(flags, start=1):
            self.index.index_into({'test': flag}, doc_id)
        for page in self.index.scroll():
            assert isinstance(page, list)

    def test_index_into(self):
        """``index_into`` reports success or failure through its return value."""
        stored = self.index.index_into(
            {'test': True, 'object': "This is a string"}, 5)
        assert stored
        # 'object' was indexed as a string just above; an object value for
        # the same field is expected to be rejected.
        stored = self.index.index_into(
            {'test': True, 'object': {'sub-object': "another string"}}, 6)
        assert not stored
        stored = self.index.index_into({'test': False}, 'HAN000827182')
        assert stored

    def test_search(self):
        """After one bulk insert, ``search`` returns seven results."""
        data = [{'id': '1234', 'test': True}]
        self.index.bulk(data=data, identifier_key='id')
        found = self.index.search()
        assert len(found) == 7

    def test_search_not_unpack(self):
        """``search(unpack=False)`` also returns seven results."""
        found = self.index.search(unpack=False)
        assert len(found) == 7

    def test_alias(self):
        """An alias can be added, looked up, and removed again."""
        alias = 'test1'
        self.index.add_to_alias(alias)
        assert self.index.instance.indices.get_alias(alias)
        self.index.remove_from_alias(alias)
        with pytest.raises(NotFoundError):
            self.index.instance.indices.get_alias(alias)

    def test_count(self):
        """``count`` reports seven documents."""
        total = self.index.count()
        assert total == 7
# Output files for the analysis.
# NOTE(review): neither handle is closed in the code visible here -- make
# sure they are closed (or wrapped in ``with``) where the script ends.
missing = open('missing.txt', 'w')
find_all_result = open('find_all.txt', 'w')

# Accumulators filled while walking the index.
find_all = []
descriptive_coverage = []
desc_cov_add = []
vols_cov = []
words = []
image_words = set()
partitur_words = set()
find_all_w = set()
find_all_c = set()

# Patterns for the ``extent.coverage`` string, compiled once instead of on
# every record. Raw strings are used because sequences such as ``\-``, ``\.``
# and ``\(`` are invalid escapes in a normal string literal; the resulting
# pattern text is unchanged.
# "<number> <word>" covering the whole string, e.g. "12 Bände".
desc_pattern = re.compile(r'(?P<number>[0-9]+) (?P<word>[A-Za-zöäü \-]+(\.)?)')
# "<number> <word> (...)" -- same as above, followed by a parenthesised part.
desc_paren_pattern = re.compile(
    r'(?P<number>[0-9]+) (?P<word>[A-Za-zöäü \-]+(\.)?) \(.*\)$')
# "<word> <number>", where the number may have a second part after , . / or -.
word_number_pattern = re.compile(
    r'(?P<word>[A-Za-zäöü.]+) (?P<number>[0-9]+([,./\-][0-9]+)?)$')

# Number of records that were not skipped as periodicals/databases.
count = 0
for results in index.scroll(query=query):
    for record in results:
        # Newspapers and databases are only tallied, not analysed.
        if record['c-format'] in ('Zeitung', 'Datenbank'):
            c['periodikum'] += 1
            continue
        count += 1
        coverage = record['extent']['coverage']
        match_desc = desc_pattern.fullmatch(coverage)
        match_2 = desc_paren_pattern.match(coverage)
        match_3 = word_number_pattern.match(coverage)
        # Words only. Ignore.
# Elasticsearch query: select only documents that have a
# ``search_params.trackurl`` field and return just that field.
query = {
    '_source': ['search_params.trackurl'],
    'query': {
        'exists': {
            'field': 'search_params.trackurl'
        }
    }
}
# URLs found in this file are still counted below but not processed further.
with open('common-urls.json', 'r') as fp:
    common_urls = json.load(fp)
# NOTE(review): the pool and ``process_results`` are not used in the code
# visible here -- presumably they are used further down; confirm.
pool = Pool(processes=4)
process_results = list()
for results in index.scroll(query=query, size=10000):
    for item in results:
        url = item['search_params']['trackurl']
        # Report and skip values that are not plain strings.
        if not isinstance(url, str):
            print('NO STRING: ', url)
            continue
        url_counter[url] += 1
        if url in common_urls:
            continue
        # Split the URL into optional "www." prefix, domain and path.
        # NOTE(review): ``[s]`` is a one-character class, so this matches only
        # URLs starting with "https://"; if plain "http://" should match too,
        # the pattern needs ``https?``. The ``\.`` escapes also call for a raw
        # string literal.
        domain = re.match('http[s]://(www\.)?(.*\.[a-z]{2,3})/(.*)', url)
        if domain:
def _merge_doi(identifiers, doi):
    """Store ``doi`` under ``identifiers['doi']`` without losing or repeating it.

    A missing (or ``None``) entry is set to ``doi``; an existing list gets
    ``doi`` appended; an existing single value becomes a two-element list.
    Nothing changes when ``doi`` is ``None`` or already recorded.
    """
    if doi is None:
        return
    existing = identifiers.get('doi')
    if existing is None:
        identifiers['doi'] = doi
    elif isinstance(existing, list):
        if doi not in existing:
            existing.append(doi)
    elif existing != doi:
        identifiers['doi'] = [existing, doi]


def enrich_user_data(config):
    """Add usage hits from several sources to every record and re-index it.

    For each entry in ``config['indexes']`` an ``ElasticIndex`` is built from
    the entry's ``'index'`` keyword arguments and scrolled with a
    ``match_all`` query. Every record gets per-source hit data written to
    ``item['hits'][<source>]``, the summed total in ``item['hits']['total']``
    and the collected error tags (de-duplicated) in ``item['error_tags']``.
    The record is then written back with ``index_into`` under its own
    ``identifier``.

    swissbib, opac and aleph are queried for every record. e-rara,
    e-manuscripta and e-codices are queried only for records whose
    ``database`` is ``'dsv05'``.

    :param config: Mapping with an ``'indexes'`` iterable; each element is a
        mapping whose ``'index'`` value holds the ``ElasticIndex`` keyword
        arguments.
    """
    for index in config['indexes']:
        instance = ElasticIndex(**index['index'])
        query = {'query': {'match_all': {}}}
        for results in instance.scroll(query=query):
            for item in results:
                identifier = item['identifier']
                database = item['database']
                sys_number = item['identifiers'][database]

                # Start from the tags already on the record, if any. Records
                # without an 'error_tags' field start with an empty set
                # instead of failing when the tags are written back.
                tags = set(item.get('error_tags', ()))
                total = 0

                # swissbib
                hits, error_tags = swissbib.enrich(identifier)
                item['hits']['swissbib'] = hits
                total += hits['total']
                tags.update(error_tags)

                # opac
                hits, error_tags = opac.enrich(opac_index, sys_number)
                item['hits']['opac-access'] = hits
                total += hits['total']
                tags.update(error_tags)

                # aleph
                hits, error_tags = aleph.enrich(
                    aleph_index, sys_number, database)
                item['hits']['aleph'] = hits
                total += hits['loans']['total']
                tags.update(error_tags)

                if database == 'dsv05':
                    # e-rara
                    hits, error_tags = e_rara.enrich(e_rara_index, sys_number)
                    item['hits']['e-rara'] = hits
                    total += hits['bau']['total']
                    tags.update(error_tags)

                    # e-manuscripta
                    hits, error_tags = e_manuscripta.enrich(
                        e_manuscripta_index, sys_number)
                    item['hits']['e-manuscripta'] = hits
                    total += hits['bau']['total']
                    total += hits['swa']['total']
                    tags.update(error_tags)

                    # e-codices
                    hits, doi, error_tags = e_codices.enrich(
                        e_codices_index, sys_number)
                    item['hits']['e-codices'] = hits
                    total += hits['total']
                    tags.update(error_tags)
                    # Keep the DOI even when the record had none before, and
                    # do not add it a second time on a repeated run.
                    _merge_doi(item['identifiers'], doi)

                    # e-mails dsv05
                    # TODO

                # Sets are not JSON-serialisable; store the tags as a list.
                item['error_tags'] = list(tags)
                item['hits']['total'] = total
                instance.index_into(item, item['identifier'])