class TestElasticIndex(object):

    def setup_class(self):
        self.index = ElasticIndex('test')

    def teardown_class(self):
        self.index.delete_index()

    def test_scroll(self):
        self.index.index_into({'test': True}, 1)
        self.index.index_into({'test': False}, 2)
        self.index.index_into({'test': True}, 3)
        self.index.index_into({'test': False}, 4)
        for i in self.index.scroll():
            assert isinstance(i, list)

    def test_index_into(self):
        result = self.index.index_into({'test': True, 'object': "This is a string"}, 5)
        assert result
        result = self.index.index_into({'test': True, 'object': {'sub-object': "another string"}}, 6)
        assert not result
        result = self.index.index_into({'test': False}, 'HAN000827182')
        assert result

    def test_search(self):
        data = list()
        data.append({'id': '1234', 'test': True})
        self.index.bulk(data=data, identifier_key='id')
        result = self.index.search()
        assert len(result) == 7

    def test_search_not_unpack(self):
        result = self.index.search(unpack=False)
        assert len(result) == 7

    def test_alias(self):
        self.index.add_to_alias('test1')
        assert self.index.instance.indices.get_alias('test1')
        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            self.index.instance.indices.get_alias('test1')

    def test_count(self):
        result = self.index.count()
        assert result == 7

def import_data(plattform: str, year: str):
    index = ElasticIndex("emanus-{}-data-base-{}".format(plattform, year), "doc")
    with open("emanus-{}-{}.json".format(plattform, year), 'r') as fp:
        text = json.load(fp)
    metadata = dict()
    data = list()
    for key in text:
        if key == "data":
            for item in text[key]:
                result = item
                result["identifier"] = item["dimensions"]["pagestem"]
                data.append(result)
        else:
            metadata[key] = text[key]
    index.bulk(data, identifier_key="identifier")
    index.index_into(metadata, 0)

def upload_to_elastic(self, what: str, identifier='identifier'):
    """
    Uploads a harvest to an Elasticsearch index.

    :param identifier:  What the identifier inside the data is called (default 'identifier').
    """
    data = list()
    for root_dir, _, files in os.walk(self.harvester_info[what][2]):
        for file in files:
            tree = ElementTree.parse(root_dir + '/' + file)
            root = purge_namespaces(tree.getroot())
            for element in root.findall('./ListRecords/record/metadata/'):
                data.append(json.loads(xml2json(element, 'parker')))
    for item in data:
        clean_data(item, identifier)
    index = ElasticIndex(self.elastic_index + what + '_' + date.today().isoformat(),
                         'publication', self.elastic_url)
    index.bulk(data, identifier)

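# A minimal usage sketch, assuming an already constructed harvester object that carries
# harvester_info, elastic_index and elastic_url (the surrounding class is not shown above);
# only the upload_to_elastic() call pattern comes from the method itself.
harvester.upload_to_elastic('pub')                       # index the publication harvest
harvester.upload_to_elastic('person', identifier='id')   # person harvest with a custom identifier key
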
for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
    doc = dict()
    system_number = str(row[0].value)
    while len(system_number) != 9:
        system_number = '0' + system_number
    if system_number not in all_data:
        all_data[system_number] = dict()
    if 'loans' not in all_data[system_number]:
        all_data[system_number]['loans'] = dict()
    all_data[system_number]['loans'][str(row[2].value)] = row[1].value

list_of_data = list()
for system_number in all_data:
    data = all_data[system_number]
    for t in ['loans', 'reservations']:
        if t in data:
            total = 0
            for y in ['2016', '2017', '2018']:
                if y not in data[t]:
                    data[t][y] = 0
                else:
                    total += data[t][y]
            data[t]['total'] = total
        else:
            data[t] = {'2016': 0, '2017': 0, '2018': 0, 'total': 0}
    data['identifier'] = system_number
    list_of_data.append(data)

index.bulk(list_of_data, 'identifier')

        index_object = captures
        index_object['identifier'] = identifier
        identifier += 1
        date = re.match(
            r'(?P<day>\d+)/(?P<month>\w+)/(?P<year>\d+):'
            r'(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+) .*$',
            captures['timestamp'])
        if date:
            dates = date.groupdict()
            index_object['date_parts'] = dates
        else:
            logging.warning('Could not parse date time in line "%s".', line)
    else:
        logging.warning('Could not parse line "%s".', line)

    if index_object is not None:
        index_objects.append(index_object)
        count += 1

    if count == 500:
        index = ElasticIndex('opac-access', 'log')
        index.bulk(index_objects, identifier_key='identifier')
        count = 0
        index_objects.clear()

index = ElasticIndex('opac-access', 'log')
index.bulk(index_objects, identifier_key='identifier')

class BulkElasticConsumer(AbstractBaseConsumer):
    """
    Will attempt to collect a number of messages and then bulk index them.
    Collection will either wait some time or collect 10'000 messages.

    Consumer:
        bootstrap_servers: localhost:9092
        client_id: test
        group_id: elastic-consumer-test
        auto_offset_reset: earliest
    Topics:
        - test
    ElasticIndex:
        index: name-of-index
        doc_type: _doc (default value for elasticsearch 6)
        url: http://localhost:9200
        timeout: 300
    IdentifierKey: name-of-key-value (optional, if not specified the Kafka key value will be used.)
    """

    def __init__(self, config, config_class=ElasticConsumerConfig, logger=logging.getLogger(__name__)):
        super().__init__(config, config_class, logger=logger)
        self._index = ElasticIndex(**self.configuration.elastic_settings)
        self._key = self.configuration.key

    @property
    def configuration(self) -> ElasticConsumerConfig:
        return super().configuration

    def consume(self) -> bool:
        data = list()
        messages = self._consumer.poll(100, 10000)
        if messages:
            # TODO: Only works if there is a single partition per consumer. As soon as the number
            # TODO: of consumers is lower or higher than the number of partitions this fails.
            for message in messages[self._consumer.assignment().pop()]:
                key = message.key.decode('utf-8')
                try:
                    value = json.loads(message.value.decode('utf-8'))
                except JSONDecodeError as ex:
                    self._error_logger.error(
                        "Failed to JSONDecode message: {}.".format(message.value.decode('utf-8')))
                    value = {
                        'message': message.value.decode('utf-8'),
                        'error': '{}'.format(ex)
                    }
                if self._key not in value:
                    value['_key'] = key
                data.append(value)

        now = time.time()
        if len(data) > 0:
            result = self._index.bulk(data, self._key,
                                      op_type=self.configuration.op_type,
                                      upsert=self.configuration.upsert)
            then = time.time()
            amount = then - now
            self._time_logger.info(
                "Success! Indexed {} messages to {} in {} seconds.".format(
                    len(data), self._index.index, amount))
        else:
            result = False

        if result:
            for assignment in self._consumer.assignment():
                pos = self._consumer.position(assignment)
                if pos != self._consumer.committed(assignment):
                    self._consumer.commit({assignment: OffsetAndMetadata(pos, "")})

        return result

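# A hedged sketch of how this consumer might be driven; the config file path and the
# outer polling loop are assumptions for illustration and are not defined by the class above.
if __name__ == '__main__':
    consumer = BulkElasticConsumer('configs/bulk_elastic_consumer.yml')  # hypothetical config path
    while True:
        # Each call polls Kafka, bulk indexes whatever was collected and, on success,
        # commits the consumer offsets before returning.
        consumer.consume()
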
    for v in mapping[sys_number]['vlids']:
        for y in ['2016', '2017', '2018']:
            if sys_number not in result:
                result[sys_number] = dict()
            if p not in result[sys_number]:
                result[sys_number][p] = dict()
            if v in vlids[p][y]:
                if y not in result[sys_number][p]:
                    result[sys_number][p][y] = vlids[p][y][v]['page-views']
                else:
                    result[sys_number][p][y] += vlids[p][y][v]['page-views']
            else:
                if y not in result[sys_number][p]:
                    result[sys_number][p][y] = 0

elastic_data = list()
for sys_number in result:
    item = dict()
    item['bau'] = dict()
    total = 0
    for y in result[sys_number]['erara-bau']:
        item['bau'][y] = result[sys_number]['erara-bau'][y]
        total += item['bau'][y]
    item['bau']['total'] = total
    item['identifier'] = sys_number
    elastic_data.append(item)

index = ElasticIndex('e-rara-data', 'hits')
index.bulk(elastic_data, 'identifier')