Example #1
import pytest
from elasticsearch.exceptions import NotFoundError
# The ElasticIndex import is omitted in the source listing; it is the wrapper class under test.


class TestElasticIndex(object):
    # One shared index for the whole class; later tests see documents indexed by earlier ones.

    def setup_class(self):
        self.index = ElasticIndex('test')

    def teardown_class(self):
        self.index.delete_index()

    def test_scroll(self):
        self.index.index_into({'test': True}, 1)
        self.index.index_into({'test': False}, 2)
        self.index.index_into({'test': True}, 3)
        self.index.index_into({'test': False}, 4)
        for i in self.index.scroll():
            assert isinstance(i, list)

    def test_index_into(self):
        result = self.index.index_into({'test': True, 'object': "This is a string"}, 5)
        assert result
        result = self.index.index_into({'test': True, 'object': {'sub-object': "another string"}}, 6)
        # fails: the previous call dynamically mapped 'object' as a string field,
        # so a nested object under the same name is rejected by Elasticsearch
        assert not result
        result = self.index.index_into({'test': False}, 'HAN000827182')
        assert result

    def test_search(self):
        data = list()
        data.append({'id': '1234', 'test': True})
        self.index.bulk(data=data, identifier_key='id')
        result = self.index.search()
        assert len(result) == 7

    def test_search_not_unpack(self):
        result = self.index.search(unpack=False)
        assert len(result) == 7

    def test_alias(self):
        self.index.add_to_alias('test1')
        assert self.index.instance.indices.get_alias('test1')
        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            self.index.instance.indices.get_alias('test1')

    def test_count(self):
        result = self.index.count()
        assert result == 7
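
These tests assume a reachable Elasticsearch instance and exercise the wrapper's core API. A minimal standalone session using the same calls might look like this (the import path for ElasticIndex is an assumption, since the listing never shows it):

# Assumed import path; adjust to wherever ElasticIndex is defined in your project.
from elastic_index import ElasticIndex

index = ElasticIndex('demo')
index.index_into({'test': True}, 1)   # index one document under id 1
for batch in index.scroll():          # scroll() yields documents in lists
    print(len(batch))
print(index.count())                  # total number of documents in the index
index.delete_index()                  # clean up, like teardown_class above
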
Example #2

import json  # needed by json.load below; the ElasticIndex import is again omitted in the listing


def import_data(plattform: str, year: str):

    index = ElasticIndex("emanus-{}-data-base-{}".format(plattform, year),
                         "doc")

    with open("emanus-{}-{}.json".format(plattform, year), 'r') as fp:
        text = json.load(fp)
        metadata = dict()
        data = list()
        for key in text:
            if key == "data":
                for item in text[key]:
                    result = item
                    result["identifier"] = item["dimensions"]["pagestem"]
                    data.append(result)
            else:
                metadata[key] = text[key]
        index.bulk(data, identifier_key="identifier")
        index.index_into(metadata, 0)
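
A call such as the one below would read emanus-web-2018.json from the working directory and build the index emanus-web-data-base-2018 ('web' is a placeholder platform name, not from the source):

import_data('web', '2018')  # expects emanus-web-2018.json on disk
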
Example #3
    def upload_to_elastic(self, what: str, identifier='identifier'):
        """
        Uploads a harvest to a elastic search index.

        :param what:        Which harvest it should upload 'ach', 'proj', 'person', 'org', 'pub'
        :param identifier:  What the identifier inside the data is called (default 'identifier')
        """
        data = list()
        for root_dir, _, files in os.walk(self.harvester_info[what][2]):
            for file in files:
                tree = ElementTree.parse(os.path.join(root_dir, file))
                root = purge_namespaces(tree.getroot())
                for element in root.findall('./ListRecords/record/metadata/'):
                    data.append(json.loads(xml2json(element, 'parker')))

        for item in data:
            clean_data(item, identifier)

        index = ElasticIndex(self.elastic_index + what + '_' + date.today().isoformat(), 'publication',
                             self.elastic_url)
        index.bulk(data, identifier)
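
purge_namespaces, xml2json and clean_data are project helpers the listing does not show; xml2json converts an XML element to JSON in the Parker convention. A typical namespace-stripping helper, sketched here as an assumption about what purge_namespaces does, rewrites every tag in place:

from xml.etree import ElementTree

def purge_namespaces(root: ElementTree.Element) -> ElementTree.Element:
    """Strip '{namespace}' prefixes from all tags so findall() works with bare names."""
    for element in root.iter():
        if '}' in element.tag:
            element.tag = element.tag.split('}', 1)[1]
    return root
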
Example #4

    # 'ws', 'all_data' and 'index' come from earlier in the script; see the sketch after this example
    for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
        system_number = str(row[0].value).zfill(9)  # pad to the nine-digit system number
        if system_number not in all_data:
            all_data[system_number] = dict()
        if 'loans' not in all_data[system_number]:
            all_data[system_number]['loans'] = dict()
        all_data[system_number]['loans'][str(row[2].value)] = row[1].value

    list_of_data = list()
    for system_number in all_data:
        data = all_data[system_number]
        for t in ['loans', 'reservations']:
            if t in data:
                total = 0
                for y in ['2016', '2017', '2018']:
                    if y not in data[t]:
                        data[t][y] = 0
                    else:
                        total += data[t][y]
                data[t]['total'] = total
            else:
                data[t] = {'2016': 0, '2017': 0, '2018': 0, 'total': 0}

        data['identifier'] = system_number
        list_of_data.append(data)

    index.bulk(list_of_data, 'identifier')
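
Assuming the spreadsheet is read with openpyxl (which the iter_rows call suggests), the setup this fragment relies on could look like the following; the filename and index name are placeholders:

from openpyxl import load_workbook

wb = load_workbook('loans.xlsx', read_only=True)   # placeholder filename
ws = wb.active
all_data = dict()                                  # keyed by nine-digit system number
index = ElasticIndex('loans-data', 'doc')          # placeholder index and doc_type
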
Example #5
                # inside the per-line loop: 'captures' is the groupdict of a successful log-line match
                index_object = captures
                index_object['identifier'] = identifier
                identifier += 1

                date = re.match(
                    r'(?P<day>\d+)/(?P<month>\w+)/(?P<year>\d+):'
                    r'(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+) .*$',
                    captures['timestamp'])
                if date:
                    dates = date.groupdict()
                    index_object['date_parts'] = dates
                else:
                    logging.warning('Could not parse date time in line "%s".',
                                    line)
            else:
                logging.warning('Could not parse line "%s".', line)

            if index_object is not None:
                index_objects.append(index_object)

            count += 1

            if count == 500:
                # flush a batch of 500 documents to Elasticsearch
                index = ElasticIndex('opac-access', 'log')
                index.bulk(index_objects, identifier_key='identifier')
                count = 0
                index_objects.clear()

        # index whatever remains after the loop
        index = ElasticIndex('opac-access', 'log')
        index.bulk(index_objects, identifier_key='identifier')
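
This snippet begins inside a loop over log lines: captures is the groupdict of an earlier regex match, and the outer else belongs to that match test. A minimal version of the surrounding loop, assuming an Apache-style access log and an illustrative pattern, could be:

import re

# Illustrative pattern; the real one must at least yield a 'timestamp' group.
line_pattern = re.compile(
    r'(?P<ip>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] "(?P<request>[^"]*)" (?P<status>\d+)')

index_objects, count, identifier = [], 0, 0
with open('access.log') as log:  # placeholder path
    for line in log:
        index_object = None
        match = line_pattern.match(line)
        if match:
            captures = match.groupdict()
            # ... the body shown above continues here ...
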
Example #6

# Imports (json, time, logging, Kafka's OffsetAndMetadata) are omitted in the listing.
class BulkElasticConsumer(AbstractBaseConsumer):
    """
    Collects a batch of messages and bulk indexes it: polling returns either after
    a timeout or once 10'000 messages have been gathered, whichever comes first.

    Example configuration:

    Consumer:
      bootstrap_servers: localhost:9092
      client_id: test
      group_id: elastic-consumer-test
      auto_offset_reset: earliest
    Topics:
      - test
    ElasticIndex:
      index: name-of-index
      doc_type: _doc (default value for elasticsearch 6)
      url: http://localhost:9200
      timeout: 300
    IdentifierKey: name-of-key-value (optional, if not specified the Kafka key value will be used.)
    """
    def __init__(self,
                 config,
                 config_class=ElasticConsumerConfig,
                 logger=logging.getLogger(__name__)):
        super().__init__(config, config_class, logger=logger)
        self._index = ElasticIndex(**self.configuration.elastic_settings)
        self._key = self.configuration.key

    @property
    def configuration(self) -> ElasticConsumerConfig:
        return super().configuration

    def consume(self) -> bool:
        data = list()
        messages = self._consumer.poll(100, 10000)
        if messages:
            # TODO: Only works if there is a single partition per consumer. As soon as the number of consumers is lower
            # TODO: or higher than the number of partitions this fails.
            for message in messages[self._consumer.assignment().pop()]:
                key = message.key.decode('utf-8')
                try:
                    value = json.loads(message.value.decode('utf-8'))
                except JSONDecodeError as ex:
                    self._error_logger.error(
                        "Failed to JSONDecode message: {}.".format(
                            message.value.decode('utf-8')))
                    value = {
                        'message': message.value.decode('utf-8'),
                        'error': '{}'.format(ex)
                    }
                if self._key not in value:
                    # fall back to the Kafka message key when the configured
                    # identifier key is missing from the payload
                    value['_key'] = key
                data.append(value)
        now = time.time()
        if len(data) > 0:
            result = self._index.bulk(data,
                                      self._key,
                                      op_type=self.configuration.op_type,
                                      upsert=self.configuration.upsert)
            then = time.time()
            duration = then - now
            self._time_logger.info(
                "Success! Indexed {} messages to {} in {} seconds.".format(
                    len(data), self._index.index, duration))
        else:
            result = False

        if result:
            for assignment in self._consumer.assignment():
                pos = self._consumer.position(assignment)
                if pos != self._consumer.committed(assignment):
                    self._consumer.commit(
                        {assignment: OffsetAndMetadata(pos, "")})

        return result
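
A driver for the consumer could simply call consume() in a loop; config here stands for whatever ElasticConsumerConfig-compatible object the surrounding application builds:

import time

consumer = BulkElasticConsumer(config)   # 'config' is assumed to exist
while True:
    if not consumer.consume():           # False when nothing was indexed or the bulk failed
        time.sleep(1)                    # back off briefly before polling again
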
Example #7
        # 'p' is the platform key from an enclosing loop not shown in this snippet
        for v in mapping[sys_number]['vlids']:
            for y in ['2016', '2017', '2018']:
                if sys_number not in result:
                    result[sys_number] = dict()
                if p not in result[sys_number]:
                    result[sys_number][p] = dict()
                if v in vlids[p][y]:
                    # check the platform dict, not the top level, so repeated
                    # vlids accumulate instead of overwriting each other
                    if y not in result[sys_number][p]:
                        result[sys_number][p][y] = vlids[p][y][v]['page-views']
                    else:
                        result[sys_number][p][y] += vlids[p][y][v]['page-views']
                else:
                    if y not in result[sys_number][p]:
                        result[sys_number][p][y] = 0

    elastic_data = list()
    for sys_number in result:
        item = dict()
        item['bau'] = dict()
        total = 0
        for y in result[sys_number]['erara-bau']:
            item['bau'][y] = result[sys_number]['erara-bau'][y]
            total += item['bau'][y]
        item['bau']['total'] = total
        item['identifier'] = sys_number
        elastic_data.append(item)

    index = ElasticIndex('e-rara-data', 'hits')
    index.bulk(elastic_data, 'identifier')
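
Each document indexed into e-rara-data then carries the per-year page views plus their sum, e.g. (values illustrative):

{'identifier': '000123456',
 'bau': {'2016': 12, '2017': 30, '2018': 7, 'total': 49}}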