def __init__(self,
             config,
             config_class=ElasticConsumerConfig,
             logger=logging.getLogger(__name__)):
    super().__init__(config, config_class, logger=logger)
    self._index = ElasticIndex(**self.configuration.elastic_settings)
    self._key = self.configuration.key
Example 2
def __init__(self, database, config, logger=logging.getLogger(__name__)):
    super().__init__(logger)
    self._database = database
    self._config = config
    self.marc = None
    self.digidata_index = ElasticIndex(**config['digidata'])
    self.page_conversion_rates = config['page-conversions']
Example 3
    def setup_class(self):
        self.admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
        self.index = ElasticIndex('test-elastic-producer', 'doc')
        self.index.index_into({'test': 1}, 0)
        self.index.index_into({'test': 2}, 1)
        self.index.index_into({'test': 3}, 2)
        self.index.index_into({'test': 4}, 3)
        self.index.index_into({'test': 5}, 4)

        self.producer = ElasticProducer('configs/elastic/test_elastic_producer_producer.yml')
        self.consumer = SimpleConsumer('configs/elastic/test_elastic_producer_consumer.yml')
Example 4
class TestElasticProducer(object):

    def setup_class(self):
        self.admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
        self.index = ElasticIndex('test-elastic-producer', 'doc')
        self.index.index_into({'test': 1}, 0)
        self.index.index_into({'test': 2}, 1)
        self.index.index_into({'test': 3}, 2)
        self.index.index_into({'test': 4}, 3)
        self.index.index_into({'test': 5}, 4)

        self.producer = ElasticProducer('configs/elastic/test_elastic_producer_producer.yml')
        self.consumer = SimpleConsumer('configs/elastic/test_elastic_producer_consumer.yml')

    def teardown_class(self):
        self.consumer.close()
        self.admin.delete_topics(['test-elastic-producer'])
        self.admin.close()
        self.index.delete()

    #@pytest.mark.skip()
    def test_produce(self):
        self.producer.process()
        key, message = self.consumer.consume()
        assert key == '0'
        assert message == '{"test": 1}'
def enrich(
    index: ElasticIndex, system_number: str
) -> Tuple[Dict[str, int], Optional[str], Optional[List[str]]]:

    query = {
        '_source': ['hits.*', 'doi'],
        'query': {
            'term': {
                '_id': {
                    'value': system_number
                }
            }
        }
    }
    results = index.scan_index(query=query)
    if len(results) == 1:
        logging.debug(results)
        if 'doi' in results[0]:
            doi = results[0]['doi']
        else:
            doi = None
        return results[0]['hits'], doi, []
    else:
        return {
            '2012': 0,
            '2013': 0,
            '2014': 0,
            '2015': 0,
            '2016': 0,
            '2017': 0,
            '2018': 0,
            'total': 0
        }, None, []
Example 6
def enrich(
    index: ElasticIndex, system_number: str
) -> Tuple[Dict[str, Dict[str, int]], Union[List[str], None]]:

    query = {
        '_source': ['bau.*', 'swa.*'],
        'query': {
            'term': {
                '_id': {
                    'value': system_number
                }
            }
        }
    }

    results = index.scan_index(query=query)
    if len(results) == 1:
        return results[0], []
    else:
        return {
            'bau': {
                '2016': 0,
                '2017': 0,
                '2018': 0,
                'total': 0
            },
            'swa': {
                '2016': 0,
                '2017': 0,
                '2018': 0,
                'total': 0
            }
        }, []
Example 7
def enrich(
        index: ElasticIndex,
        system_number: str) -> Tuple[Dict[str, int], Union[List[str], None]]:

    query = {'query': {'term': {'system_number': {'value': system_number}}}}
    hits = len(index.scan_index(query=query))
    identifier = int(system_number)
    if identifier < 320000:
        return {'total': hits}, ['_opac_dual_hit']
    else:
        return {'total': hits}, []
class SimpleElasticConsumer(AbstractBaseConsumer):
    """
    A KafkaConsumer which consumes messages and indexes them into an ElasticIndex one by one.

    Requires the following configs:

        Consumer:
          bootstrap_servers: localhost:9092
          client_id: test
          group_id: elastic-consumer-test
          auto_offset_reset: earliest
        Topics:
          - test
        ElasticIndex:
          index: name-of-index
          doc_type: _doc (default value for elasticsearch 6)
          url: http://localhost:9200
          timeout: 300

    """
    def __init__(self,
                 config,
                 config_class=ElasticConsumerConfig,
                 logger=logging.getLogger(__name__)):
        super().__init__(config, config_class, logger=logger)
        self._index = ElasticIndex(**self.configuration.elastic_settings)

    def consume(self) -> bool:
        """
        Consumes a single message from the subscribed topic and indexes it into the elasticsearch index.

        Returns True if successful, False otherwise.
        """
        message = next(self._consumer)

        key = message.key.decode('utf-8')
        try:
            value = json.loads(message.value.decode('utf-8'))
        except JSONDecodeError as ex:
            value = {
                'message': message.value.decode('utf-8'),
                'error': '{}'.format(ex)
            }
        result = self._index.index_into(value, key)

        if result:
            for assignment in self._consumer.assignment():
                pos = self._consumer.position(assignment)
                if pos != self._consumer.committed(assignment):
                    self._consumer.commit(
                        {assignment: OffsetAndMetadata(pos, "")})
        # self._time_logger.info("Consumed and indexed one message.")
        return result
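
A minimal usage sketch for the consumer above (not part of the original examples): it assumes a YAML config file at the hypothetical path configs/elastic/simple_consumer.yml containing the keys listed in the docstring.

# Hypothetical driver loop for the SimpleElasticConsumer defined above.
# The config path is an assumption; its keys mirror the docstring.
consumer = SimpleElasticConsumer('configs/elastic/simple_consumer.yml')
while True:
    ok = consumer.consume()  # indexes one Kafka message into Elasticsearch
    if not ok:
        break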
Example 9
    def upload_to_elastic(self, what: str, identifier='identifier'):
        """
        Uploads a harvest to an Elasticsearch index.

        :param what:        Which harvest to upload: 'ach', 'proj', 'person', 'org', 'pub'
        :param identifier:  What the identifier inside the data is called (default 'identifier')
        """
        data = list()
        for root_dir, _, files in os.walk(self.harvester_info[what][2]):
            for file in files:
                tree = ElementTree.parse(root_dir + '/' + file)
                root = purge_namespaces(tree.getroot())
                for element in root.findall('./ListRecords/record/metadata/'):
                    data.append(json.loads(xml2json(element, 'parker')))

        for item in data:
            clean_data(item, identifier)

        index = ElasticIndex(self.elastic_index + what + '_' + date.today().isoformat(), 'publication',
                             self.elastic_url)
        index.bulk(data, identifier)
Example 10
    def transform_affiliated_publication(self, element, parent, edoc_tag,
                                         index, doc_type, url):
        """Transform affiliated publications in projects.

        Uses the given elastic index to translate an mcss id into an eprints id.

        When a duplicate is found, all eprints IDs are added and a log message is sent to [email protected].
        De-duplication has to be resolved manually.

        When no match is found, the mcss id is ignored. (TODO: send to [email protected]?)
        """
        field = parent.find('./' + edoc_tag)
        if field is None:
            field = ET.SubElement(parent, edoc_tag)
        es = ElasticIndex(index, doc_type, url=url)
        query = {
            '_source': ['eprintid'],
            'query': {
                'term': {
                    'mcss_id': {
                        'value': int(element.text)
                    }
                }
            }
        }
        result = es.scan_index(query)
        if len(result) == 1:
            ET.SubElement(field, 'item').text = str(result[0]['eprintid'])
        elif len(result) > 1:
            for e in result:
                ET.SubElement(field, 'item').text = str(e['eprintid'])
            logging.error(
                'Found multiple results with mcss_id %s for project %s %s.',
                element.text, self.current_id, self.current_title)
        else:
            logging.error(
                'Found no eprints ID for the following mcss_id: %s for project %s, %s.',
                element.text, self.current_id, self.current_title)
Example 11
class ElasticProducer(AbstractBaseProducer):
    def __init__(self, config: str):
        super().__init__(config, config_parser=ElasticProducerConfig)
        self._index = ElasticIndex(**self.configuration.elastic_settings)

    def process(self):
        for results in self._index.scroll(**self.configuration.scroll):
            for record in results:
                key: str = record['_id']
                value: str = json.dumps(record['_source'])
                self.send(key.encode('utf-8'), value.encode('utf-8'))

        self.flush()
        self.close()
Example 12
def enrich(index: ElasticIndex, system_number: str, database: str) -> Tuple[Dict[str, Dict[str, int]], Union[List[str], None]]:
    if database == 'dsv01':
        query = {
            '_source': ['reservations.*', 'loans.*'],
            'query': {
                'term': {
                    '_id': {
                        'value': system_number
                    }
                }
            }
        }
        results = index.scan_index(query=query)
        if len(results) == 1:
            return results[0], []
        else:
            return placeholder, ['_no_aleph_data']
    else:
        # Placeholder values for scripted fields.
        return placeholder, ['_no_aleph_data']
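
The snippet above references a module-level `placeholder` that is not included in the excerpt. A plausible sketch, inferred from the query's `_source` fields (reservations.*, loans.*) and from how the result is read in the later enrich_user_data example (hits['loans']['total']), could look like this; the exact year keys are an assumption.

# Hypothetical fallback document for records without Aleph usage data.
# The year range mirrors the other enrich() examples and is an assumption.
placeholder = {
    'reservations': {'2016': 0, '2017': 0, '2018': 0, 'total': 0},
    'loans': {'2016': 0, '2017': 0, '2018': 0, 'total': 0}
}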
def import_data(plattform: str, year: str):

    index = ElasticIndex("emanus-{}-data-base-{}".format(plattform, year),
                         "doc")

    with open("emanus-{}-{}.json".format(plattform, year), 'r') as fp:
        text = json.load(fp)
        metadata = dict()
        data = list()
        for key in text:
            if key == "data":
                for item in text[key]:
                    result = item
                    result["identifier"] = item["dimensions"]["pagestem"]
                    data.append(result)
            else:
                metadata[key] = text[key]
        index.bulk(data, identifier_key="identifier")
        index.index_into(metadata, 0)
def enrich(
    system_number: str
) -> Tuple[Dict[str, Dict[str, int]], Union[List[str], None]]:
    total = 0
    hits = dict()
    total_sub = 0
    for year in range(2018, 2019):
        sru = ElasticIndex('sru-{}'.format(year),
                           doc_type='logs',
                           url=swissbib_host)
        hits['sru'] = dict()
        query = {'query': {'match': {'requestparams': system_number}}}
        num = len(sru.scan_index(query=query))
        hits['sru'][str(year)] = num
        total += num
        total_sub += num
    hits['sru']['total'] = total_sub

    for source in ['green', 'jus', 'bb']:
        hits[source] = dict()
        total_sub = 0
        for year in range(2017, 2019):
            swissbib = ElasticIndex('swissbib-{}-{}'.format(source, year),
                                    doc_type='logs',
                                    url=swissbib_host)

            query = {
                'query': {
                    'term': {
                        'request_middle.keyword': {
                            'value': system_number
                        }
                    }
                }
            }
            num = len(swissbib.scan_index(query=query))
            hits[source][str(year)] = num
            total += num
            total_sub += num
        hits[source]['total'] = total_sub

    hits['total'] = total
    return hits, []
Example 15
from simple_elastic import ElasticIndex
from kafkaflows.digi.user_data import swissbib, aleph, e_codices, e_manuscripta, e_rara, opac

host = 'http://localhost:9200'

e_codices_index = ElasticIndex('e-codices-data', 'hits', url=host)
e_manuscripta_index = ElasticIndex('e-manuscripta-data', 'hits', url=host)
e_rara_index = ElasticIndex('e-rara-data', 'hits', url=host)
aleph_index = ElasticIndex('aleph-dsv01-data', 'hits', url=host)
opac_index = ElasticIndex('opac-access', 'log', url=host)


def enrich_user_data(config):

    for index in config['indexes']:
        instance = ElasticIndex(**index['index'])

        query = {'query': {'match_all': {}}}

        for results in instance.scroll(query=query):
            for item in results:
                identifier = item['identifier']
                database = item['database']
                sys_number = item['identifiers'][database]

                if 'error_tags' in item:
                    item['error_tags'] = set(item['error_tags'])

                total = 0

                # swissbib
Example 16
from simple_elastic import ElasticIndex
from collections import Counter

from roman import fromRoman, InvalidRomanNumeralError, romanNumeralPattern
import json
import re


find_roman_numeral = re.compile('([MCLXVI]+)[^a-z]')
roman_numeral = re.compile('^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$')

if __name__ == '__main__':
    index = ElasticIndex('kafka*', 'record')

    c = Counter()
    alt_c = Counter()

    query = {
        '_source': ['extent.coverage', 'c-format'],
        'query': {
            'exists': {
                'field': 'extent.coverage'
            }
        }
    }

    missing = open('missing.txt', 'w')
    find_all_result = open('find_all.txt', 'w')

    find_all = list()
    descriptive_coverage = list()
Example 17
def setup_class(self):
    self.index = ElasticIndex('test')
Example 18
class TestElasticIndex(object):

    def setup_class(self):
        self.index = ElasticIndex('test')

    def teardown_class(self):
        self.index.delete_index()

    def test_scroll(self):
        self.index.index_into({'test': True}, 1)
        self.index.index_into({'test': False}, 2)
        self.index.index_into({'test': True}, 3)
        self.index.index_into({'test': False}, 4)
        for i in self.index.scroll():
            assert isinstance(i, list)

    def test_index_into(self):

        result = self.index.index_into({'test': True, 'object': "This is a string"}, 5)
        assert result
        result = self.index.index_into({'test': True, 'object': {'sub-object': "another string"}}, 6)
        assert not result
        result = self.index.index_into({'test': False}, 'HAN000827182')
        assert result

    def test_search(self):
        data = list()
        data.append({'id': '1234', 'test': True})
        self.index.bulk(data=data, identifier_key='id')
        result = self.index.search()
        assert len(result) == 7

    def test_search_not_unpack(self):
        result = self.index.search(unpack=False)
        assert len(result) == 7

    def test_alias(self):
        self.index.add_to_alias('test1')
        assert self.index.instance.indices.get_alias('test1')
        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            self.index.instance.indices.get_alias('test1')

    def test_count(self):
        result = self.index.count()
        assert result == 7
from openpyxl import load_workbook
from simple_elastic import ElasticIndex

if __name__ == '__main__':

    wb = load_workbook(
        'data/VERZ_DSV01-Ausleihen-Vormerkungen_20180802_bmt.xlsx')
    ws = wb['vor-1900-vormerkungen']

    index = ElasticIndex('aleph-dsv01-data', 'hits')

    all_data = dict()

    for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
        doc = dict()
        system_number = str(row[0].value)
        while len(system_number) != 9:
            system_number = '0' + system_number

        if system_number not in all_data:
            all_data[system_number] = dict()
        if 'reservations' not in all_data[system_number]:
            all_data[system_number]['reservations'] = dict()
        all_data[system_number]['reservations'][str(
            row[2].value)] = row[1].value

    ws = wb['vor-1900-ausleihen']

    for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
        doc = dict()
        system_number = str(row[0].value)
Example 20
    def transform_dni_to_contributor(self,
                                     element,
                                     parent,
                                     edoc_tag,
                                     index='',
                                     doc_type='',
                                     url='',
                                     fdb_index='',
                                     fdb_doc_type='',
                                     fdb_url=''):
        """Uses the given DNI to add a full contributor.

        With the given DNI first edoc dataservice will be searched for a match. If found all data from this
        contributor is copied over.

        If the edoc dataservice turns up empty the RDB Persons Database is searched. If found all data is copied over.
        """
        if element.text is not None:
            es = ElasticIndex(index, doc_type, url=url)
            query = {
                '_source': ['contributors'],
                'query': {
                    'term': {
                        'contributors.dni.keyword': {
                            'value': int(element.text)
                        }
                    }
                }
            }
            results = es.scan_index(query)
            if len(results) > 0:
                # returns all contributors. Only add the one with the right DNI.
                for contrib in results[0]['contributors']:
                    if 'dni' in contrib and str(
                            contrib['dni']) == element.text:
                        contributor = parent.find('./contributor')
                        if contributor is None:
                            contributor = ET.SubElement(parent, 'contributor')
                        item = ET.SubElement(contributor, 'item')
                        ET.SubElement(item, 'dni').text = str(
                            contrib['dni']).strip()
                        name = ET.SubElement(item, 'name')
                        ET.SubElement(name, 'given').text = str(
                            contrib['name']['given']).strip()
                        ET.SubElement(name, 'family').text = str(
                            contrib['name']['family']).strip()
                        if 'id' in contrib:
                            ET.SubElement(item, 'id').text = str(
                                contrib['id']).strip()
                        if 'orcid' in contrib:
                            ET.SubElement(item, 'orcid').text = str(
                                contrib['orcid']).strip()
                        if 'unibasChPublicId' in contrib:
                            ET.SubElement(item, 'unibasChPublicId').text = str(
                                contrib['unibasChPublicId']).strip()
            else:  # len(results) == 0
                # try to search it in RDB persons database.
                fdb = ElasticIndex(fdb_index, fdb_doc_type, url=fdb_url)
                query = {
                    'query': {
                        'term': {
                            'dni.keyword': {
                                'value': int(element.text)
                            }
                        }
                    }
                }
                fdb_results = fdb.scan_index(query)
                if len(fdb_results) == 0:
                    self.logger.error('Could not find an author with dni %s.',
                                      element.text)
                elif len(fdb_results) == 1:
                    # in case of a single find => add the contributor to the list.
                    contributor = parent.find('./contributor')
                    if contributor is None:
                        contributor = ET.SubElement(parent, 'contributor')
                    item = ET.SubElement(contributor, 'item')
                    r = fdb_results[0]
                    ET.SubElement(item, 'id').text = r['email'].strip()
                    if 'unibasCHpublicId' in r:
                        ET.SubElement(
                            item,
                            'unibasChPublicId').text = r['unibasCHpublicId']
                    if 'orcid' in r:
                        ET.SubElement(item, 'orcid').text = r['orcid'].strip()
                    ET.SubElement(item, 'dni').text = str(r['dni']).strip()
                    name = ET.SubElement(item, 'name')
                    ET.SubElement(name, 'given').text = r['firstname'].strip()
                    ET.SubElement(name, 'family').text = r['lastname'].strip()
                else:
                    # Should never happen...
                    self.logger.critical(
                        'Found several persons with DNI %s in RDB.',
                        element.text)
        else:
            self.logger.error('A DNI in element %s is None.', self.current_id)
Example 21
class TransformSruExport(DataTransformation):
    def __init__(self, database, config, logger=logging.getLogger(__name__)):
        super().__init__(logger)
        self._database = database
        self._config = config
        self.marc = None
        self.digidata_index = ElasticIndex(**config['digidata'])
        self.page_conversion_rates = config['page-conversions']

    def transform(self, value: str) -> dict:
        # Do not reorder this function!
        self.marc = MARCMapper(value)
        self.marc.add_value('database', self._database)
        self.marc.identifier()

        for field in self.marc.get_fields('024'):
            if field.indicator1 == '7':
                if 'a' in field and '2' in field:
                    self.marc.add_identifier(field['2'], field['a'])

        self.marc.add_identifier('swissbib', self.marc['001'].value())

        if self._database == 'dsv01':
            for _035 in self.marc.get_fields('035'):
                if _035['a'] is not None:
                    if _035['a'].startswith('(IDSBB)'):
                        self.marc.add_identifier('dsv01',
                                                 _035['a'].split(')')[1])
        elif self._database == 'dsv05':
            self.marc.add_identifier('dsv05', self.marc['001'].value()[3:])

        # Do not re-order these!
        self.enrich()

        self.parse_record_type()
        self.parse_date()
        self.parse_format_codes()
        self.parse_number_of_pages()
        self.parse_call_number()

        self.parse_additional_information()

        return self.marc.result

    def enrich(self):
        """Enriching the metadata from other data sources."""
        self.enrich_digidata()

    def enrich_digidata(self):
        """Loads data from the digidata elastic index.

        No live updates, as the digidata repository is on Afrikaportal-elastic,
        which is only accessible on localhost. To update, run the digispace-producer & digispace-consumer.

        TODO: Load live data instead of copy. To do this direct access to Afrikaportal is necessary.
        """
        query = {
            "query": {
                "term": {
                    "system_number":
                    self.marc.result['identifiers'][self._database]
                }
            }
        }
        result = self.digidata_index.search(query=query)
        if len(result) > 0:
            self.marc.add_value('is_digitized', True)
            if 'number_of_images' in result[0]:
                self.marc.add_value('number_of_images',
                                    result[0]['number_of_images'])
        else:
            self.marc.add_value('is_digitized', False)

    def parse_date(self):
        """Parsing the date from the various possible fields. Stores where the information was taken from."""
        _008_date = self.marc.parse_date_from_008()
        _046_date = self.marc.parse_date_from_046()
        if _008_date:
            year = self.marc.result['dates']['date']['year']
            self.marc.add_value_sub('final', 'year', int(year))
            self.marc.append_value_sub('final', 'century', int(year / 100) + 1)
            self.marc.append_value_sub('final', 'decade', year - year % 10)
            self.marc.add_value_sub('source', 'year', '008')
        elif _046_date:
            year = self.marc.result['dates']['exact']['year']
            self.marc.add_value_sub('final', 'year', int(year))
            self.marc.append_value_sub('final', 'century', int(year / 100) + 1)
            self.marc.append_value_sub('final', 'decade', year - year % 10)
            self.marc.add_value_sub('source', 'year', '046')
        elif self.marc.parse_date_from_264():
            year = self.marc.result['dates']['parsed_264_year']
            self.marc.add_value_sub('final', 'year', int(year))
            self.marc.append_value_sub('final', 'century', int(year / 100) + 1)
            self.marc.append_value_sub('final', 'decade', year - year % 10)
            self.marc.add_value_sub('source', 'year', '264')
        else:
            self.marc.add_value_sub('source', 'year', 'None')
            self.marc.add_error_tag('_no_valid_date')

    def parse_number_of_pages(self):
        """Figure out the number of pages!

        First source: digidata number of images.
        Second source: coverage
        Third source: estimates.
        """

        self.marc.parse_field_to_subfield('300', 'a', 'extent', 'coverage')
        pages = 0
        name = Units.No
        # This will be filtered anyway.
        if self.marc.result['c-format'] in [
                'Objekt', 'Diverse Tonformate', 'Schallplatte',
                'Diverse Filmformate', 'Datenbank'
        ]:
            pages = 1
            name = Units.Gegenstand
            self.marc.add_value_sub('source', 'pages', 'format')

        if name == Units.No:
            pages, name = self.parse_coverage_field()

            if name == Units.No:
                raise ValueError('Name should not be None here: {}. {}'.format(
                    self.marc.result['identifier'], pages))

            if name != Units.Seiten:
                self.marc.add_value_sub('source', 'pages', 'estimate')
                self.marc.add_value_sub('source', 'estimate', name.value)
                pages = pages * self.page_conversion_rates[name.value]
            else:
                self.marc.add_value_sub('source', 'pages', 'coverage')

            self.marc.add_value_sub('extent', 'pages', pages)

        if 'number_of_images' in self.marc.result:
            pages = self.marc.result['number_of_images']
            self.marc.add_value_sub('source', 'pages', 'digidata')
            if 'estimate' in self.marc.result['source']:
                del self.marc.result['source']['estimate']

        self.marc.add_value_sub('final', 'pages', pages)

    def parse_coverage_field(self) -> Tuple[Union[float, int], Units]:
        """Parses various values from the coverage field and returns them as tuple:

        (number of unit, name of unit)
        """
        if 'coverage' in self.marc.result['extent']:
            coverage = self.marc.result['extent']['coverage']
        else:
            coverage = None
        swissbib_format = self.marc.result['c-format']

        if swissbib_format in ['Klavierauszug', 'Partitur', 'Noten']:
            return self.parse_partituren(coverage)
        elif swissbib_format in ['Atlas', 'Karte', 'Diverse Kartenformate']:
            return self.parse_maps(coverage)
        elif swissbib_format in ['Brief', 'Briefsammlung']:
            return self.parse_letters(coverage)
        elif swissbib_format in ['Diverse Bildformate', 'Fotografie']:
            return self.parse_fotos(coverage)
        elif swissbib_format in [
                'Gesamtwerk', 'Buch', 'Verfassung / Gesetz', 'Artikel'
        ]:
            return self.parse_books(coverage, swissbib_format)
        elif swissbib_format in ['Handschrift']:
            return self.parse_manuscript(coverage)
        elif swissbib_format in ['Dossier']:
            return self.parse_dossier(coverage)
        elif swissbib_format in ['Zeitung', 'Zeitschrift / Schriftenreihe']:
            # TODO: Bessere implementierung von Zeitschriften.
            if coverage is None:
                return 1, Units.Periodikum

            num, name = parse_volumes(coverage, Units.Band)
            if num > 0:
                return num, name

            year = None
            to = None
            if 'dates' in self.marc.result:
                if 'date' in self.marc.result['dates']:
                    if 'year' in self.marc.result['dates']['date']:
                        year = self.marc.result['dates']['date']['year']
                    if 'to' in self.marc.result['dates']['date']:
                        to = self.marc.result['dates']['date']['to']

            if year is not None and to is not None:
                return year - to, Units.Band
            elif year is not None:
                return 1, Units.Band
            else:
                return 1, Units.Periodikum
        else:
            logging.error(
                'Could not parse %s, with coverage %s and format %s.',
                self.marc.result['identifier'], coverage, swissbib_format)
            return 1, Units.Seiten

    def parse_partituren(self,
                         coverage: str) -> Tuple[Union[float, int], Units]:
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Partitur

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        # Check the plural first so it is not shadowed by the singular match.
        stimmen = re.match('Stimmen', coverage)
        if stimmen:
            return 3, Units.Stimmen

        stimmen = re.match('Stimme', coverage)
        if stimmen:
            return 1, Units.Stimmen

        num, name = parse_volumes(coverage, Units.Partitur)

        # findall with a single capturing group returns plain strings.
        results = re.findall('(\d+) Stimme[n]', coverage)
        for result in results:
            num += int(result) / 2

        results = re.findall('(\d+) (Abt|B|C|H$|He|K|[Pp]art|Ser|T|[Vv]ol)',
                             coverage)
        for result in results:
            num += int(result[0])

        if num > 0:
            return num, Units.Partitur

        num, name = parse_meters(coverage)
        if num > 0:
            return num, name

        return 1, Units.Partitur

    def parse_maps(self, coverage: str) -> Tuple[Union[float, int], Units]:
        if coverage is None or empty.fullmatch(coverage):
            return 4, Units.Karten

        num, name = parse_pages(coverage)
        if name == Units.Seiten:
            return num, name

        maps_matches = re.findall(
            '(\d+) ([Kc]arte[n]?|Pl[äa]n[e]?|Vogel|Ansicht|Panorama|Manuskript)',
            coverage)
        maps = 0
        for matches in maps_matches:
            maps += int(matches[0])
        if maps > 0:
            return maps, Units.Karten

        atlas_matches = re.findall('(\d+) (Atlas)', coverage)

        atlas = 0
        for match in atlas_matches:
            atlas += int(match[0])
        if atlas > 0:
            return atlas, Units.Band
        folders, name = parse_folders(coverage, Units.Kartenmappen)
        if folders > 0:
            return folders, name

        return 4, Units.Karten

    def parse_letters(self, coverage: str) -> Tuple[Union[float, int], Units]:
        if coverage is None or empty.fullmatch(coverage):
            return 2, Units.Briefe

        pages, name = parse_pages(coverage)

        results = re.findall(
            '(\d+) (Karte|Briefkarte|Postkarte|Ansichtskarte|Visitenkarte)',
            coverage)
        for result in results:
            pages += int(result[0])

        result = re.match('Briefkarte|Postkarte|Zettel|Karte|Visitenkarte',
                          coverage)
        if result:
            pages += 1

        if pages > 0:
            return pages, Units.Seiten

        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        volumes, name = parse_volumes(coverage, Units.Briefband)
        if volumes > 0:
            return volumes, name

        folders, name = parse_folders(coverage, Units.Briefmappen)
        if folders > 0:
            return folders, name

        return 2, Units.Briefe

    def parse_fotos(self, coverage: str) -> Tuple[int, Units]:
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Seiten

        pages, name = parse_pages(coverage)

        results = re.findall(
            '(\d+) (Kupferstich|Litho|Foto|Zeichnung|Repro|Holzschnitt|Schattenriss'
            '|Aquarell|Druckgrafik(en)?|Physionotrace|Bild|Stück|Radierung)',
            coverage)
        for result in results:
            pages += int(result[0])

        if pages > 0:
            return pages, Units.Seiten

        folders, name = parse_folders(coverage, Units.Fotomappen)
        if folders > 0:
            return folders, name

        return 1, Units.Seiten

    def parse_books(self, coverage: str,
                    swissbib_format: str) -> Tuple[int, Units]:
        if swissbib_format == 'Artikel':
            return_type = Units.Artikel
        else:
            return_type = Units.Band

        if coverage is None or empty.fullmatch(coverage):
            return 1, return_type

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        volumes, name = parse_volumes(coverage, return_type)
        if volumes > 0:
            return volumes, name

        return 1, return_type

    def parse_manuscript(self,
                         coverage: str) -> Tuple[Union[float, int], Units]:
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Faszikel

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        volumes, name = parse_volumes(coverage, Units.Manuskriptband)
        if volumes > 0:
            return volumes, name

        folders, name = parse_folders(coverage, Units.Faszikel)

        results = re.findall(
            '(\d+) (Stücke|Papiertüte[n]?|Faszikel|Dossier|Broschüre|Zeichenbuch|'
            'Heft(e|chen)?|Schuber|Bündel|Konvolut|Schulheft|Umschläge|Büchlein|Umschlag|Predigten)',
            coverage)
        for result in results:
            folders += int(result[0])

        if folders > 0:
            return folders, Units.Faszikel

        num, name = parse_boxes(coverage)

        if num > 0:
            return num, name

        letters, name = parse_letters(coverage)

        if letters > 0:
            return letters, name

        return 1, Units.Faszikel

    def parse_dossier(self, coverage: str) -> Tuple[Union[int, float], Units]:
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Archiveinheit

        pages, name = parse_pages(coverage)
        if pages > 0:
            return pages, name

        volumes, name = parse_volumes(coverage, Units.Band)
        if volumes > 0:
            return volumes, name

        boxes, name = parse_boxes(coverage)
        if boxes > 0:
            return boxes, name

        folders, name = parse_folders(coverage, Units.Mappen)
        if folders > 0:
            return folders, name

        lfm, name = parse_meters(coverage)
        if lfm > 0:
            return lfm, name

        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        archives, name = parse_archive(coverage, Units.Archiveinheit)
        if archives > 0:
            return archives, name

        return 1, Units.Archiveinheit

    def parse_record_type(self):
        """Defines a general type for the record.

        This is used to distinguish between prints and handwritten manuscripts.
        """
        self.marc.parse_field('245', 'h', 'print_material')

        if self._database == 'dsv01':
            self.marc.add_value_sub('final', 'type', 'print')
        elif 'print_material' in self.marc.result:
            if self.marc.result['print_material'] in [
                    'Noten', 'Bildmaterial', 'Druckschrift', 'Kartenmaterial'
            ]:
                self.marc.add_value_sub('final', 'type', 'print')
            elif self.marc.result['print_material'] in [
                    'Ton', 'Mikroform', 'Gegenstand', 'Filmmaterial'
            ]:
                self.marc.add_value_sub('final', 'type', 'other')
            elif self.marc.result['print_material'] in [
                    'Manuskript', 'Notenmanuskript'
            ]:
                self.marc.add_value_sub('final', 'type', 'manuscript')
            else:
                self.marc.add_value_sub('final', 'type', 'other')
                self.marc.add_error_tag('_unknown_print_material')
                logging.warning('Unknown print material: %s in %s.',
                                self.marc.result['print_material'],
                                self.marc.result['identifier'])
        else:
            self.marc.add_value_sub('final', 'type', 'manuscript')

    def parse_call_number(self):
        """Parses the call number of this record has.

        Adds the library it belongs to as well. The call number is further
        indexed in parts to create facets.

        Only books from A100 & A125 are used.

        Books older than 1920 are very rare in A140 (UB Medizin).
        The books in A130 (Altertum) are ignored, because there are not that many, and it would
        be necessary to further filter the books from UBH.

        # TODO: Implement a way to process all the call numbers, since one title
        # can have many of them.
        # currently just picks the first one.
        # books can have multiple call numbers for two reasons:
        # 1. The library owns more than one item.
        # 2. The bibliographic record describes multiple parts of one title.
        """
        for field in self.marc.get_fields('949'):

            if field['F'] in ['A100', 'A125']:
                self.marc.append_value('library', field['F'])
                if field['j'] != '':
                    self.marc.append_value('call_number', field['j'])

        if 'call_number' in self.marc.result:
            results = self.create_call_number_filter()
            if results is not None:
                self.marc.add_value_sub('filter', 'prefix', results[0])
                if results[1] is not None:
                    self.marc.add_value_sub('filter', 'base', results[1])
                if results[2] is not None:
                    self.marc.add_value_sub('filter', 'second', results[2])
                self.marc.add_value_sub('filter', 'number', results[3])

    def create_call_number_filter(
            self) -> Optional[Tuple[str, Optional[str], Optional[str], str]]:
        call_number = ''
        if len(self.marc.result['call_number']) == 1:
            call_number = self.marc.result['call_number'][0]
        else:
            for call_n in self.marc.result['call_number']:
                if call_n is not None:
                    if call_n.startswith('UBH'):
                        call_number = call_n

        call_number = re.sub('\s+', ' ', call_number.strip())

        database = self.marc.result['database']

        if database == 'dsv05' and call_number != '':
            call_number = 'HAN ' + call_number

        if call_number == '':
            # remove call number if it is empty.
            del self.marc.result['call_number']
            return None

        if not re.match('(UBH|HAN)', call_number) or re.fullmatch(
                'UBH', call_number):
            # ignore anything which does not comply with convention.
            return None

        simple = re.fullmatch('(\w+) ([\w\-*.]+) (\d+)(.*)?', call_number)
        if simple:
            return simple.group(1), simple.group(2), None, (
                simple.group(3) + simple.group(4)).strip()

        word_roman = re.fullmatch(
            '(\w+) (\w+) ([MCLXVI]+[ ]?[a-z]?) (\d+)(.*)?', call_number)
        if word_roman:
            return word_roman.group(1), \
                   word_roman.group(2), \
                   word_roman.group(3),  \
                   (word_roman.group(4) + word_roman.group(5)).strip()

        double_word_roman = re.fullmatch(
            '(\w+) ([\w\-*]+) ([\w\-*]+) ([MCLXVI]+[ ]?[a-z]?) (\d+)(.*)?',
            call_number)
        if double_word_roman:
            return double_word_roman.group(1), \
                   double_word_roman.group(2) + ' ' + double_word_roman.group(3), \
                   double_word_roman.group(4), \
                   double_word_roman.group(5)

        three_word = re.fullmatch(
            '(\w+) ([\w\-*]+) ([\w\-*]+) ([A-Za-z\-*]+)(.*)?', call_number)
        if three_word:
            return three_word.group(1), \
                   three_word.group(2) + ' ' + three_word.group(3), \
                   three_word.group(4), \
                   three_word.group(5).strip()

        double_word = re.fullmatch('(\w+) ([\w\-*]+) ([\w\-*]+)(.*)?',
                                   call_number)
        if double_word:
            return double_word.group(1), double_word.group(
                2), double_word.group(3), double_word.group(4).strip()

        rest_han = re.fullmatch('(HAN) (.*)', call_number)
        if rest_han:
            return rest_han.group(1), None, None, rest_han.group(2).strip()

        rest_ubh = re.fullmatch('(UBH) (.*)', call_number)
        if rest_ubh:
            return rest_ubh.group(1), None, None, rest_ubh.group(2).strip()

    def parse_format_codes(self):
        """Parse the format codes and replace them with human readable forms.

        The c-format, the most condensed value is used as format.
        """
        self.marc.parse_field('898', 'a', 'a-format')
        if 'a-format' in self.marc.result:
            self.marc.result['a-format'] = format_dict[
                self.marc.result['a-format']]
        self.marc.parse_field('898', 'b', 'b-format')
        if 'b-format' in self.marc.result:
            self.marc.result['b-format'] = format_dict[
                self.marc.result['b-format']]
        self.marc.parse_field('898', 'c', 'c-format')
        if 'c-format' in self.marc.result:
            self.marc.result['c-format'] = format_dict[
                self.marc.result['c-format']]
            self.marc.add_value_sub('final', 'format',
                                    self.marc.result['c-format'])

    def parse_additional_information(self):
        """Information which might be interesting in the future, but not needed for current analysis."""
        self.marc.parse_leader()

        self.marc.parse_cat_date()

        self.marc.parse_rest_008()

        self.marc.parse_field('245', 'a', 'title')
        self.marc.parse_field('245', 'b', 'subtitle')
        self.marc.parse_field('245', 'c', 'author')

        self.marc.parse_field_to_subfield('264', 'a', 'production', 'place')
        self.marc.parse_field_to_subfield('264', 'b', 'production',
                                          'publisher')
        self.marc.parse_field_to_subfield('264', 'c', 'production', 'date')

        self.marc.parse_field_to_subfield('300', 'b', 'extent',
                                          'physical_attributes')
        self.marc.parse_field_to_subfield('300', 'c', 'extent',
                                          'size_and_format')
        self.marc.parse_field_to_subfield('300', 'e', 'extent',
                                          'additional_content')

        self.marc.parse_field_append_to_subfield('336', 'a', 'extent',
                                                 'content')
        self.marc.parse_field_append_to_subfield('337', 'a', 'extent', 'media')
        self.marc.parse_field_append_to_subfield('338', 'a', 'extent',
                                                 'carrier')
        self.marc.parse_field_to_subfield('348', 'a', 'extent', 'music')

        self.marc.parse_field('351', 'c', 'classification')

        self.marc.parse_field('250', 'a', 'version')

        self.marc.parse_field_to_subfield('340', 'a', 'extent', 'carrier')

        self.marc.parse_field_list([
            '600', '610', '611', '630', '648', '650', '651', '653', '655',
            '690', '691'
        ], {
            'a': 'title',
            '2': 'source',
            '0': 'identifier'
        }, 'subject_headings')

        self.marc.parse_field('856', 'u', 'link')
        self.marc.parse_field_to_subfield('908', 'a', 'extent', 'format')
        self.marc.parse_field('909', 'a', 'archive_tag')

        if 'date' in self.marc.result['production']:
            self.marc.result['final']['display_date'] = self.marc.result[
                'production']['date']

    def pre_filter(self, message: str) -> bool:
        """Keep only records which belong to Universitätsbibliothek Basel."""
        if re.search('{"F": "(A100|A125)"},', message):
            return False
        else:
            return True

    def post_filter(self, transformed_message: dict) -> bool:
        # Remove any record which is newer than 1920.
        if 'year' in transformed_message['final']:
            if int(transformed_message['final']['year']) > 1920:
                return True

        # Remove records of special formats.
        if transformed_message['final']['format'] in [
                'Objekt', 'Diverse Tonformate', 'Schallplatte',
                'Diverse Filmformate', 'Datenbank'
        ]:
            return True

        return False
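
To illustrate the call-number splitting performed by create_call_number_filter above in isolation, here is a small standalone sketch that reuses the same regular expressions; the shelf marks are made up for demonstration.

# Standalone sketch of the call-number splitting used above; shelf marks are hypothetical.
import re

def split_call_number(call_number):
    call_number = re.sub(r'\s+', ' ', call_number.strip())
    simple = re.fullmatch(r'(\w+) ([\w\-*.]+) (\d+)(.*)?', call_number)
    if simple:
        return simple.group(1), simple.group(2), None, (simple.group(3) + simple.group(4)).strip()
    word_roman = re.fullmatch(r'(\w+) (\w+) ([MCLXVI]+[ ]?[a-z]?) (\d+)(.*)?', call_number)
    if word_roman:
        return (word_roman.group(1), word_roman.group(2), word_roman.group(3),
                (word_roman.group(4) + word_roman.group(5)).strip())
    return None

print(split_call_number('UBH Rb 1234'))        # ('UBH', 'Rb', None, '1234')
print(split_call_number('UBH Autogr XII 34'))  # ('UBH', 'Autogr', 'XII', '34')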
Example 22
from simple_elastic import ElasticIndex
from collections import Counter
from multiprocessing import Pool
import re
import json

if __name__ == '__main__':
    url_counter = Counter()
    domain_counter = Counter()
    counter = Counter()
    hold = Counter()
    copy = Counter()

    index = ElasticIndex('swissbib-*', 'logs')

    query = {
        '_source': ['search_params.trackurl'],
        'query': {
            'exists': {
                'field': 'search_params.trackurl'
            }
        }
    }

    with open('common-urls.json', 'r') as fp:
        common_urls = json.load(fp)

    pool = Pool(processes=4)
    process_results = list()
    for results in index.scroll(query=query, size=10000):
from simple_elastic import ElasticIndex
import json

if __name__ == '__main__':
    data = ElasticIndex('e-codices-data', 'hits')
    data.dump(".")
Example 24
    with open('suppl/accepted-dois.json', 'w') as fp:
        json.dump(accepted_dois, fp, indent=4, ensure_ascii=False)

    with open('suppl/rejected-dois.json', 'w') as fp:
        json.dump(rejected_dois, fp, indent=4, ensure_ascii=False)

    with open('suppl/call-numbers.json', 'w') as fp:
        json.dump(sorted(list(call_numbers_encoded)),
                  fp,
                  indent=4,
                  ensure_ascii=False)

    with open('suppl/output.json', 'w') as fp:
        json.dump(collect_stems, fp, indent=2, ensure_ascii=False)
    target = ElasticIndex('e-codices-data', 'hits')
    index = ElasticIndex('kafka-dsv05-*', 'record')
    for key in collect_stems:
        if key == '0001':
            # TODO: A combined manuscript.
            continue

        item = dict()

        cn = transform_call_number(key)
        query = {
            '_source': ['call_number', 'identifiers.*'],
            'query': {
                'term': {
                    'call_number.keyword': cn
                }
Example 25
        for v in mapping[sys_number]['vlids']:
            for y in ['2016', '2017', '2018']:
                if sys_number not in result:
                    result[sys_number] = dict()
                if p not in result[sys_number]:
                    result[sys_number][p] = dict()
                if v in vlids[p][y]:
                    if y not in result[sys_number]:
                        result[sys_number][p][y] = vlids[p][y][v]['page-views']
                    else:
                        result[sys_number][p][y] += vlids[p][y][v][
                            'page-views']
                else:
                    if y not in result[sys_number][p]:
                        result[sys_number][p][y] = 0

    elastic_data = list()
    for sys_number in result:
        item = dict()
        item['bau'] = dict()
        total = 0
        for y in result[sys_number]['erara-bau']:
            item['bau'][y] = result[sys_number]['erara-bau'][y]
            total += item['bau'][y]
        item['bau']['total'] = total
        item['identifier'] = sys_number
        elastic_data.append(item)

    index = ElasticIndex('e-rara-data', 'hits')
    index.bulk(elastic_data, 'identifier')
class BulkElasticConsumer(AbstractBaseConsumer):
    """
    Attempts to collect a batch of messages and then bulk-indexes them. Collection stops either
    after a short poll timeout or once 10'000 messages have been gathered.


    Consumer:
      bootstrap_servers: localhost:9092
      client_id: test
      group_id: elastic-consumer-test
      auto_offset_reset: earliest
    Topics:
      - test
    ElasticIndex:
      index: name-of-index
      doc_type: _doc (default value for elasticsearch 6)
      url: http://localhost:9200
      timeout: 300
    IdentifierKey: name-of-key-value (optional, if not specified the Kafka key value will be used.)
    """
    def __init__(self,
                 config,
                 config_class=ElasticConsumerConfig,
                 logger=logging.getLogger(__name__)):
        super().__init__(config, config_class, logger=logger)
        self._index = ElasticIndex(**self.configuration.elastic_settings)
        self._key = self.configuration.key

    @property
    def configuration(self) -> ElasticConsumerConfig:
        return super().configuration

    def consume(self) -> bool:
        data = list()
        messages = self._consumer.poll(100, 10000)
        if messages:
            # TODO: Only works if there is a single partition per consumer. As soon as the number of consumers is lower
            # TODO: or higher than the number of partitions this fails.
            for message in messages[self._consumer.assignment().pop()]:
                key = message.key.decode('utf-8')
                try:
                    value = json.loads(message.value.decode('utf-8'))
                except JSONDecodeError as ex:
                    self._error_logger.error(
                        "Failed to JSONDecode message: {}.".format(
                            message.value.decode('utf-8')))
                    value = {
                        'message': message.value.decode('utf-8'),
                        'error': '{}'.format(ex)
                    }
                if self._key not in value:
                    value['_key'] = key
                data.append(value)
        now = time.time()
        if len(data) > 0:
            result = self._index.bulk(data,
                                      self._key,
                                      op_type=self.configuration.op_type,
                                      upsert=self.configuration.upsert)
            then = time.time()
            amount = then - now
            self._time_logger.info(
                "Success! Indexed {} messages to {} in {} seconds.".format(
                    len(data), self._index.index, amount))
        else:
            result = False

        if result:
            for assignment in self._consumer.assignment():
                pos = self._consumer.position(assignment)
                if pos != self._consumer.committed(assignment):
                    self._consumer.commit(
                        {assignment: OffsetAndMetadata(pos, "")})

        return result
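
A comparable usage sketch for the bulk consumer (again not part of the original examples): the config file name is hypothetical and would additionally carry the optional IdentifierKey entry described in the docstring.

# Hypothetical driver loop for the BulkElasticConsumer defined above.
import time

consumer = BulkElasticConsumer('configs/elastic/bulk_consumer.yml')
while True:
    indexed = consumer.consume()  # polls a batch of messages and bulk-indexes it
    if not indexed:
        time.sleep(1)  # nothing consumed or indexing failed; back off briefly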
from simple_elastic import ElasticIndex

from datetime import date

dsv01_full_export = ElasticIndex('dsv01-sys-numbers-before-1900', 'record')

with open('data/dsv01_system_numbers_vor_1900_arc_export_20180802.csv',
          'r',
          encoding='utf-16') as file:
    for line in file:
        year, sys_number = line.split(',')
        doc = dict()
        while len(sys_number) != 10:
            sys_number = '0' + sys_number
        doc['system_number'] = sys_number.strip()
        doc['publication_date'] = year
        doc['index_date'] = date.today().isoformat()
        dsv01_full_export.index_into(doc, doc['system_number'])
Example 28
def enrich_user_data(config):

    for index in config['indexes']:
        instance = ElasticIndex(**index['index'])

        query = {'query': {'match_all': {}}}

        for results in instance.scroll(query=query):
            for item in results:
                identifier = item['identifier']
                database = item['database']
                sys_number = item['identifiers'][database]

                if 'error_tags' in item:
                    item['error_tags'] = set(item['error_tags'])

                total = 0

                # swissbib
                hits, error_tags = swissbib.enrich(identifier)
                item['hits']['swissbib'] = hits
                total += hits['total']
                for tag in error_tags:
                    item['error_tags'].add(tag)

                # opac
                hits, error_tags = opac.enrich(opac_index, sys_number)
                item['hits']['opac-access'] = hits
                total += hits['total']
                for tag in error_tags:
                    item['error_tags'].add(tag)

                # aleph
                hits, error_tags = aleph.enrich(aleph_index, sys_number,
                                                database)
                item['hits']['aleph'] = hits
                total += hits['loans']['total']
                for tag in error_tags:
                    item['error_tags'].add(tag)

                if database == 'dsv05':
                    # e-rara
                    hits, error_tags = e_rara.enrich(e_rara_index, sys_number)
                    item['hits']['e-rara'] = hits
                    total += hits['bau']['total']
                    for tag in error_tags:
                        item['error_tags'].add(tag)

                    # e-manuscripta
                    hits, error_tags = e_manuscripta.enrich(
                        e_manuscripta_index, sys_number)
                    item['hits']['e-manuscripta'] = hits
                    total += hits['bau']['total']
                    total += hits['swa']['total']
                    for tag in error_tags:
                        item['error_tags'].add(tag)

                    # e-codices
                    hits, doi, error_tags = e_codices.enrich(
                        e_codices_index, sys_number)
                    item['hits']['e-codices'] = hits
                    total += hits['total']
                    for tag in error_tags:
                        item['error_tags'].add(tag)

                    if doi is not None:
                        if 'doi' in item['identifiers']:
                            if isinstance(item['identifiers']['doi'], list):
                                item['identifiers']['doi'].append(doi)
                            else:
                                item['identifiers']['doi'] = [
                                    item['identifiers']['doi'], doi
                                ]

                # e-mails dsv05
                # TODO

                item['error_tags'] = list(item['error_tags'])

                item['hits']['total'] = total

                instance.index_into(item, item['identifier'])
Example 29
                index_object = captures
                index_object['identifier'] = identifier
                identifier += 1

                date = re.match(
                    '(?P<day>\d+)\/(?P<month>\w+)\/(?P<year>\d+):'
                    '(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+) .*$',
                    captures['timestamp'])
                if date:
                    dates = date.groupdict()
                    index_object['date_parts'] = dates
                else:
                    logging.warning('Could not parse date time in line "%s".',
                                    line)
            else:
                logging.warning('Could not parse line "%s".', line)

            if index_object is not None:
                index_objects.append(index_object)

            count += 1

            if count == 500:
                index = ElasticIndex('opac-access', 'log')
                index.bulk(index_objects, identifier_key='identifier')
                count = 0
                index_objects.clear()

        index = ElasticIndex('opac-access', 'log')
        index.bulk(index_objects, identifier_key='identifier')
from simple_elastic import ElasticIndex
from collections import Counter
import json

if __name__ == '__main__':
    c = Counter()

    index = ElasticIndex('kafka*', 'record')
    with open('data/collected-hits-e-plattforms.json', 'r') as fp:
        data = json.load(fp)
        for sys_number in data:
            query = {
                'query': {
                    'bool': {
                        'should': [{
                            'match': {
                                'identifiers.dsv05': sys_number
                            }
                        }, {
                            'match': {
                                'identifiers.dsv01': sys_number
                            }
                        }],
                        'minimum_should_match':
                        1
                    }
                }
            }
            results = index.scan_index(query=query)

            if len(results) == 1: