Example #1
0
    def __init__(self, bib_object, work, manifestation, expression, buffer):
        """Create the Polona item for one bibliographic record and dump it.

        Reads control field 001 and subfield 856 $u from ``bib_object``,
        copies the mock ids of ``work``, ``manifestation`` and
        ``expression``, fills the remaining attributes with fixed
        placeholder values and finally passes ``buffer`` to
        ``self.write_to_dump_file``.
        """
        control_number = to_single_value(
            get_values_by_field(bib_object, '001'))

        # attributes for item_es_index
        # The prefixed id drops the first character of the control number.
        self.mock_es_id = str(esid.POLONA_ITEM_PREFIX + control_number[1:])
        self.expression_ids = [str(expression.mock_es_id)]
        self.item_count = 1
        self.item_local_bib_id = str(control_number)

        # 856 $u is used both as the local item id and as the item URL.
        resource_url = str(
            to_single_value(
                get_values_by_field_and_subfield(bib_object, ('856', ['u']))))
        self.item_local_id = resource_url
        self.item_mat_id = int(manifestation.mock_es_id)
        self.item_url = resource_url

        parent_work_id = work.mock_es_id
        self.item_work_id = int(parent_work_id)

        # hardcoded - always the same
        self.library = {'digital': True, 'name': 'Polona.pl', 'id': 10945}

        self.metadata_original = str(uuid4())  # some random fake uuid
        self.metadata_source = 'REFERENCE'
        self.modification_time = '2019-10-11T17:45:21.527'  # fake time
        self.phrase_suggest = ['-']
        self.suggest = ['-']
        self.work_ids = [str(parent_work_id)]

        self.write_to_dump_file(buffer)
Example #2
0
    def get_pub_country(self, bib_object, code_val_index):
        """Resolve the record's publication country codes and store them.

        Codes are taken from positions 15-17 of control field 008 (a single
        trailing blank is dropped) and from every 044 $a subfield.
        Duplicates are removed while keeping first-seen order, so the
        resolved list is the same on every run. The result of
        ``resolve_code_and_serialize`` is appended to
        ``self.mat_pub_country``.

        A record without field 008, or with an 008 too short to contain the
        country positions, contributes only its 044 $a codes instead of
        raising ``IndexError``.

        :param bib_object: record passed on to the field lookup helpers.
        :param code_val_index: index handed to ``resolve_code_and_serialize``
            together with the ``'country_dict'`` key.
        """
        collected_codes = []

        values_008 = get_values_by_field(bib_object, '008')
        if values_008:
            pub_008 = values_008[0][15:18]
            # Drop one trailing blank, as the original trimming did.
            if pub_008.endswith(' '):
                pub_008 = pub_008[:-1]
            # An empty slice means the 008 was too short; nothing to resolve.
            if pub_008:
                collected_codes.append(pub_008)

        collected_codes.extend(
            get_values_by_field_and_subfield(bib_object, ('044', ['a'])))

        # dict.fromkeys de-duplicates like a set but preserves insertion
        # order, which a plain set does not under string hash randomisation.
        country_codes = list(dict.fromkeys(collected_codes))

        self.mat_pub_country.extend(
            resolve_code_and_serialize(country_codes, 'country_dict',
                                       code_val_index))
Example #3
0
    def __init__(self, bib_object, work, manifestation, expression, buffer):
        """Create the National Library (BN) item for one record and dump it.

        Control number (001), holdings data (852 $h, $8, $c) and the value
        of field 009 used in the catalogue URL are read from ``bib_object``;
        parent ids come from ``work``, ``manifestation`` and ``expression``.
        When all attributes are set, ``buffer`` is passed to
        ``self.write_to_dump_file``.
        """
        control_number = to_single_value(
            get_values_by_field(bib_object, '001'))

        # attributes for item_es_index
        # The prefixed id drops the first character of the control number.
        self.mock_es_id = str(esid.BN_ITEM_PREFIX + control_number[1:])
        self.expression_ids = [str(expression.mock_es_id)]
        self.item_call_number = get_values_by_field_and_subfield(
            bib_object, ('852', ['h']))
        # One item is counted per 852 field found in the record.
        self.item_count = len(get_values_by_field(bib_object, '852'))
        self.item_deleted_id = []
        self.item_local_bib_id = str(control_number)

        local_ids_852_8 = get_values_by_field_and_subfield(
            bib_object, ('852', ['8']))
        self.item_local_id = postprocess(str, local_ids_852_8)

        location_852_c = to_single_value(
            get_values_by_field_and_subfield(bib_object, ('852', ['c'])))
        self.item_location = str(location_852_c)

        self.item_mat_id = int(manifestation.mock_es_id)
        self.item_source = 'DATABN'  # fake source
        self.item_status = 'false'  # fake status

        # The value of field 009 is appended to the "alma" docid prefix.
        alma_id = str(to_single_value(get_values_by_field(bib_object, "009")))
        self.item_url = (
            'https://katalogi.bn.org.pl/discovery/fulldisplay?docid=alma'
            f'{alma_id}'
            '&context=L&vid=48OMNIS_NLOP:48OMNIS_NLOP')

        parent_work_id = work.mock_es_id
        self.item_work_id = int(parent_work_id)

        # hardcoded - always the same
        self.library = {
            'digital': False,
            'name': 'Biblioteka Narodowa',
            'id': 10947,
        }

        self.metadata_original = str(uuid4())  # some random fake uuid
        self.metadata_source = 'REFERENCE'
        self.modification_time = '2019-10-11T17:45:21.527'  # fake time
        self.phrase_suggest = ['-']
        self.suggest = ['-']
        self.work_ids = [str(parent_work_id)]

        self.write_to_dump_file(buffer)
Example #4
0
def is_book_ebook_audiobook(pymarc_object):
    """Tell whether a record is a book, an e-book or an audiobook.

    Returns True only when leader positions 6-7 are ``am`` or ``im`` and
    at least one of 'Książki', 'Audiobooki', 'E-booki' is contained in the
    record's 380 $a values; otherwise returns False.
    """
    forms_380a = get_values_by_field_and_subfield(pymarc_object,
                                                  ('380', ['a']))

    # Guard clause: the leader type/level pair has to match first.
    if pymarc_object.leader[6:8] not in ('am', 'im'):
        return False

    accepted_forms = ('Książki', 'Audiobooki', 'E-booki')
    return any(form in forms_380a for form in accepted_forms)
Example #5
0
 def instantiate_polona_items(self, bib_object, work, expression, buffer):
     """Create a PolonaItem when the record's 856 data mentions 'Polonie'.

     When 856 $u/$z values exist and their single-value form contains
     'Polonie', a ``PolonaItem`` is built, its id is appended to
     ``self.item_ids`` and the digital/public-domain statistics of this
     object are updated. The new item is returned; in every other case
     the method returns None and changes nothing.
     """
     links_856_uz = get_values_by_field_and_subfield(bib_object,
                                                     ('856', ['u', 'z']))
     if not links_856_uz or 'Polonie' not in to_single_value(links_856_uz):
         return None

     # Same id recipe as PolonaItem itself: prefix + 001 minus first char.
     control_number = to_single_value(get_values_by_field(bib_object, '001'))
     polona_item_id = str(esid.POLONA_ITEM_PREFIX + control_number[1:])

     polona_item = PolonaItem(bib_object, work, self, expression, buffer)

     self.item_ids.append(int(polona_item_id))
     self.stat_item_count += polona_item.item_count
     self.stat_digital_library_count = 1
     self.stat_digital = True
     self.stat_public_domain = True

     print('Instantiated polona item!')
     return polona_item
Example #6
0
    def add(self, bib_object, work, buffer, descr_index, code_val_index):
        """Fill still-empty expression attributes and attach a manifestation.

        Each attribute (id, form, language, leader type, title, work
        references) is populated from ``bib_object`` / ``work`` only when
        it is currently falsy, so values set earlier are kept. The
        manifestation id derived from field 001 is then appended to
        ``self.materialization_ids`` and ``self.instantiate_manifestation``
        is called for the record.
        """
        if not self.mock_es_id:
            control_number = get_values_by_field(bib_object, '001')[0]
            self.mock_es_id = str(esid.EXPRESSION_PREFIX + control_number[1:])

        if not self.expr_form:
            forms_380a = get_values_by_field_and_subfield(
                bib_object, ('380', ['a']))
            resolved_forms = resolve_field_value(forms_380a, descr_index)
            self.expr_form = serialize_to_jsonl_descr(resolved_forms)

        if not self.expr_lang:
            # Language code sits at positions 35-37 of control field 008.
            language_code = get_values_by_field(bib_object, '008')[0][35:38]
            self.expr_lang = resolve_code_and_serialize(
                [language_code], 'language_dict', code_val_index)

        if not self.expr_leader_type:
            self.expr_leader_type = bib_object.leader[6]

        if not self.expr_title:
            titles_245ab = get_values_by_field_and_subfield(
                bib_object, ('245', ['a', 'b']))
            proper_titles = postprocess(truncate_title_proper, titles_245ab)
            self.expr_title = proper_titles[0]

        if not self.work_ids:
            self.work_ids = [int(work.mock_es_id)]

        if not self.expr_work:
            self.expr_work = {
                'id': int(work.mock_es_id),
                'type': 'work',
                'value': str(work.mock_es_id),
            }

        # Always executed: every added record yields one manifestation.
        manifestation_id = int(
            esid.MANIFESTATION_PREFIX +
            get_values_by_field(bib_object, '001')[0][1:])
        self.materialization_ids.append(manifestation_id)
        self.instantiate_manifestation(bib_object, work, buffer, descr_index,
                                       code_val_index)
Example #7
0
def is_single_work(pymarc_object):
    """Decide whether a record passes all five single-work checks.

    A record lacking 245 $a (or having it empty) or lacking 245 $c is
    logged at debug level as invalid and rejected. Otherwise True is
    returned only if every check below holds:

    1. 245 $a does not end with ';', neither 245 $a nor 245 $b contains
       ' ; ', and 245 $c does not contain ' / ';
    2. there is no 730 field, or exactly one that contains
       'Katalog wystawy';
    3. there are no 501, 505 or 740 fields;
    4. there are no 700 $t, 710 $t or 711 $t subfields;
    5. fewer than two 246 $i values mention 'Tyt. oryg.' or
       'Tytuł oryginału'.
    """
    # each and every record MUST have these fields, if it hasn't, it should
    # be treated as invalid and skipped
    try:
        title_a = get_values_by_field_and_subfield(pymarc_object,
                                                   ('245', ['a']))[0]
        title_a_last_char = title_a[-1]
        responsibility_c = get_values_by_field_and_subfield(
            pymarc_object, ('245', ['c']))[0]
    except IndexError:
        logging.debug('Invalid record.')
        return False

    # 245 $b is optional; a missing value is treated as an empty string.
    values_245b = get_values_by_field_and_subfield(pymarc_object,
                                                   ('245', ['b']))
    title_b = values_245b[0] if values_245b else ''

    fields_730 = get_values_by_field(pymarc_object, '730')
    fields_501 = get_values_by_field(pymarc_object, '501')
    fields_505 = get_values_by_field(pymarc_object, '505')
    fields_740 = get_values_by_field(pymarc_object, '740')
    titles_700t = get_values_by_field_and_subfield(pymarc_object,
                                                   ('700', ['t']))
    titles_710t = get_values_by_field_and_subfield(pymarc_object,
                                                   ('710', ['t']))
    titles_711t = get_values_by_field_and_subfield(pymarc_object,
                                                   ('711', ['t']))
    notes_246i = get_values_by_field_and_subfield(pymarc_object,
                                                  ('246', ['i']))

    check_1 = (title_a_last_char != ';'
               and ' ; ' not in title_a
               and ' ; ' not in title_b
               and ' / ' not in responsibility_c)
    check_2 = not fields_730 or (len(fields_730) == 1
                                 and 'Katalog wystawy' in fields_730[0])
    check_3 = not (fields_501 or fields_505 or fields_740)
    check_4 = not (titles_700t or titles_710t or titles_711t)

    original_title_notes = sum(
        1 for note in notes_246i
        if 'Tyt. oryg.' in note or 'Tytuł oryginału' in note)
    check_5 = original_title_notes < 2

    return check_1 and check_2 and check_3 and check_4 and check_5
Example #8
0
def get_titles_for_manifestation_matching(pymarc_object):
    """Return the de-duplicated title keys used for manifestation matching.

    The result is a dict with two lists:

    * ``'titles_245'`` - the space-joined 245 $a/$b text cut to 25
      characters, once from the start and once starting at the offset
      given by the second indicator of field 245;
    * ``'titles_490'`` - the distinct 490 $a values.
    """
    field_245 = pymarc_object.get_fields('245')[0]
    series_titles = get_values_by_field_and_subfield(pymarc_object,
                                                     ('490', ['a']))

    joined_title = ' '.join(field_245.get_subfields('a', 'b'))
    # Second indicator of 245 is read as the number of leading characters
    # to skip.
    skip_chars = int(field_245.indicators[1])

    unique_245 = set()
    unique_245.update((joined_title[:25], joined_title[skip_chars:25]))

    unique_490 = set()
    unique_490.update(series_titles)

    return {'titles_245': list(unique_245), 'titles_490': list(unique_490)}
Example #9
0
def get_data_for_matching(manifestation):
    """Extract the values used to match one manifestation record.

    Collects leader positions 6-7, characters 6-14 of field 008 (with '+'
    replaced by a blank), ISBNs (020 $a/$z), several forms of the 245
    title, 490 $a titles, the first word of 260 $a, the largest number in
    300 $a, the first number in 300 $c and the post-processed 250 values,
    and returns them as a ``ManifMatchData``.

    The function does not tolerate incomplete records:

    :raises IndexError: if 008, 245, 260 $a, 300 $a or 300 $c is missing,
        or 260 $a contains no word.
    :raises ValueError: if 300 $a contains no digits, or the second
        indicator of 245 is not a number.
    :raises TypeError: if 300 $c contains no digits.
    """
    ldr_67 = manifestation.leader[6:8]
    val_008_0614 = get_values_by_field(manifestation,
                                       '008')[0][6:15].replace('+', ' ')
    isbn_020_az = get_values_by_field_and_subfield(manifestation,
                                                   ('020', ['a', 'z']))
    title_245 = get_values_by_field_and_subfield(manifestation,
                                                 ('245', ['a', 'b']))[0]

    # Look the 245 field up once and join its subfields once; both
    # truncated variants are slices of the same string.
    field_245 = manifestation.get_fields('245')[0]
    joined_245 = ' '.join(field_245.get_subfields('a', 'b'))
    title_245_no_offset = joined_245[:25]
    title_245_with_offset = joined_245[int(field_245.indicators[1]):25]

    titles_490 = get_values_by_field_and_subfield(manifestation,
                                                  ('490', ['a']))

    # Raw strings: '\d' in a normal literal is an invalid escape sequence.
    numbers_from_title_245 = ''.join(re.findall(r'\d', title_245))

    place_pub_260_a_first_word = get_values_by_field_and_subfield(
        manifestation, ('260', ['a']))[0].split()[0]

    extent_300_a = get_values_by_field_and_subfield(manifestation,
                                                    ('300', ['a']))[0]
    num_of_pages_300_a = max(
        int(number) for number in re.findall(r'\d+', extent_300_a))

    dimensions_300_c = get_values_by_field_and_subfield(manifestation,
                                                        ('300', ['c']))[0]
    b_format = int(re.search(r'\d+', dimensions_300_c)[0])

    edition = postprocess(normalize_edition_for_matching,
                          get_values_by_field(manifestation, '250'))

    return ManifMatchData(
        ldr_67=ldr_67,
        val_008_0614=val_008_0614,
        isbn_020_az=isbn_020_az,
        title_245=title_245,
        title_245_no_offset=title_245_no_offset,
        title_245_with_offset=title_245_with_offset,
        titles_490=titles_490,
        numbers_from_title_245=numbers_from_title_245,
        place_pub_260_a_first_word=place_pub_260_a_first_word,
        num_of_pages_300_a=num_of_pages_300_a,
        b_format=b_format,
        edition=edition)
Example #10
0
    def __init__(self, bib_object, work, expression, buffer, descr_index,
                 code_val_index):
        """Build the manifestation document for one bibliographic record.

        Most attributes are read straight from ``bib_object`` through the
        field lookup helpers; a few are placeholders (marked ``todo``) or
        fixed values. Several attributes are first initialised to an empty
        value and then filled by a helper method called on the next line,
        so the statement order matters. At the end the BN and Polona items
        for the record are instantiated, with ``buffer`` passed on to them.

        :param bib_object: source record handed to the lookup helpers.
        :param work: parent work; its ``mock_es_id`` is stored in
            ``work_ids``.
        :param expression: parent expression; its ``mock_es_id`` is stored
            in ``expression_ids``.
        :param buffer: passed on to the item constructors.
        :param descr_index: passed to ``resolve_field_value`` and the
            publisher/contributor helpers.
        :param code_val_index: passed to ``resolve_code_and_serialize`` and
            the country/contributor helpers.
        """
        # attributes for manifestation_es_index
        # id = manifestation prefix + field 001 without its first character
        self.mock_es_id = str(
            esid.MANIFESTATION_PREFIX +
            to_single_value(get_values_by_field(bib_object, '001'))[1:])

        self.eForm = only_values(
            resolve_field_value(
                get_values_by_field_and_subfield(bib_object, ('380', ['a'])),
                descr_index))
        self.expression_ids = [int(expression.mock_es_id)]
        self.item_ids = [
        ]  # populated after instantiating all the manifestations and mak+ matching
        self.libraries = [
        ]  # populated after instantiating all the manifestations and mak+ matching
        self.mat_carrier_type = resolve_code_and_serialize(
            get_values_by_field_and_subfield(bib_object, ('338', ['b'])),
            'carrier_type_dict', code_val_index)
        self.mat_contributor = []
        self.mat_digital = False
        self.mat_edition = get_values_by_field(bib_object, '250')
        self.mat_external_id = get_values_by_field_and_subfield(
            bib_object, ('035', ['a']))
        self.mat_isbn = get_values_by_field_and_subfield(
            bib_object, ('020', ['a']))
        self.mat_matching_title = ''  # todo
        self.mat_material_type = ''  # todo
        self.mat_media_type = resolve_code_and_serialize(
            get_values_by_field_and_subfield(bib_object, ('337', ['b'])),
            'media_type_dict', code_val_index)
        self.mat_nat_bib = []  # todo
        self.mat_nlp_id = to_single_value(
            get_values_by_field(bib_object, '001'))
        self.mat_note = []  # todo
        self.mat_number_of_pages = to_single_value(
            get_values_by_field_and_subfield(bib_object, ('300', ['a'])))
        self.mat_physical_info = get_values_by_field(bib_object, '300')
        self.mat_pub_city = get_values_by_field_and_subfield(
            bib_object, ('260', ['a']))
        # get_pub_country() extends this list, so it must exist beforehand
        self.mat_pub_country = []
        self.get_pub_country(bib_object, code_val_index)
        # presumably filled by get_mat_pub_dates() below - helper not shown
        # here, TODO confirm
        self.mat_pub_date_from = None
        self.mat_pub_date_single = None
        self.mat_pub_date_to = None
        self.get_mat_pub_dates(bib_object)
        self.mat_pub_info = get_values_by_field(bib_object, '260')
        # get_publishers_all() replaces this empty list with its own result
        self.mat_publisher = []
        self.get_publishers_all(bib_object)
        # presumably filled by get_uniform_publishers() - TODO confirm
        self.mat_publisher_uniform = []
        self.get_uniform_publishers(bib_object, descr_index)
        self.mat_title_and_resp = get_values_by_field(bib_object, '245')
        self.mat_title_other_info = []  # todo
        self.mat_title_proper = to_single_value(
            postprocess(
                truncate_title_proper,
                get_values_by_field_and_subfield(bib_object,
                                                 ('245', ['a', 'b']))))
        self.mat_title_variant = get_values_by_field_and_subfield(
            bib_object, ('246', ['a', 'b']))
        self.metadata_original = str(uuid4())
        self.metadata_source = 'REFERENCE'
        self.modificationTime = "2019-10-01T13:34:23.580"
        self.phrase_suggest = [self.mat_title_proper]  # todo
        self.popularity_join = "owner"
        # statistics start at "nothing held"; instantiate_polona_items()
        # below updates the digital/item counters when it creates an item
        self.stat_digital = False
        self.stat_digital_library_count = 0
        self.stat_item_count = 0
        self.stat_library_count = 0
        self.stat_public_domain = False
        self.suggest = [self.mat_title_proper]  # todo
        # presumably filled by get_work_creators() / get_mat_contributors()
        # (mat_contributor was initialised above) - TODO confirm
        self.work_creator = []
        self.work_creators = []
        self.get_work_creators(work)
        self.get_mat_contributors(bib_object, code_val_index, descr_index)
        self.work_ids = [int(work.mock_es_id)]

        # Items are created last because instantiation appends to
        # self.item_ids and changes the stat_* attributes set above.
        self.bn_items = [
            self.instantiate_bn_items(bib_object, work, expression, buffer)
        ]
        # NOTE: instantiate_polona_items() returns None when the record has
        # no matching 856 link, so this list can be [None].
        self.polona_items = [
            self.instantiate_polona_items(bib_object, work, expression, buffer)
        ]
        self.mak_items = {}
Example #11
0
    def get_publishers_all(self, bib_object):
        """Set ``self.mat_publisher`` from the record's 260 $b values.

        The values are run through ``postprocess`` with
        ``normalize_publisher`` and the result replaces whatever
        ``self.mat_publisher`` held before.
        """
        self.mat_publisher = postprocess(
            normalize_publisher,
            get_values_by_field_and_subfield(bib_object, ('260', ['b'])))