def get_mat_pub_dates(self, bib_object):
    """Populate publication dates from MARC field 008.

    008/06 (type of date) selects the mode:
      - 'r', 's', 'p', 't': a single year from 008/07-10 -> mat_pub_date_single
      - anything else: a range, 008/07-10 -> mat_pub_date_from and
        008/11-14 -> mat_pub_date_to

    Unparseable dates are silently left as-is (best-effort, matches the
    original behavior).
    """
    def _to_year(raw):
        # 'u' (unknown digit), ' ' and 'X' placeholders are treated as 0,
        # e.g. '19uu' -> 1900. Raises ValueError if non-placeholder junk remains.
        return int(raw.replace('u', '0').replace(' ', '0').replace('X', '0'))

    v_008 = get_values_by_field(bib_object, '008')[0]
    if v_008[6] in ['r', 's', 'p', 't']:
        try:
            self.mat_pub_date_single = _to_year(v_008[7:11])
        except ValueError:
            pass
    else:
        try:
            self.mat_pub_date_from = _to_year(v_008[7:11])
            self.mat_pub_date_to = _to_year(v_008[11:15])
        except ValueError:
            pass
def __init__(self, bib_object, work, manifestation, expression, buffer):
    """Build a Polona digital-item ES document from a bib record and dump it."""
    # hoist the repeated field lookups into locals
    nlp_id = to_single_value(get_values_by_field(bib_object, '001'))
    url_856_u = to_single_value(
        get_values_by_field_and_subfield(bib_object, ('856', ['u'])))

    # attributes for item_es_index
    self.mock_es_id = str(esid.POLONA_ITEM_PREFIX + nlp_id[1:])
    self.expression_ids = [str(expression.mock_es_id)]
    self.item_count = 1
    self.item_local_bib_id = str(nlp_id)
    self.item_local_id = str(url_856_u)
    self.item_mat_id = int(manifestation.mock_es_id)
    self.item_url = str(url_856_u)
    self.item_work_id = int(work.mock_es_id)
    # hardcoded - always the same library
    self.library = {'digital': True, 'name': 'Polona.pl', 'id': 10945}
    self.metadata_original = str(uuid4())  # some random fake uuid
    self.metadata_source = 'REFERENCE'
    self.modification_time = '2019-10-11T17:45:21.527'  # fake time
    self.phrase_suggest = ['-']
    self.suggest = ['-']
    self.work_ids = [str(work.mock_es_id)]

    self.write_to_dump_file(buffer)
def get_data_for_matching(manifestation):
    """Extract the attributes used to match a manifestation against MAK+ records.

    Pulls matching keys from the leader and fields 008, 020, 245, 246/490,
    250, 260 and 300 of a pymarc record and returns them as a ManifMatchData.

    Raises IndexError/ValueError/TypeError on records missing the required
    fields or digits; callers are expected to catch these.
    """
    ldr_67 = manifestation.leader[6:8]
    val_008_0614 = get_values_by_field(manifestation,
                                       '008')[0][6:15].replace('+', ' ')
    isbn_020_az = get_values_by_field_and_subfield(manifestation,
                                                   ('020', ['a', 'z']))
    title_245 = get_values_by_field_and_subfield(manifestation,
                                                 ('245', ['a', 'b']))[0]

    # join $a and $b once and derive both title variants from the same string
    field_245 = manifestation.get_fields('245')[0]
    joined_title_245 = ' '.join(field_245.get_subfields('a', 'b'))
    title_245_no_offset = joined_title_245[:25]
    # the second indicator holds the number of leading (article) characters to skip
    title_245_with_offset = joined_title_245[int(field_245.indicators[1]):25]

    titles_490 = get_values_by_field_and_subfield(manifestation, ('490', ['a']))
    # raw strings for all regex patterns (plain '\d' is an invalid escape)
    numbers_from_title_245 = ''.join(re.findall(r'\d', title_245))
    place_pub_260_a_first_word = get_values_by_field_and_subfield(
        manifestation, ('260', ['a']))[0].split()[0]
    num_of_pages_300_a = max(
        int(gr) for gr in re.findall(
            r'\d+',
            get_values_by_field_and_subfield(manifestation, ('300', ['a']))[0]))
    # NOTE(review): re.search returns None when 300 $c has no digits, which
    # raises TypeError here — callers catch TypeError, so behavior is kept.
    b_format = int(
        re.search(
            r'\d+',
            get_values_by_field_and_subfield(manifestation,
                                             ('300', ['c']))[0])[0])
    edition = postprocess(normalize_edition_for_matching,
                          get_values_by_field(manifestation, '250'))

    return ManifMatchData(ldr_67=ldr_67,
                          val_008_0614=val_008_0614,
                          isbn_020_az=isbn_020_az,
                          title_245=title_245,
                          title_245_no_offset=title_245_no_offset,
                          title_245_with_offset=title_245_with_offset,
                          titles_490=titles_490,
                          numbers_from_title_245=numbers_from_title_245,
                          place_pub_260_a_first_word=place_pub_260_a_first_word,
                          num_of_pages_300_a=num_of_pages_300_a,
                          b_format=b_format,
                          edition=edition)
def is_single_work(pymarc_object):
    """Heuristically decide whether a bib record describes exactly one work.

    Applies rules 2.1.1.1 - 2.1.1.5 (multi-work separators in 245, uniform
    titles in 730, contents/with notes in 501/505/740, analytic titles in
    7XX $t, and multiple original-title notes in 246 $i).

    Returns False for records missing the mandatory 245 $a / $c fields.
    """
    # each and every record MUST have these fields; if it hasn't, it should
    # be treated as invalid and skipped
    try:
        val_245a = get_values_by_field_and_subfield(pymarc_object,
                                                    ('245', ['a']))[0]
        val_245a_last_char = val_245a[-1]
        val_245c = get_values_by_field_and_subfield(pymarc_object,
                                                    ('245', ['c']))[0]
    except IndexError:
        logging.debug('Invalid record.')
        return False

    list_val_245b = get_values_by_field_and_subfield(pymarc_object,
                                                     ('245', ['b']))
    val_245b = list_val_245b[0] if list_val_245b else ''
    list_val_730 = get_values_by_field(pymarc_object, '730')
    list_val_501 = get_values_by_field(pymarc_object, '501')
    list_val_505 = get_values_by_field(pymarc_object, '505')
    list_val_740 = get_values_by_field(pymarc_object, '740')
    list_val_700t = get_values_by_field_and_subfield(pymarc_object,
                                                     ('700', ['t']))
    list_val_710t = get_values_by_field_and_subfield(pymarc_object,
                                                     ('710', ['t']))
    list_val_711t = get_values_by_field_and_subfield(pymarc_object,
                                                     ('711', ['t']))
    list_val_246i = get_values_by_field_and_subfield(pymarc_object,
                                                     ('246', ['i']))

    # 2.1.1.1: no multi-work separators in the title/responsibility statement
    is_2_1_1_1 = (val_245a_last_char != ';' and ' ; ' not in val_245a
                  and ' ; ' not in val_245b and ' / ' not in val_245c)
    # 2.1.1.2: no 730 uniform titles (a single exhibition-catalogue entry is allowed)
    is_2_1_1_2 = (not list_val_730
                  or (len(list_val_730) == 1
                      and 'Katalog wystawy' in list_val_730[0]))
    # 2.1.1.3: no with/contents/analytic-title notes
    is_2_1_1_3 = not (list_val_501 or list_val_505 or list_val_740)
    # 2.1.1.4: no analytic titles in added entries
    is_2_1_1_4 = not (list_val_700t or list_val_710t or list_val_711t)
    # 2.1.1.5: fewer than two original-title notes
    original_title_notes = [
        x for x in list_val_246i
        if 'Tyt. oryg.' in x or 'Tytuł oryginału' in x
    ]
    is_2_1_1_5 = len(original_title_notes) < 2

    return (is_2_1_1_1 and is_2_1_1_2 and is_2_1_1_3 and is_2_1_1_4
            and is_2_1_1_5)
def instantiate_bn_items(self, bib_object, work, expression, buffer):
    """Create a BnItem for this manifestation when the record has 852 holdings.

    Updates item_ids / stat_item_count and returns the new BnItem, or None
    when the record carries no 852 fields.
    """
    # guard clause: no holdings -> no BN item
    if not bib_object.get_fields('852'):
        return None

    nlp_id = to_single_value(get_values_by_field(bib_object, '001'))
    item_es_id = str(esid.BN_ITEM_PREFIX + nlp_id[1:])
    bn_item = BnItem(bib_object, work, self, expression, buffer)
    self.item_ids.append(int(item_es_id))
    self.stat_item_count += bn_item.item_count
    return bn_item
def get_pub_country(self, bib_object, code_val_index):
    """Collect publication-country codes from 008/15-17 and 044 $a.

    Resolves the deduplicated codes against 'country_dict' and appends the
    serialized results to mat_pub_country.
    """
    code_008 = get_values_by_field(bib_object, '008')[0][15:18]
    # 008 country codes are space-padded to 3 chars; drop one trailing space
    if code_008[-1] == ' ':
        code_008 = code_008[:-1]

    # set deduplicates the 008 code against any 044 $a repeats
    country_codes = {code_008}
    country_codes.update(
        get_values_by_field_and_subfield(bib_object, ('044', ['a'])))

    serialized = resolve_code_and_serialize(list(country_codes),
                                            'country_dict', code_val_index)
    self.mat_pub_country.extend(serialized)
def __init__(self, bib_object, work, manifestation, expression, buffer):
    """Build a BN (Biblioteka Narodowa) item ES document and dump it."""
    # hoist the repeated control-field lookups into locals
    nlp_id = to_single_value(get_values_by_field(bib_object, '001'))
    alma_id = str(to_single_value(get_values_by_field(bib_object, "009")))

    # attributes for item_es_index
    self.mock_es_id = str(esid.BN_ITEM_PREFIX + nlp_id[1:])
    self.expression_ids = [str(expression.mock_es_id)]
    self.item_call_number = get_values_by_field_and_subfield(
        bib_object, ('852', ['h']))
    self.item_count = len(get_values_by_field(bib_object, '852'))
    self.item_deleted_id = []
    self.item_local_bib_id = str(nlp_id)
    self.item_local_id = postprocess(
        str, get_values_by_field_and_subfield(bib_object, ('852', ['8'])))
    self.item_location = str(
        to_single_value(
            get_values_by_field_and_subfield(bib_object, ('852', ['c']))))
    self.item_mat_id = int(manifestation.mock_es_id)
    self.item_source = 'DATABN'  # fake source
    self.item_status = 'false'  # fake status
    self.item_url = (
        f'https://katalogi.bn.org.pl/discovery/fulldisplay?docid=alma'
        f'{alma_id}'
        f'&context=L&vid=48OMNIS_NLOP:48OMNIS_NLOP')
    self.item_work_id = int(work.mock_es_id)
    # hardcoded - always the same library
    self.library = {'digital': False, 'name': 'Biblioteka Narodowa',
                    'id': 10947}
    self.metadata_original = str(uuid4())  # some random fake uuid
    self.metadata_source = 'REFERENCE'
    self.modification_time = '2019-10-11T17:45:21.527'  # fake time
    self.phrase_suggest = ['-']
    self.suggest = ['-']
    self.work_ids = [str(work.mock_es_id)]

    self.write_to_dump_file(buffer)
def instantiate_polona_items(self, bib_object, work, expression, buffer):
    """Create a PolonaItem when the record's 856 $u/$z points at Polona.

    Marks the manifestation as digital / public-domain, updates item stats
    and returns the new PolonaItem, or None when there is no Polona link.
    """
    list_856_uz = get_values_by_field_and_subfield(bib_object,
                                                   ('856', ['u', 'z']))
    if list_856_uz and 'Polonie' in to_single_value(list_856_uz):
        i_mock_es_id = str(
            esid.POLONA_ITEM_PREFIX +
            to_single_value(get_values_by_field(bib_object, '001'))[1:])
        i = PolonaItem(bib_object, work, self, expression, buffer)
        self.item_ids.append(int(i_mock_es_id))
        self.stat_item_count += i.item_count
        self.stat_digital_library_count = 1
        self.stat_digital = True
        self.stat_public_domain = True
        # fixed: was a bare print() to stdout; use the logger like the rest
        # of the pipeline
        logging.debug('Instantiated polona item!')
        return i
def add(self, bib_object, work, buffer, descr_index, code_val_index):
    """Upsert this expression's attributes from a bib record.

    Fills in any still-empty attribute, registers the manifestation id and
    instantiates the corresponding manifestation.
    """
    raw_001 = get_values_by_field(bib_object, '001')[0]

    if not self.mock_es_id:
        self.mock_es_id = str(esid.EXPRESSION_PREFIX + raw_001[1:])

    if not self.expr_form:
        form_values = get_values_by_field_and_subfield(bib_object,
                                                       ('380', ['a']))
        self.expr_form = serialize_to_jsonl_descr(
            resolve_field_value(form_values, descr_index))

    if not self.expr_lang:
        # language code lives in 008/35-37
        lang_code = get_values_by_field(bib_object, '008')[0][35:38]
        self.expr_lang = resolve_code_and_serialize([lang_code],
                                                    'language_dict',
                                                    code_val_index)

    if not self.expr_leader_type:
        self.expr_leader_type = bib_object.leader[6]

    if not self.expr_title:
        titles_245 = get_values_by_field_and_subfield(bib_object,
                                                      ('245', ['a', 'b']))
        self.expr_title = postprocess(truncate_title_proper, titles_245)[0]

    if not self.work_ids:
        self.work_ids = [int(work.mock_es_id)]

    if not self.expr_work:
        self.expr_work = {
            'id': int(work.mock_es_id),
            'type': 'work',
            'value': str(work.mock_es_id)
        }

    self.materialization_ids.append(
        int(esid.MANIFESTATION_PREFIX + raw_001[1:]))
    self.instantiate_manifestation(bib_object, work, buffer, descr_index,
                                   code_val_index)
def has_items(pymarc_object):
    """Return True when the record carries at least one 852 (holdings) field."""
    # bool() instead of the `True if ... else False` anti-idiom
    return bool(get_values_by_field(pymarc_object, '852'))
def main_loop(configuration: dict):
    """Run the whole FRBR-ization pipeline over a BN MARC dump.

    Stages:
      1. build code/value, descriptor and institution indexes;
      2. first loop: filter bib records, build stub works, match/merge works
         and index manifestations by nlp id and by 245/490 titles;
      3. optional second merge pass using broader context;
      4. convert works (instantiating expressions, manifestations, BN items);
      5. optional MAK+ manifestation matching (adds MakItems from AVA fields);
      6. final loop: assign item ids, resolve libraries, aggregate counts and
         dump works/expressions/manifestations/items to the output buffer.

    NOTE(review): `buff` used below is not defined in this function —
    presumably a module-level buffer (configuration['buffer'] is used in
    stage 4); verify before refactoring.
    """
    indexed_works_by_uuid = {}
    indexed_works_by_titles = {}
    indexed_works_by_mat_nlp_id = {}
    indexed_manifestations_bn_by_nlp_id = {}
    indexed_manifestations_bn_by_titles_245 = {}
    indexed_manifestations_bn_by_titles_490 = {}

    # prepare indexes
    logging.info('Indexing institutions...')
    indexed_libs_by_mak_id, indexed_libs_by_es_id = create_lib_indexes(
        configuration['inst_file_in'])
    logging.info('DONE!')
    logging.info('Indexing codes and values...')
    indexed_code_values = code_value_indexer(configuration['code_val_file_in'])
    logging.info('DONE!')
    logging.info('Indexing descriptors...')
    indexed_descriptors = index_descriptors(configuration['descr_files_in'])
    logging.info('DONE!')

    # start main loop - iterate through all bib records (only books) from BN
    logging.info('Starting main loop...')
    logging.info('FRBRrization step one in progress (first loop)...')

    # used for limit and stats
    counter = 0

    for bib in tqdm(read_marc_from_file(configuration['bn_file_in'])):
        if is_book_ebook_audiobook(bib) and is_single_work(bib) and has_items(
                bib) and is_245_indicator_2_valid(bib):
            # NOTE(review): the limit check only fires for records that pass
            # the filters above
            if counter > configuration['limit']:
                break
            try:
                bib = resolve_record(bib, indexed_descriptors)
            except DescriptorNotResolved as error:
                logging.debug(error)
                continue

            # create stub work and get from manifestation data needed for
            # work matching
            work = Work()
            work.get_manifestation_bn_id(bib)
            work.get_main_creator(bib, indexed_descriptors)
            work.get_other_creator(bib, indexed_descriptors)
            work.get_titles(bib)
            counter += 1

            # try to match with existing work (and if there is a match:
            # merge to one work and index by all titles);
            # if there is no match, index new work by titles and by uuid
            work.match_with_existing_work_and_index(indexed_works_by_uuid,
                                                    indexed_works_by_titles)

            # index original bib record by bn_id - fast lookup for conversion
            # and manifestation matching
            indexed_manifestations_bn_by_nlp_id.setdefault(
                get_values_by_field(bib, '001')[0], bib.as_marc())

            # index manifestation for matching with mak+ by 245 titles and
            # 490 titles
            titles_for_manif_match = get_titles_for_manifestation_matching(bib)

            for title in titles_for_manif_match.get('titles_245'):
                indexed_manifestations_bn_by_titles_245.setdefault(
                    title, set()).add(get_values_by_field(bib, '001')[0])
            for title in titles_for_manif_match.get('titles_490'):
                indexed_manifestations_bn_by_titles_490.setdefault(
                    title, set()).add(get_values_by_field(bib, '001')[0])

    logging.info('DONE!')

    if configuration['frbr_step_two']:
        logging.info(
            'FRBRrization step two - trying to merge works using broader context (second loop)...'
        )
        for work_uuid, indexed_work in tqdm(indexed_works_by_uuid.items()):
            # check if work exists, it could've been set to None earlier in
            # case of merging more than one work at a time
            if indexed_work:
                result = indexed_work.try_to_merge_possible_duplicates_using_broader_context(
                    indexed_works_by_uuid, indexed_works_by_titles)
                if result:
                    # merged away - tombstone the entry
                    indexed_works_by_uuid[work_uuid] = None
        logging.info('DONE!')

    logging.info('Conversion in progress...')
    for work_uuid, indexed_work in tqdm(indexed_works_by_uuid.items()):
        # do conversion, upsert expressions and instantiate manifestations
        # and BN items
        if indexed_work:
            # NOTE(review): debug print left in - consider logging.debug
            print(indexed_work.titles245)
            indexed_work.convert_to_work(indexed_manifestations_bn_by_nlp_id,
                                         configuration['buffer'],
                                         indexed_descriptors,
                                         indexed_code_values)
            logging.debug(f'\n{indexed_work.mock_es_id}')
            for expression in indexed_work.expressions_dict.values():
                logging.debug(f' {expression}')
                for manifestation in expression.manifestations:
                    # index works by manifestations nlp id for inserting
                    # MAK+ items
                    indexed_works_by_mat_nlp_id.setdefault(
                        manifestation.mat_nlp_id, indexed_work.uuid)
                    logging.debug(f' {manifestation}')
                    for i in manifestation.bn_items:
                        logging.debug(f' {i}')
    logging.info('DONE!')

    if configuration['run_manif_matcher']:
        logging.info('MAK+ manifestation matching in progress...')
        list_of_files = os.listdir(configuration['mak_files_in'])

        # iterate through marcxml MAK+ files
        for file_num, filename in enumerate(list_of_files, start=1):
            if file_num > configuration['limit_mak']:
                break
            else:
                path_file = os.sep.join(
                    [configuration['mak_files_in'], filename])
                logging.info(
                    f'Parsing MAK+ file nr {file_num} - (unknown)...')
                parsed_xml = parse_xml_to_array(path_file)

                # iterate through parsed records (pymarc Records objects)
                for r in parsed_xml:
                    # check if it is not None - there are some problems with
                    # parsing
                    if r:
                        # try to match with BN manifestation
                        try:
                            match = match_manifestation(
                                r,
                                index_245=
                                indexed_manifestations_bn_by_titles_245,
                                index_490=
                                indexed_manifestations_bn_by_titles_490,
                                index_id=indexed_manifestations_bn_by_nlp_id)
                        except (IndexError, ValueError, TypeError) as error:
                            # unmatched/malformed records are skipped silently
                            continue

                        if match:
                            list_ava = r.get_fields('AVA')
                            w_uuid = indexed_works_by_mat_nlp_id.get(match)
                            ref_to_work = indexed_works_by_uuid.get(w_uuid)

                            # this is definitely not a best way to do it:
                            # linear scan of all expressions/manifestations
                            # for the matched nlp id
                            if ref_to_work:
                                for e in ref_to_work.expressions_dict.values():
                                    for m in e.manifestations:
                                        if m.mat_nlp_id == match:
                                            logging.debug(
                                                'Adding mak_items...')
                                            item_counter = 0
                                            item_add_counter = 0

                                            for num, ava in enumerate(list_ava,
                                                                      start=1):
                                                try:
                                                    it_to_add = MakItem(
                                                        ava,
                                                        indexed_libs_by_mak_id,
                                                        ref_to_work, e, m,
                                                        buff, num)
                                                    # new local bib id: insert;
                                                    # otherwise merge counts
                                                    # into the existing item
                                                    if it_to_add.item_local_bib_id not in m.mak_items:
                                                        logging.debug(
                                                            f'Added new mak_item - {num}'
                                                        )
                                                        m.mak_items.setdefault(
                                                            it_to_add.
                                                            item_local_bib_id,
                                                            it_to_add)
                                                        item_counter += 1
                                                    else:
                                                        existing_it = m.mak_items.get(
                                                            it_to_add.
                                                            item_local_bib_id)
                                                        existing_it.add(
                                                            it_to_add)
                                                        logging.debug(
                                                            f'Increased item_count in existing mak_item - {num}.'
                                                        )
                                                        item_add_counter += 1
                                                except AttributeError as error:
                                                    logging.debug(error)
                                                    continue

                                            logging.debug(
                                                f'Added {item_counter} new mak_items, increased count {item_add_counter} times.'
                                            )
        logging.info('DONE!')

    # loop for:
    # - adding mak items mock_es_ids
    # - serializing and writing mak items to json file
    # - getting libraries for manifestation
    # - getting mak item ids and count for manifestation
    # - serializing and writing manifestations to json file
    # - getting mak item ids and count for expression
    # - serializing and writing expressions to json file
    # - getting mak item ids and count, manifestation ids and count,
    #   expression ids and count for work
    # - serializing and writing works to json file
    for indexed_work in tqdm(indexed_works_by_uuid.values()):
        if indexed_work:
            logging.debug(f'\n{indexed_work.mock_es_id}')
            for expression in indexed_work.expressions_dict.values():
                logging.debug(f'  {expression}')
                for manifestation in expression.manifestations:
                    for num, item in enumerate(
                            manifestation.mak_items.values(), start=1):
                        # mak item ids are derived from the manifestation id
                        item.mock_es_id = f'{str(num)}{str(manifestation.mock_es_id)}'
                        item.write_to_dump_file(buff)
                    manifestation.get_resolve_and_serialize_libraries(
                        indexed_libs_by_es_id)
                    manifestation.get_mak_item_ids()
                    manifestation.write_to_dump_file(buff)
                    logging.debug(f'    {manifestation}')
                expression.get_item_ids_item_count_and_libraries()
                expression.write_to_dump_file(buff)
            indexed_work.get_expr_manif_item_ids_and_counts()
            indexed_work.write_to_dump_file(buff)

    logging.debug(indexed_works_by_uuid)
    logging.debug(indexed_works_by_titles)
    logging.debug(indexed_manifestations_bn_by_nlp_id)
    logging.debug(indexed_manifestations_bn_by_titles_245)
    logging.debug(indexed_manifestations_bn_by_titles_490)
def __init__(self, bib_object, work, expression, buffer, descr_index,
             code_val_index):
    """Build a manifestation ES document from a BN bib record.

    Maps MARC fields to manifestation attributes, resolves coded values
    (carrier/media/country) and descriptors, then instantiates the related
    BN and Polona items. Statement order matters: the instantiate_* calls at
    the bottom read stats/ids set above.
    """
    # attributes for manifestation_es_index
    self.mock_es_id = str(
        esid.MANIFESTATION_PREFIX +
        to_single_value(get_values_by_field(bib_object, '001'))[1:])
    self.eForm = only_values(
        resolve_field_value(
            get_values_by_field_and_subfield(bib_object, ('380', ['a'])),
            descr_index))
    self.expression_ids = [int(expression.mock_es_id)]
    # populated after instantiating all the manifestations and mak+ matching
    self.item_ids = []
    # populated after instantiating all the manifestations and mak+ matching
    self.libraries = []
    self.mat_carrier_type = resolve_code_and_serialize(
        get_values_by_field_and_subfield(bib_object, ('338', ['b'])),
        'carrier_type_dict', code_val_index)
    self.mat_contributor = []
    self.mat_digital = False
    self.mat_edition = get_values_by_field(bib_object, '250')
    self.mat_external_id = get_values_by_field_and_subfield(
        bib_object, ('035', ['a']))
    self.mat_isbn = get_values_by_field_and_subfield(
        bib_object, ('020', ['a']))
    self.mat_matching_title = ''  # todo
    self.mat_material_type = ''  # todo
    self.mat_media_type = resolve_code_and_serialize(
        get_values_by_field_and_subfield(bib_object, ('337', ['b'])),
        'media_type_dict', code_val_index)
    self.mat_nat_bib = []  # todo
    self.mat_nlp_id = to_single_value(
        get_values_by_field(bib_object, '001'))
    self.mat_note = []  # todo
    self.mat_number_of_pages = to_single_value(
        get_values_by_field_and_subfield(bib_object, ('300', ['a'])))
    self.mat_physical_info = get_values_by_field(bib_object, '300')
    self.mat_pub_city = get_values_by_field_and_subfield(
        bib_object, ('260', ['a']))
    self.mat_pub_country = []
    # fills mat_pub_country from 008/15-17 and 044 $a
    self.get_pub_country(bib_object, code_val_index)
    self.mat_pub_date_from = None
    self.mat_pub_date_single = None
    self.mat_pub_date_to = None
    # fills the three date attributes above from field 008
    self.get_mat_pub_dates(bib_object)
    self.mat_pub_info = get_values_by_field(bib_object, '260')
    self.mat_publisher = []
    self.get_publishers_all(bib_object)
    self.mat_publisher_uniform = []
    self.get_uniform_publishers(bib_object, descr_index)
    self.mat_title_and_resp = get_values_by_field(bib_object, '245')
    self.mat_title_other_info = []  # todo
    self.mat_title_proper = to_single_value(
        postprocess(
            truncate_title_proper,
            get_values_by_field_and_subfield(bib_object, ('245', ['a',
                                                                  'b']))))
    self.mat_title_variant = get_values_by_field_and_subfield(
        bib_object, ('246', ['a', 'b']))
    self.metadata_original = str(uuid4())  # some random fake uuid
    self.metadata_source = 'REFERENCE'
    self.modificationTime = "2019-10-01T13:34:23.580"  # fake time
    self.phrase_suggest = [self.mat_title_proper]  # todo
    self.popularity_join = "owner"
    self.stat_digital = False
    self.stat_digital_library_count = 0
    self.stat_item_count = 0
    self.stat_library_count = 0
    self.stat_public_domain = False
    self.suggest = [self.mat_title_proper]  # todo
    self.work_creator = []
    self.work_creators = []
    self.get_work_creators(work)
    self.get_mat_contributors(bib_object, code_val_index, descr_index)
    self.work_ids = [int(work.mock_es_id)]
    # NOTE(review): instantiate_bn_items / instantiate_polona_items return
    # None when the record has no 852 / Polona 856 fields, so these lists
    # may contain a single None - verify consumers tolerate that.
    self.bn_items = [
        self.instantiate_bn_items(bib_object, work, expression, buffer)
    ]
    self.polona_items = [
        self.instantiate_polona_items(bib_object, work, expression, buffer)
    ]
    self.mak_items = {}