def main():
    """Extract MMS IDs (MARC 001) and call numbers (MARC 093) from a
    collection's Rosetta XML file and save them as an Excel sheet in the
    collection's custom04 folder.

    Side effects: writes ``<collection_id>_alma_sysno.xlsx`` under
    ``collection.aleph_custom04_path`` and logs progress. No return value.
    """
    collection = Collection.retrieve_collection()
    # Initialize the logging file for that collection BEFORE fetching the
    # module logger, matching the other entry points in this file.
    initialize_logger(collection.branch, collection.collection_id)
    logger = logging.getLogger(__name__)
    # Parse the Rosetta XML export for this collection (a minidom Document,
    # not a path).
    rosetta_doc = minidom.parse(
        lookup_rosetta_file(collection.digitization_path,
                            collection.collection_id))
    logger.info(
        "Extracting 001 (MMSID) and 093 (*)Call number) and turning into Dataframe"
    )
    mmsid_dict = create_mmsid_dict(rosetta_doc)
    # Rows are keyed by MMS ID; force a string index so Excel does not
    # render the IDs as floats.
    df = pd.DataFrame(mmsid_dict).transpose()
    df.index = df.index.astype(str)
    df.index.name = "mmsid"
    file_full_path = collection.aleph_custom04_path / (
        collection.collection_id + "_alma_sysno.xlsx")
    check_custom04_file(file_full_path)
    logger.info("Saving Dataframe to Excel file in the custom04 folder")
    df.to_excel(file_full_path)
def main():
    """Preprocess a collection catalog end-to-end (preprocess-1 pipeline).

    Retrieves the collection to work on, normalizes and validates the
    catalog (headers, mandatory columns, levels, UNITIDs, controlled
    vocabularies, dates, creators), produces authority reports and match
    files, creates the final file, attaches Alma MMS IDs and uploads the
    result to Google Sheets. Logs each stage and the total runtime.

    Side effects: file and Google-Sheets I/O plus extensive logging; no
    return value.
    """
    start_time = timeit.default_timer()
    # get branch and collection ID to work on and create a Collection instance
    # CMS, branch, collection_id = get_branch_colletionID()
    # collection = Collection(CMS, branch, collection_id)
    collection = retrieve_collection()
    # initialize logger for the logging file for that collection
    initialize_logger(collection.branch, collection.collection_id)
    logger = logging.getLogger(__name__)
    logger.info(
        f"\n Starting new preprocess of {collection.collection_id}, at: {datetime.now()}"
    )
    logger.info(
        f"[HEADERS] Dropping columns not in mapping for {collection.collection_id} Catalog, at: {datetime.now()}"
    )
    # Merge the collection-level and catalog-level field mappers; columns
    # covered by neither mapping are dropped from the catalog.
    field_mappers = {}
    field_mappers.update(collection_field_mapper)
    field_mappers.update(catalog_field_mapper)
    collection.full_catalog = drop_cols_not_in_mapping(collection.full_catalog,
                                                       field_mappers)
    collection = clean_tables(collection)
    if hasattr(collection, "full_catalog"):
        collection.full_catalog = clean_catalog(collection.full_catalog)
    logger.info(
        f"[HEADERS] Checking that mandatory columns exists in table for {collection.collection_id}(full_catalog)."
    )
    # Pick the mandatory-columns check matching the catalog's schema:
    # presumably v2 schemas carry FIRST_CREATOR_PERS and v1 schemas carry
    # COMBINED_CREATORS — TODO confirm against the check_* helpers.
    if "FIRST_CREATOR_PERS" in list(collection.full_catalog.columns):
        check_mandatory_cols_v2(collection.full_catalog.reset_index())
    elif "COMBINED_CREATORS" in list(collection.full_catalog.columns):
        check_mandatory_cols_v1(collection.full_catalog.reset_index())
    elif "ADD_CREATORS" in list(collection.full_catalog.columns):
        collection.full_catalog = split_creators_by_type(
            collection.full_catalog, "ADD_CREATORS")
    logger.info(
        f"[LEVEL] Mapping Level values of {collection.collection_id} from hebrew to english."
    )
    collection.full_catalog = map_level_to_eng(collection.full_catalog)
    logger.info(
        f"[UNITID] checking for duplicate values, at: {datetime.now()}")
    collection.full_catalog = check_unitid(collection.full_catalog)
    # The pipeline below relies on the catalog being indexed by UNITID.
    assert collection.full_catalog.index.name == "UNITID"
    collection.full_catalog = create_ROOT_id(collection.full_catalog)
    check_missing_rootids(collection)
    logger.info(f"[ACCESSRESTRICT] cheecking columns values")
    collection.full_catalog = check_values_against_cvoc(
        collection.full_catalog,
        "ACCESSRESTRICT",
        Authority_instance.privacy_search_dict,
    )
    logger.info(
        f"[ACCESSRESTRICT] add default value for ACCESSRESTIRCT field for missing values"
    )
    collection.full_catalog = fill_default_ACCESSRESTIRCT(
        collection.full_catalog)
    logger.info(f"[PUBLICATION_COUNTRY] cheecking columns values")
    collection.full_catalog = check_values_against_cvoc(
        collection.full_catalog, "PUBLICATION_COUNTRY",
        Authority_instance.countries_mapping_dict)
    if "TO_DELETE" in list(collection.full_catalog.columns):
        logger.info(
            "[TO_DELETE] Changing the ROOTID to collectionID for records which are about to be deleted"
        )
        # "כן" means "yes": rows flagged for deletion are re-parented to
        # the collection root.
        collection.full_catalog.loc[collection.full_catalog["TO_DELETE"] ==
                                    "כן", "ROOTID"] = collection.collection_id
    logger.info("[UNITITLE] Cleaning records title")
    collection.full_catalog = clean_record_title(collection.full_catalog)
    logger.info("[DATES] Adding normal date to Section Record")
    collection.full_catalog = add_normal_dates_to_section_record(
        collection.full_catalog, collection.collection_id)
    # Date validation runs only when the sheet name marks a "final" copy —
    # presumably drafts may still hold invalid dates; TODO confirm.
    if "final" in collection.google_sheet_file_name.lower():
        logger.info("[DATES] Validating dates")
        collection.full_catalog = check_date_columns(collection.full_catalog)
    # Strip a trailing ".0" (spreadsheet float artifact) before applying
    # the date-format normalizer.
    logger.info("[DATES] cleaning dates - start date")
    collection.full_catalog["DATE_START"] = (
        collection.full_catalog["DATE_START"].astype(str).replace(
            r"\.0$", "", regex=True).apply(clean_date_format))
    logger.info("[DATES] cleaning dates - end date")
    collection.full_catalog["DATE_END"] = (
        collection.full_catalog["DATE_END"].astype(str).replace(
            r"\.0$", "", regex=True).apply(clean_date_format))
    logger.info(
        f"[COMBINED_CREATORS] CREATING COMBINED CREATORS for {collection.collection_id} , at: {datetime.now()}"
    )
    collection = clean_creators(collection)
    logger.info(
        f"[COMBINED_CREATORS] Splitting COMBINED_CREATORS into COMBINED_CREATORS_PERS"
        f"and COMBINED_CREATORS_CORPS columns according to roles")
    collection.full_catalog = split_creators_by_type(collection.full_catalog,
                                                     "COMBINED_CREATORS")
    # Authority reports for persons, corporate bodies, places and works.
    collection = create_authorities_report(collection, "PERS")
    collection = create_authorities_report(collection, "CORPS")
    collection = create_authorities_report(collection, "GEO")
    collection = create_authorities_report(collection, "WORKS")
    logger.info(
        f"[ARCHIVAL_MATERIAL] Starting to work on ARCHIVAL_MATERIAL column")
    collection.full_catalog = check_values_against_cvoc(
        collection.full_catalog,
        "ARCHIVAL_MATERIAL",
        Authority_instance.arch_mat_search_dict,
    )
    logger.info(f"[ARCHIVAL_MATERIAL] Creating Archival Material Match File")
    create_match_file(
        collection,
        Authority_instance.df_arch_mat_auth,
        authority_Excelfile(collection.full_catalog, "ARCHIVAL_MATERIAL"),
        "ARCHIVAL_MATERIAL",
    )
    if "MEDIUM_FORMAT" in list(collection.full_catalog.columns):
        logger.info(
            f"[MEDIUM_FORMAT] Starting to work on MEDIUM_FORMAT column")
        collection.full_catalog = check_values_against_cvoc(
            collection.full_catalog,
            "MEDIUM_FORMAT",
            Authority_instance.media_format_mapping_dict,
        )
        logger.info(f"[MEDIUM_FORMAT] Creating Media/Format Match File")
        create_match_file(
            collection,
            Authority_instance.df_media_format_auth,
            authority_Excelfile(collection.full_catalog, "MEDIUM_FORMAT"),
            "MEDIUM_FORMAT",
        )
    else:
        logger.error(
            f"[MEDIUM_FORMAT] no columns names [MEDIUM_FORMAT] in table!")
    # logger.info(f"[DATE_CATALOGING] Checking and Validating DATE_CATALOGING column")
    # collection.full_catalog = check_cataloging_date(collection.full_catalog)
    logger.info("Final file: creating final file")
    collection = create_final_file(collection)
    # Attach Alma system numbers (MMS IDs); records missing from Alma come
    # back in a separate frame. NOTE(review): df_alma is unpacked but never
    # used below.
    collection.full_catalog, df_alma, df_missing_records_in_alma = get_alma_sid(
        collection.aleph_custom04_path,
        collection.collection_id,
        collection.full_catalog,
    )
    if df_missing_records_in_alma is not None:
        logger.error(
            "Not all records have MMS ID - please create alma records for missing MMS IDs!"
            f'[missing MMSID] please import missing records in catalog to Alma'
        )
        # Re-index the missing-records sheet by "סימול" (call number) and
        # drop the 001 (MMS ID) column, which is empty by definition here.
        collection.missing_records = drop_col_if_exists(
            df_missing_records_in_alma.reset_index().set_index("סימול"), "001")
        # Worksheet name translates to "missing system numbers".
        update_df_in_gdrive(collection,
                            worksheet_name="מספרי מערכת חסרים",
                            copy=False)
    else:
        logger.info(
            f"Adding the Number of digitization files - as a column to the table"
        )
        collection = add_number_of_files(collection)
        logger.info(
            f"updating the preprocessed DataFrame in Google Sheets - "
            f"as final copy: {collection.collection_id}_Final_to_alma_{collection.dt_now}"
        )
        update_df_in_gdrive(collection, copy=True)
        collection.temp_preprocess_file()
        export_entire_catalog(collection, collection.full_catalog,
                              stage="FINAL")
    elapsed = timeit.default_timer() - start_time
    logger.info(f"Execution Time: {elapsed}")
    # collection = add_MMSIDs_to_full_catalog(collection)
    collection.temp_preprocess_file(stage="PRE")
def test___init__(self):
    """Smoke test: initialize the collection logger and emit one record."""
    branch = "VC-Architect"
    collection_id = "PAlAv"
    initialize_logger(branch, collection_id)
    logging.info("bla bla")
def main():
    """Run the postprocess pipeline: build MARC fields from the final table.

    Verifies that preprocess-1 already ran for the selected collection,
    then derives the full set of MARC fields (351, 093/911, 008, 245,
    1xx/7xx, 655, 041, copyright defaults, 255, 260, 952, 597, 921/933,
    500s, 999, 906/BAS, FMT, 948, 773, 33x, 534, 590, 561, 907, 035, 650)
    on ``collection.df_final_data`` and exports the MARCXML / MARC
    sequential files.

    Side effects: file I/O and logging; exits the process if the
    preprocess-1 check fails. No return value.
    """
    start_time = timeit.default_timer()
    collection = retrieve_collection()
    # initialize logger for the logging file for that collection
    initialize_logger(collection.branch, collection.collection_id)
    logger = logging.getLogger(__name__)
    logger.info(
        f"\n Starting new preprocess of {collection.collection_id}, at: {datetime.now()}"
    )
    logger.info(
        f'\nStarting new preprocess {"/".join(str(sys.modules[__name__])[:-1].split("/")[-3:])} of '
        f"{collection.collection_id}, at: {datetime.now()}")
    time.sleep(0.5)
    # Guard: this stage requires the preprocess-1 output to exist.
    if is_collection_postprocess1(collection):
        pass
    else:
        logger.error(
            f"The {collection.collection_id} Catalog did not pass the preprocessing-1 pipe!"
            f"please run preprocess_1.py for this Catalog ")
        sys.exit()
    # NOTE(review): `df` is built (duplicate columns removed, first column
    # renamed to "mms_id") but never used afterwards — dead code?
    df = collection.df_final_data.T.drop_duplicates().T
    df.rename(columns={df.columns[0]: "mms_id"}, inplace=True)
    # create 351 - level of description (רמת תיאור)
    logger.info(f"[351] Creating MARC 351 - LEVEL OF DESCRIPTION")
    collection.df_final_data = marc.create_MARC_351_LDR(
        collection.df_final_data)
    # create MARC 911 and 093 field for Call Number (סימול פרויקט)
    logger.info("[911/093] Creating 911/093 MARC field for Call Number")
    collection.df_final_data = marc.create_MARC_093(collection.df_final_data,
                                                    collection.collection_id)
    # Re-index by the call number embedded in 093_1 between the $$c and
    # $$d subfield markers.
    collection.df_final_data.index = collection.df_final_data["093_1"].apply(
        lambda x: x[x.find("$$c") + 3:x.find("$$d")])
    # # Add MMS id to catalog (Alma system number)
    # logger.info("[001] Add MMS id to catalog")
    # collection.df_final_data = drop_col_if_exists(collection.df_final_data, 'mms_id')
    # collection.df_final_data, df_alma = project.get_alma_sid(
    #     collection.aleph_custom04_path,
    #     collection.collection_id,
    #     collection.df_final_data,
    # )
    # create 008
    logger.info(f"[008] Creating MARC 008 field")
    collection.df_final_data = marc.create_MARC_initial_008(
        collection.df_final_data)
    # create 520 - description (תיאור)
    logger.info(f"[520] Creating MARC 520 - SCOPE AND CONTENT")
    collection.df_final_data = marc.create_MARC_520(collection.df_final_data)
    # create 245 - title (כותרת)
    logger.info(f"[245] Creating MARC 245 - UNITITLE")
    collection.df_final_data = marc.create_MARC_245(collection.df_final_data)
    # create 110 and 100 (FIRST CREATORS CORPS and PERS)
    collection.df_final_data = marc.create_MARC_100_110(
        collection.df_final_data)
    # create 300 (EXTENT) (היקף)
    logger.info("[MARC 300] Creating MARC 300 - EXTENT ")
    collection.df_final_data = marc.create_MARC_300(collection.df_final_data)
    # create 700 and 710 (added creators PERS and CORPS)
    collection.df_final_data = marc.create_MARC_700_710(
        collection.df_final_data)
    # create 655 (ARCHIVAL_MATERIAL) (סוג חומר)
    logger.info("[MARC 655] Creating MARC 655 - ARCHIVAL MATERIAL ")
    collection.df_final_data = marc.create_marc_655(collection.df_final_data)
    # create 041 (LANGUAGE) (שפה)
    logger.info("[MARC 041] Creating MARC 041 - LANGUAGE")
    collection.df_final_data = marc.create_MARC_041(collection.df_final_data)
    ####################################################
    ###  CREATE COPYRIGHT FIELDS WITH DEFAULT VALUES ###
    ###  fields: 939, 903, 952
    ####################################################
    logger.info("[MARC 939, 903, 952] Creating default copyright fields")
    collection.df_final_data = marc.create_MARC_defualt_copyright(
        collection.df_final_data)
    # create 255 - scale field
    logger.info("[MARC 255] Creating MARC 255 - SCALE")
    collection.df_final_data = marc.create_MARC_255(collection.df_final_data)
    # create 260 (DATE fields, and PUBLICATION_COUNTRY); the Hebrew column
    # names passed below are: publication country, early/late normalized
    # date, free-text date.
    logger.info(
        "[MARC 260] Creating MARC 260 $g $e - DATE (free text), and publication country."
        " Updates MARC 008")
    collection.df_final_data = marc.create_MARC_260(
        collection.df_final_data,
        "מדינת הפרסום/הצילום",
        ["תאריך מנורמל מוקדם", "תאריך מנורמל מאוחר", "תאריך חופשי"],
    )
    logger.info("[MARC 952] Creating MARC 952 - Privacy")
    collection.df_final_data = marc.create_MARC_952(collection.df_final_data)
    # add 597 (CREDIT)
    collection = marc.add_MARC_597(collection)
    logger.info("[MARC 597] Creating MARC 597 - CREDITS")
    # create 921, 933 (CATALOGUER, CATALOGING DATE)
    logger.info(
        "[MARC 921/933] Creating MARC 921/933 - CATALOGUERS and CATALOGUING DATE"
    )
    collection.df_final_data = marc.create_MARC_921_933(
        collection.df_final_data)
    # create 500 (NOTES) and other fields:
    logger.info("[MARC 500] Creating MARC 500 - NOTES")
    collection.df_final_data = marc.create_MARC_500(collection.df_final_data)
    collection.df_final_data = marc.create_MARC_500s_4collection(
        collection.df_final_data)
    # create 999 (Default values: NOULI, NOOCLC, ARCHIVE)
    logger.info(
        "[MARC 999] initializing MARC 999 with constant values: NOULI, NOOCLC, ARCHIVE"
    )
    collection.df_final_data = marc.create_MARC_999(collection.df_final_data)
    # create BAS=VIS, in alma BAS -> 906
    logger.info("[MARC 906] Adding BAS = VIS - in Alma 906")
    collection.df_final_data = marc.create_MARC_BAS(collection.df_final_data)
    # create FMT
    collection.df_final_data = marc.create_MARC_FMT(collection.df_final_data)
    # create OWN (Default value: NNL)
    logger.info(
        "[MARC 948] initializing MARC 948 - formerly Own with constant values: NNL"
    )
    collection.df_final_data = marc.create_MARC_948(collection.df_final_data)
    # create 773 (former LKR)
    logger.info("[MARC 773] Creating MARC 773 - the hierarchical link field")
    collection.df_final_data = marc.create_MARC_773(collection.df_final_data)
    # create 336
    logger.info("[MARC 336] Creating MARC RDA 336 ")
    collection.df_final_data, df_explode_336 = marc.create_MARC_336(
        collection.df_final_data)
    # NOTE(review): `df2` is never used after this concat — dead code?
    df2 = pd.concat([collection.df_final_data, df_explode_336], axis=1)
    # create 337 338
    logger.info("[MARC 337/338] Creating MARC RDA 337/338 ")
    collection.df_final_data = marc.create_MARC_337_338(
        collection.df_final_data)
    # create 534
    logger.info("[MARC 534] Creating MARC 534 - MEDIA FORMAT ")
    collection.df_final_data = marc.create_MARC_534(collection.df_final_data)
    # create MARC 590
    logger.info("[MARC 590] Creating MARC 590 - HIDDEN NOTES")
    collection.df_final_data = marc.create_MARC_590(collection.df_final_data)
    # create MARC 561
    logger.info(
        "[MARC 561] Creating MARC 561 - Ownership and Custodial History")
    collection.df_final_data = marc.create_MARC_561(collection.df_final_data)
    collection.temp_preprocess_file(stage="POST")
    # ADD 907 (#Rossetta link)
    logger.info(
        "[MARC 907] Recreating MARC 907 - adding the Rossetta field, link to the digital object (if exists)"
    )
    collection = marc.add_MARC_907(collection)
    # recreate 035 MARC field from the ROS\[collection_id]_907.xml file
    logger.info(
        "[MARC 035] Recreating MARC 035 - for records which are migrated from Aleph"
    )
    collection = marc.add_MARC_035(collection)
    # create MARC 650 for project branches
    logger.info(
        "[MARC 650] create MARC 650 subject heading according to collection's branch"
    )
    collection = marc.create_MARC_650_branch(collection)
    # create MARC Catalog
    marc.export_MARCXML_final_table(collection)
    collection.create_marc_seq_file()
    # ###############################################
    ###  export final dataframe to check process  ###
    ###############################################
    # collection.temp_preprocess_file(stage="POST")
    ###############################################
    ###  how much time the process ran?          ###
    ###############################################
    elapsed = timeit.default_timer() - start_time
    logger.info(f"Execution Time: {elapsed}")