def z3950_query_manager(target, meta, matchpoint):
    """
    Oversees queries sent to Sierra Z3950
    args:
        target dict
        meta obj
        matchpoint
    return:
        query result
    """
    module_logger.debug('Making new Z3950 request to: {}'.format(
        target['host']))
    try:
        result = queries.query_runner(
            'Z3950', target, meta, matchpoint)
        return result
    except ConnectionError:
        module_logger.error('Z3950 Connection error on host {}'.format(
            target['host']))
        raise OverloadError(
            'Connection error. Unable to reach Z3950 host: {}.'.format(
                target))
    except ValueError:
        module_logger.error(
            'Z3950 ValueError on target parameters {}'.format(
                target))
        raise OverloadError(
            'Z3950 target not provided')
def create_sheet_for_system(system, auth, sheet_name, tabs, parent_id=None):
    """
    creates Google Sheet of given name, with layout for NYPL report
    args:
        system: string, 'NYPL' or 'BPL'
        auth: class 'oauth2client.client.OAuth2Credentials'
        sheet_name: string, name of the spreadsheet
        tabs: list, names of individual sheets
        parent_id: string, GDrive id of the destination folder
    returns:
        sheet_id: string, GDrive assigned id
    """
    sheet_id = goo.create_sheet(auth, sheet_name, tabs)

    # customize it
    if 'CallNumbers' in sheet_name:
        goo.customize_pvf_callNos_report(auth, sheet_id)
    elif 'Dups' in sheet_name and system == 'NYPL':
        goo.customize_nypl_pvf_dup_report(auth, sheet_id)
    elif 'Dups' in sheet_name and system == 'BPL':
        goo.customize_bpl_pvf_dup_report(auth, sheet_id)

    # move sheet to appropriate folder
    if not goo.file2folder(auth, parent_id, sheet_id):
        module_logger.error('Unable to move sheet {} to folder {}.'.format(
            sheet_id, parent_id))
        raise OverloadError('Failed to move {} document to '
                            'correct GDrive folder'.format(sheet_name))

    return sheet_id
def save_stats():
    module_logger.debug('Saving batch stats.')
    batch = shelve.open(BATCH_META)
    timestamp = batch['timestamp']
    system = batch['system']
    library = batch['library']
    agent = batch['agent']
    file_qty = len(batch['file_names'])
    batch.close()

    try:
        df = reports.shelf2dataframe(BATCH_STATS, system)
    except ValueError:
        df = None

    if df is not None:
        stats = reports.create_stats(system, df)

        with session_scope() as session:
            # find out if timestamp already added
            # if not add records
            # add batch record
            record = insert_or_ignore(
                session, PVR_Batch,
                timestamp=timestamp,
                system=system,
                library=library,
                agent=agent,
                file_qty=file_qty)
            session.flush()
            bid = record.bid
            for row in stats.iterrows():
                name = row[1]['vendor']
                record = insert_or_ignore(session, Vendor, name=name)
                session.flush()
                vid = record.vid

                if system == 'nypl':
                    record = insert_or_ignore(
                        session, PVR_File,
                        bid=bid, vid=vid,
                        new=row[1]['insert'],
                        dups=row[1]['attach'],
                        updated=row[1]['update'],
                        mixed=row[1]['mixed'],
                        other=row[1]['other'])
                else:
                    record = insert_or_ignore(
                        session, PVR_File,
                        bid=bid, vid=vid,
                        new=row[1]['insert'],
                        dups=row[1]['attach'],
                        updated=row[1]['update'])
    else:
        module_logger.warning(
            'Unable to create dataframe from the BATCH_STATS.')
        raise OverloadError(
            'Encountered problems while trying to save statistics.')
def store_connection(name, host, folder, user, password, system):
    if name == '':
        name = None
    if host == '':
        host = None
    if folder == '':
        folder = None
    if system == '':
        system = None
    if user == '':
        user = None
    else:
        user = base64.b64encode(user)
    if password == '':
        password = None
    else:
        password = base64.b64encode(password)

    try:
        with session_scope() as db_session:
            insert_or_ignore(
                db_session, FTPs,
                name=name, host=host, folder=folder, user=user,
                password=password, system=system)
    except IntegrityError as e:
        module_logger.error('Unable to store FTP details. Error: {}'.format(e))
        raise OverloadError('Error. The name of the new connection is\n'
                            'already used or some of the required elements\n'
                            'are missing')
def update_template(otid, record):
    try:
        with session_scope() as session:
            update_nypl_template(session, otid, **record)
    except IntegrityError as e:
        module_logger.error('IntegrityError on template update: {}'.format(e))
        raise OverloadError('Duplicate/missing template name\n'
                            'or missing primary matchpoint')
def save_template(record):
    try:
        with session_scope() as session:
            insert_or_ignore(session, NYPLOrderTemplate, **record)
    except IntegrityError as e:
        module_logger.error('IntegrityError on template save: {}'.format(e))
        raise OverloadError('Duplicate/missing template name\n'
                            'or missing primary matchpoint')
def connect2ftp(host, user, password):
    module_logger.debug('Connecting to FTP: {}.'.format(host))
    try:
        ftp = FTP(host)
        conn = ftp.login(user, password)
        if conn[:3] == '230':
            module_logger.debug('Successful connection.')
            return ftp
        else:
            module_logger.error(
                'Unsuccessful connection attempt to FTP: {}.'.format(conn))
            raise OverloadError('Unable to connect to FTP.\n'
                                'Error: {}'.format(conn))
    except all_errors as e:
        module_logger.error('Unable to connect to: {}. {}'.format(host, e))
        raise OverloadError('Unable to connect to: {}.\n'
                            'Verify host and your credentials'.format(host))
def delete_connection(name, system):
    with session_scope() as db_session:
        try:
            delete_record(db_session, FTPs, name=name, system=system)
        except Exception as exc:
            _, _, exc_traceback = sys.exc_info()
            tb = format_traceback(exc, exc_traceback)
            module_logger.error(
                'Unhandled error during deletion of FTP details. {}'.format(
                    tb))
            raise OverloadError(exc)
def store_in_vault(application, user, password):
    """
    stores credentials in Windows Credential Locker
    args:
        application: string, name of application
        user: string, name of user
        password: string
    """
    # check if credentials already stored and if so
    # delete and store updated ones
    try:
        if not get_from_vault(application, user):
            keyring.set_password(application, user, password)
        else:
            keyring.delete_password(application, user)
            keyring.set_password(application, user, password)
    except PasswordSetError as e:
        raise OverloadError(e)
    except PasswordDeleteError as e:
        raise OverloadError(e)
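# Example (hedged sketch): round-tripping a credential through the helpers
# above via the keyring library. The service name 'overload_platform' and the
# user name are hypothetical values used only for illustration.
#
#   store_in_vault('overload_platform', 'some_user', 's3cret')
#   assert get_from_vault('overload_platform', 'some_user') == 's3cret'
#   # keyring.delete_password('overload_platform', 'some_user') removes it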
def sierra_export_reader(source_fh, system, progbar1, progbar2):
    with open(source_fh, "r") as file:
        reader = csv.reader(file)

        # skip header
        header = reader.next()

        # check if Sierra export file has a correct structure
        if system == "NYPL":
            if header != NW2SEXPORT_COLS:
                raise OverloadError(
                    "Sierra Export format incorrect.\nPlease refer to help "
                    "for more info."
                )
        elif system == "BPL":
            if header != BW2SEXPORT_COLS:
                raise OverloadError(
                    "Sierra Export format incorrect.\nPlease refer to help "
                    "for more info."
                )
        estimate_progbars_max(reader, progbar1, progbar2)
def save_report(data, outfile):
    # delete previous report
    if not remove_files([outfile]):
        raise OverloadError('Unable to delete previous default '
                            'validation report: {}.'.format(outfile))

    report = []
    for k, v in data.iteritems():
        report.append('\n{} - barcode dups:'.format(k))
        dups = []
        for f, p in v:
            dups.append('\tfile: {} -- record no:{}'.format(f, p))
        report.append('\n'.join(sorted(dups)))

    if report == []:
        report = ['No errors found']

    try:
        with open(outfile, 'w') as file:
            file.write('\n'.join(report))
    except IOError as e:
        raise OverloadError(
            'Unable to create a new default validation report. '
            'Error: {}'.format(e))
def platform_queries_manager(api_type, session, meta, matchpoint):
    """
    Oversees queries sent to Platform
    args:
        api_type
        session obj
        meta obj
        matchpoint
    return:
        query result
    """
    module_logger.debug('Making new Platform request.')
    try:
        result = queries.query_runner(api_type, session, meta, matchpoint)
        return result
    except APITokenExpiredError:
        session.close()
        raise APITokenExpiredError(
            'Unable to perform query. Platform token expired.')
    except ConnectionError as e:
        module_logger.error('ConnectionError while running Platform queries. '
                            'Closing session and aborting processing.')
        session.close()
        raise OverloadError(e)
    except Timeout as e:
        module_logger.error('Timeout error while running Platform queries. '
                            'Closing session and aborting processing.')
        session.close()
        raise OverloadError(e)
    except ValueError as e:
        session.close()
        module_logger.error(e)
        raise OverloadError(e)
def count_bibs(file):
    reader = read_marc21(file)
    bib_count = 0
    try:
        for bib in reader:
            bib_count += 1
        return bib_count
    except RecordLengthInvalid:
        raise OverloadError(
            "Attempted to process non-MARC file,\n"
            "or invalid MARC file: {}".format(file)
        )
    except UnicodeDecodeError:
        raise OverloadError(
            "Character encoding error in file:\n{}\n"
            "Please convert character encoding to UTF-8\n"
            "using MARCEdit program.".format(file)
        )
    except RecordDirectoryInvalid:
        raise OverloadError(
            "Encountered malformed MARC record directory\n"
            'in file "{}".\nUse MARCEdit to identify '
            "incorrect record.".format(file)
        )
def set_nypl_sierra_bib_default_location(library, bib):
    """
    adds a 949 MARC tag command for setting bibliographic location
    args:
        library: string, 'branches' or 'research'
        bib: pymarc.record.Record
    returns:
        bib: pymarc.record.Record, with command "bn=" added to the
             "949 $a" field; the field is created if missing
    """
    # determine correct location code
    if library == "branches":
        defloc = NBIB_DEFAULT_LOCATIONS["branches"]
    elif library == "research":
        defloc = NBIB_DEFAULT_LOCATIONS["research"]
    else:
        raise OverloadError("Invalid library argument passed: {}".format(library))

    # determine if 949 command tag is already present
    if sierra_command_tag(bib):
        for field in bib.get_fields("949"):
            if field.indicators == [" ", " "]:
                command = field["a"].strip()
                if "bn=" in command:
                    # skip, already present
                    break
                else:
                    if command[-1] == ";":
                        new_command = "{}{}".format(
                            field["a"], "bn={};".format(defloc))
                    else:
                        new_command = "{}{}".format(
                            field["a"], ";bn={};".format(defloc))
                    field["a"] = new_command
                    break
    else:
        # command tag not present, add it
        bib.add_field(
            Field(
                tag="949",
                indicators=[" ", " "],
                subfields=["a", "*bn={};".format(defloc)],
            )
        )

    return bib
def decrypt_file_data(key, fh):
    """
    decrypts data in a file
    args:
        key: string, 16-byte encryption key
        fh: string, file handle of file to be decrypted
    returns:
        data: string
    """
    try:
        with open(fh, "rb") as file:
            nonce, tag, ciphertext = [file.read(x) for x in (16, 16, -1)]
        cipher = AES.new(key, AES.MODE_EAX, nonce)
        data = cipher.decrypt_and_verify(ciphertext, tag)
        return data
    except ValueError as e:
        raise OverloadError(e)
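# Hedged sketch of the file layout decrypt_file_data() expects: a 16-byte
# AES-EAX nonce, a 16-byte MAC tag, then the ciphertext. The
# encrypt_file_data() helper below is illustrative only (PyCryptodome API)
# and is not part of this module.
#
#   from Crypto.Cipher import AES
#
#   def encrypt_file_data(key, data, fh):
#       cipher = AES.new(key, AES.MODE_EAX)   # default EAX nonce is 16 bytes
#       ciphertext, tag = cipher.encrypt_and_digest(data)
#       with open(fh, 'wb') as file:
#           for chunk in (cipher.nonce, tag, ciphertext):
#               file.write(chunk)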
def barcode_duplicates(batch, system):
    """
    Verifies there are no duplicate barcodes in the batch;
    parses all barcodes found in the list of MARC files (batch),
    finds duplicates, and creates a report indicating files and
    records that are dups
    args:
        batch: list of MARC files
        system: string, 'nypl' or 'bpl'
    returns:
        dict of dups (key: barcode, value: list of tuples (file, bib position))
    """
    barcodes = dict()
    dup_barcodes = dict()

    if system == 'nypl':
        item_tag = '949'
        item_tag_ind = [' ', '1']
        item_tag_sub = 'i'
    elif system == 'bpl':
        item_tag = '960'
        item_tag_ind = [' ', ' ']
        item_tag_sub = 'i'

    for fh in batch:
        try:
            reader = read_marc21(fh)
            pos = 0
            for record in reader:
                pos += 1
                for tag in record.get_fields(item_tag):
                    if tag.indicators == item_tag_ind:
                        for b in tag.get_subfields(item_tag_sub):
                            if b in barcodes:
                                new_value = barcodes[b]
                                new_value.append((fh, pos))
                                barcodes[b] = new_value
                            else:
                                barcodes[b] = [(fh, pos)]
        except UnicodeDecodeError as e:
            raise OverloadError(e)

    for k, v in barcodes.iteritems():
        if len(v) > 1:
            dup_barcodes[k] = v

    return dup_barcodes
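# Example (hedged): checking a batch of vendor files for duplicate barcodes
# and writing the default validation report with save_report(). The file
# paths are hypothetical; DVAL_REP is the module's default report path.
#
#   dups = barcode_duplicates(
#       ['./files/vendor1.mrc', './files/vendor2.mrc'], 'nypl')
#   if dups != {}:
#       save_report(dups, DVAL_REP)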
def move2ftp(host, ftp, fh, dstfh, transfer_type):
    try:
        module_logger.debug('Uploading file to FTP: host={}, local path={}, '
                            'destination fh={}, transfer type={}'.format(
                                host, fh, dstfh, transfer_type))
        if transfer_type == 'binary':
            ftp.storbinary('STOR {}'.format(dstfh), open(fh, 'rb'))
        elif transfer_type == 'ASCII':
            ftp.storlines('STOR {}'.format(dstfh), open(fh, 'r'))
        module_logger.debug('Upload successful.')
    except all_errors as e:
        module_logger.error(
            'Upload to FTP failed: host={}, destination fh={}, '
            'transfer type={}. Error: {}'.format(
                host, dstfh, transfer_type, e))
        raise OverloadError(
            'Encountered error while uploading file to FTP.\nAborting.')
def move2local(host, ftp, fh, dstfh, transfer_type):
    try:
        module_logger.debug(
            'Downloading file from FTP: host={}, fh={}, destination path={}, '
            'transfer type={}'.format(host, fh, dstfh, transfer_type))
        if transfer_type == 'binary':
            with open(dstfh, 'wb') as f:
                ftp.retrbinary('RETR %s' % fh, lambda data: f.write(data))
        elif transfer_type == 'ASCII':
            with open(dstfh, 'w') as f:
                ftp.retrlines('RETR %s' % fh, lambda data: f.write(data))
        module_logger.debug('Download successful.')
    except all_errors as e:
        module_logger.error(
            'Download from FTP failed: host={}, file on remote={}, '
            'destination path={}, transfer type={}. Error: {}'.format(
                host, fh, dstfh, transfer_type, e))
        raise OverloadError('Encountered error while downloading the file.')
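# Example (hedged): a minimal FTP round trip with the helpers above. Host,
# credentials, and file names are hypothetical; ftp.quit() is the standard
# ftplib way to close the connection when done.
#
#   ftp = connect2ftp('ftp.example.com', 'user', 'password')
#   move2ftp('ftp.example.com', ftp, './files/orders.mrc', 'orders.mrc',
#            'binary')
#   move2local('ftp.example.com', ftp, 'orders.mrc',
#              './files/orders_copy.mrc', 'binary')
#   ftp.quit()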
def launch_process(
    source_fh,
    data_source,
    system,
    library,
    progbar1,
    progbar2,
    process_label,
    hits,
    nohits,
    skipped,
    meet_crit_counter,
    fail_user_crit_counter,
    fail_glob_crit_counter,
    action,
    encode_level,
    mat_type,
    cat_rules,
    cat_source,
    recap_range,
    id_type="ISBN",
    api=None,
):
    """
    work notes:
    1. iterate through the source files and extract bib/order metadata
    2. temporarily persist this data in local datastore
    3. iterate over the batch and find best hit for each
    4. persist in local store matched record as a pymarc object
    5. display results (with all data needed for Sierra import) to user
    6. allow user to decide what to write to final file

    args:
        source_fh: str, file path
        data_source: str, 'Sierra export' or 'IDs list'
        system: str, 'NYPL' or 'BPL'
        library: str, 'research' or 'branches'
        progbar1: tkinter widget, overall progressbar
        progbar2: tkinter widget, task progressbar
        process_label: tkinter StringVar, current task label
        hits: tkinter IntVar, hits counter
        nohits: tkinter IntVar, failed search counter
        skipped: tkinter IntVar, skipped record counter
        meet_crit_counter: tkinter IntVar, success match & eval counter
        fail_user_crit_counter: tkinter IntVar, failed user criteria counter
        fail_glob_crit_counter: tkinter IntVar, failed global criteria counter
        action: str, 'catalog' or 'upgrade'
        encode_level: str, 'any', ...
        mat_type: str, 'any', 'print', 'large print', 'dvd', 'bluray'
        cat_rules: str, 'any', 'RDA-only'
        cat_source: str, 'any', 'DLC'
        recap_range: list, upper and lower limits of ReCAP numbers
        id_type: str, 'ISBN', 'UPC', 'ISSN', 'LCCN', 'OCLC #'
        api: str, name of api to be used for queries
    """
    if mat_type == "":
        mat_type = None
    if cat_source == "":
        cat_source = None

    module_logger.debug(
        "Launching W2S process. "
        "Params: source_fh:{}, data_source:{}, system:{}, "
        "library:{}, action:{}, encode_level:{}, mat_type:{}, "
        "cat_rules:{}, cat_source:{}, recap_range:{}, id_type:{}, "
        "api:{}".format(
            source_fh,
            data_source,
            system,
            library,
            action,
            encode_level,
            mat_type,
            cat_rules,
            cat_source,
            recap_range,
            id_type,
            api,
        )
    )

    processed_counter = 0
    found_counter = 0
    not_found_counter = 0
    skipped_counter = 0

    remove_previous_process_data()

    # validate correctness of sierra export
    process_label.set("reading:")
    if data_source == "Sierra export":
        sierra_export_reader(source_fh, system, progbar1, progbar2)
    elif data_source == "IDs list":
        id_list_reader(source_fh, progbar1, progbar2)

    # keep track of recap call numbers
    if recap_range:
        recap_no = recap_range[0]
    else:
        recap_no = None

    with session_scope() as db_session:
        # create batch record
        batch_rec = insert_or_ignore(
            db_session,
            WCSourceBatch,
            file=source_fh,
            system=system,
            library=library,
            action=action,
            api=api,
            data_source=data_source,
            encode_level=encode_level,
            mat_type=mat_type,
            cat_rules=cat_rules,
            cat_source=cat_source,
            id_type=id_type,
        )
        db_session.flush()
        batch_id = batch_rec.wcsbid

        # parse depending on the data source
        if data_source == "IDs list":
            with open(source_fh, "r") as file:
                reader = csv.reader(file)
                # skip header
                reader.next()
                if id_type == "ISBN":
                    for row in reader:
                        meta = BibOrderMeta(
                            system=system,
                            dstLibrary=library,
                            t020=[parse_isbn(row[0])],
                        )
                        insert_or_ignore(
                            db_session, WCSourceMeta, wcsbid=batch_id, meta=meta
                        )
                        update_progbar(progbar1)
                        update_progbar(progbar2)
                elif id_type == "UPC":
                    raise OverloadError("Not implemented.")
                    # will be implemented later
                    # for row in reader:
                    #     meta = BibOrderMeta(
                    #         system=system,
                    #         dstLibrary=library,
                    #         t024=[parse_upc(row[0])])
                elif id_type == "OCLC #":
                    for row in reader:
                        meta = BibOrderMeta(
                            system=system,
                            dstLibrary=library,
                            t001=row[0],
                        )
                        insert_or_ignore(
                            db_session, WCSourceMeta, wcsbid=batch_id, meta=meta
                        )
                        update_progbar(progbar1)
                        update_progbar(progbar2)
                else:
                    raise OverloadError("Not implemented.")

        elif data_source == "Sierra export":
            data = sierra_export_data(source_fh, system, library)
            for meta, single_order in data:
                if single_order is None:
                    row = ["b{}a".format(meta.sierraId), meta.title]
                    skipped_counter += 1
                    skipped.set(skipped_counter)
                    save2csv(W2S_SKIPPED_ORD, row)
                    progbar1["maximum"] = progbar1["maximum"] - 3
                elif single_order is False:
                    row = ["b{}a".format(meta.sierraId), meta.title]
                    skipped_counter += 1
                    skipped.set(skipped_counter)
                    save2csv(W2S_MULTI_ORD, row)
                    progbar1["maximum"] = progbar1["maximum"] - 3
                else:
                    insert_or_ignore(
                        db_session, WCSourceMeta, wcsbid=batch_id, meta=meta
                    )
                    update_progbar(progbar1)
                    update_progbar(progbar2)

        creds = get_credentials(api)
        wskey = creds["key"]
        db_session.commit()

        # query Worldcat
        process_label.set("querying:")
        # reset progbar2
        progbar2["value"] = 0
        metas = retrieve_records(db_session, WCSourceMeta, wcsbid=batch_id)
        with SearchSession(credentials=wskey) as session:
            for m in metas:
                module_logger.debug(m.meta)
                hit = False
                if m.meta.t001:
                    query = construct_sru_query(
                        m.meta.t001,
                        keyword_type="OCLC #",
                        mat_type=mat_type,
                        cat_source=cat_source,
                    )
                    res = session.sru_query(query=query)
                    module_logger.debug("OCLC# request: {}".format(res.url))

                    hit = interpret_search_response(res, db_session, m.wcsmid)

                    if hit:
                        found_counter += 1

                if m.meta.t010 and not hit:
                    query = construct_sru_query(
                        m.meta.t010,
                        keyword_type="LCCN",
                        mat_type=mat_type,
                        cat_source=cat_source,
                    )
                    res = session.sru_query(query=query)
                    module_logger.debug("LCCN request: {}".format(res.url))

                    hit = interpret_search_response(res, db_session, m.wcsmid)

                    if hit:
                        found_counter += 1

                if m.meta.t020 and not hit:
                    # will iterate over all ISBNs if no hits
                    for isbn in m.meta.t020:
                        query = construct_sru_query(
                            isbn,
                            keyword_type="ISBN",
                            mat_type=mat_type,
                            cat_source=cat_source,
                        )
                        res = session.sru_query(query=query)
                        module_logger.debug("ISBN request: {}".format(res.url))

                        hit = interpret_search_response(
                            res, db_session, m.wcsmid)

                        if hit:
                            found_counter += 1
                            break  # stop searching

                if m.meta.t024 and not hit:
                    for upc in m.meta.t024:
                        query = construct_sru_query(
                            upc,
                            keyword_type="UPC",
                            mat_type=mat_type,
                            cat_source=cat_source,
                        )
                        res = session.sru_query(query=query)
                        module_logger.debug("UPC request: {}".format(res.url))

                        hit = interpret_search_response(
                            res, db_session, m.wcsmid)

                        if hit:
                            found_counter += 1
                            break  # stop searching

                if not hit:
                    not_found_counter += 1
                    module_logger.debug(
                        "Unable to find any matches in Worldcat for {}.".format(
                            m.meta)
                    )
                    interpret_search_response(None, db_session, m.wcsmid)

                hits.set(found_counter)
                nohits.set(not_found_counter)

                update_progbar(progbar1)
                update_progbar(progbar2)
                processed_counter += 1

        db_session.commit()

        # check if meet criteria
        process_label.set("analyzing:")
        progbar2["value"] = 0

        rows = retrieve_records(db_session, WCHit, hit=True)
        for row in rows:
            results = row.query_results
            recs = results2record_list(results)
            for xml_record in recs:
                fulfills = False
                fail_types = []
                if meets_upgrade_criteria(xml_record):
                    if meets_user_criteria(
                        xml_record, encode_level, mat_type, cat_rules, cat_source
                    ):
                        fulfills = True
                        if action == "upgrade":
                            meet_crit_counter.set(meet_crit_counter.get() + 1)

                            oclcNo = get_oclcNo(xml_record)
                            update_hit_record(
                                db_session, WCHit, row.wchid, match_oclcNo=oclcNo
                            )
                            update_progbar(progbar1)
                            update_progbar(progbar2)
                            break
                        elif action == "catalog":
                            if meets_catalog_criteria(xml_record, library):
                                fulfills = True
                                meet_crit_counter.set(
                                    meet_crit_counter.get() + 1)

                                oclcNo = get_oclcNo(xml_record)
                                update_hit_record(
                                    db_session, WCHit, row.wchid,
                                    match_oclcNo=oclcNo
                                )
                                update_progbar(progbar1)
                                update_progbar(progbar2)
                                break
                            else:
                                fail_types.append("global")
                    else:
                        fail_types.append("user")
                else:
                    fail_types.append("global")

            if not fulfills:
                if "user" in fail_types:
                    fail_user_crit_counter.set(fail_user_crit_counter.get() + 1)
                else:
                    fail_glob_crit_counter.set(fail_glob_crit_counter.get() + 1)

        db_session.commit()

        # download and prep
        process_label.set("downloading:")
        # reset progbar2
        progbar2["value"] = 0

        # obtain access token
        token = get_token(creds)
        if token.token_str is None:
            module_logger.error(
                "Worldcat token not obtained. Error: {}.".format(
                    token.server_response)
            )
        else:
            module_logger.debug("Worldcat token obtained.")

        # open Metadata API session
        with MetadataSession(credentials=token) as session:
            metas = retrieve_related(
                db_session, WCSourceMeta, "wchits", wcsbid=batch_id
            )
            for m in metas:
                if m.wchits.match_oclcNo:
                    xml_record = request_record(session, m.wchits.match_oclcNo)
                    if xml_record is not None:
                        update_hit_record(
                            db_session, WCHit, m.wchits.wchid,
                            match_marcxml=xml_record
                        )
                update_progbar(progbar1)
                update_progbar(progbar2)

        db_session.commit()

        # prepare MARC files
        process_label.set("prepping:")
        progbar2["value"] = 0

        # check if Sierra bib # provided and use for overlay command line
        rows = retrieve_records(db_session, WCSourceMeta, wcsbid=batch_id)
        for row in rows:
            # initial workflow shared by upgrade functionality
            xml_record = row.wchits.match_marcxml
            if xml_record is not None:
                marc_record = marcxml2array(xml_record)[0]
                remove_unsupported_subject_headings(system, marc_record)
                remove_unwanted_tags(marc_record)
                remove_ebook_isbns(marc_record)
                marc_record.remove_fields("901", "907", "945", "949", "947")
                initials = create_initials_field(system, library, "W2Sbot")
                marc_record.add_ordered_field(initials)

                if data_source == "Sierra export":
                    order_data = row.meta
                    if order_data.sierraId:
                        overlay_tag = create_target_id_field(
                            system, order_data.sierraId
                        )
                        marc_record.add_ordered_field(overlay_tag)

                if system == "NYPL":
                    marc_record.remove_fields("001", "910")
                    tag_001 = nypl_oclcNo_field(xml_record)
                    marc_record.add_ordered_field(tag_001)

                    # add Sierra bib code 3 and default location
                    if library == "branches":
                        defloc = NBIB_DEFAULT_LOCATIONS["branches"]
                    elif library == "research":
                        defloc = NBIB_DEFAULT_LOCATIONS["research"]

                    tag_949 = create_command_line_field(
                        "*b3=h;bn={};".format(defloc))
                    marc_record.add_ordered_field(tag_949)

                if action == "catalog":
                    # add call number & persist
                    if data_source == "Sierra export":
                        order_data = row.meta
                        local_fields = create_local_fields(
                            xml_record,
                            system,
                            library,
                            order_data=order_data,
                            recap_no=recap_no,
                        )
                    else:
                        # data source is a list of IDs
                        local_fields = create_local_fields(
                            xml_record, system, library, recap_no=recap_no
                        )

                    if local_fields:
                        for field in local_fields:
                            if field is not None:
                                marc_record.add_ordered_field(field)
                        if system == "NYPL" and library == "research":
                            recap_no += 1

                update_hit_record(
                    db_session, WCHit, row.wchits.wchid, prepped_marc=marc_record
                )

            update_progbar(progbar1)
            update_progbar(progbar2)

        # make sure W2S stays within assigned ReCAP range
        if system == "NYPL" and library == "research":
            if action == "catalog":
                if recap_no > recap_range[1]:
                    raise OverloadError(
                        "Used all available ReCAP call numbers "
                        "assigned for W2S."
                    )

    # show completed
    progbar1["value"] = progbar1["maximum"]
    progbar2["value"] = progbar2["maximum"]
def open_platform_session(api_name=None):
    """
    wrapper around platform authorization and platform session obj
    args:
        api_name str
    return:
        session obj
    """
    module_logger.debug('Prepping to open Platform session.')

    reusing_token = False
    try:
        ud = shelve.open(USER_DATA, writeback=True)

        # retrieve specified Platform authorization
        conn_data = ud['PlatformAPIs'][api_name]
        client_id = base64.b64decode(conn_data['client_id'])
        auth_server = conn_data['oauth_server']
        base_url = conn_data['host']
        last_token = conn_data['last_token']  # encrypt?

        # retrieve secret from Windows Vault
        client_secret = credentials.get_from_vault(auth_server, client_id)

        # check if valid token exists and reuse if can
        if last_token is not None:
            if last_token.get('expires_on') < datetime.now():
                # token expired, request new one
                module_logger.info(
                    'Platform token expired. Requesting new one.')
                auth = AuthorizeAccess(client_id, client_secret, auth_server)
                token = auth.get_token()
            else:
                module_logger.debug(
                    'Last Platform token still valid. Re-using.')
                reusing_token = True
                token = last_token
        else:
            module_logger.debug('Requesting Platform access token.')
            auth = AuthorizeAccess(client_id, client_secret, auth_server)
            token = auth.get_token()

        # save token for reuse
        if not reusing_token:
            module_logger.debug('Saving Platform token for reuse.')
            ud['PlatformAPIs'][api_name]['last_token'] = token

    except KeyError as e:
        module_logger.error(
            'KeyError in user_data: api name: {}. Error msg:{}'.format(
                api_name, e))
        raise OverloadError(
            'Error parsing user_data while retrieving connection info.')
    except ValueError as e:
        module_logger.error(e)
        raise OverloadError(e)
    except APITokenError as e:
        module_logger.error('Platform API Token Error: {}'.format(e))
        raise OverloadError(e)
    except ConnectionError as e:
        module_logger.error('Platform Connection Error: {}'.format(e))
        raise OverloadError(e)
    except Timeout as e:
        module_logger.error('Platform Timeout Error: {}'.format(e))
        raise OverloadError(e)
    finally:
        ud.close()

    # open Platform session
    try:
        module_logger.debug('Auth obtained. Opening Platform session.')
        session = PlatformSession(base_url, token)
        return session
    except ValueError as e:
        module_logger.error(e)
        raise OverloadError(e)
    except APITokenExpiredError as e:
        module_logger.error('Platform API token expired: {}'.format(e))
        raise OverloadError(e)
def run_processing(
        files, system, library, agent, api_type, api_name,
        template, output_directory, progbar, current_process_label):
    """
    args:
        template: instance of NYPLOrderTemplate class
    """
    # agent argument is 3 letter code

    module_logger.debug('PVR process launched.')

    # tokens and sessions are opened on this level

    # determine destination API
    if api_type == 'Platform API':
        module_logger.debug('Creating Platform API session.')
        try:
            session = open_platform_session(api_name)
        except OverloadError:
            raise
    elif api_type == 'Z3950':
        module_logger.debug(
            'retrieving Z3950 settings for {}'.format(api_name))
        user_data = shelve.open(USER_DATA)
        target = user_data['Z3950s'][api_name]
        user_data.close()
    elif api_type == 'Sierra API':
        module_logger.debug('Connecting to Sierra API')

    # clean-up batch metadata & stats
    if not template:
        template_name = None
    else:
        template_name = template.tName

    module_logger.debug('Opening BATCH_META.')
    batch = shelve.open(BATCH_META, writeback=True)
    module_logger.debug('BATCH_META has been emptied from previous content.')
    timestamp = datetime.now()
    batch['timestamp'] = timestamp
    batch['system'] = system
    batch['library'] = library
    batch['agent'] = agent
    batch['template'] = template_name
    batch['file_names'] = files
    batch.close()
    module_logger.debug('BATCH_META new data: {}, {}, {}, {}, {}, {}'.format(
        timestamp, system, library, agent, template_name, files))

    stats = shelve.open(BATCH_STATS, writeback=True)
    stats.clear()

    if not remove_files(BARCODES):
        module_logger.error(
            'Unable to empty BARCODES storage at location {}'.format(BARCODES))
        raise OverloadError('Unable to delete barcodes from previous batch.')
    else:
        module_logger.debug(
            'BATCH_STATS has been emptied from previous content.')

    # determine output mrc file handles
    if agent == 'cat':
        date_today = date.today().strftime('%y%m%d')
        fh_dups = os.path.join(
            output_directory, '{}.DUP-0.mrc'.format(date_today))
        fh_new = os.path.join(
            output_directory, '{}.NEW-0.mrc'.format(date_today))

        # delete existing files to start over from scratch
        if not remove_files([fh_new, fh_dups]):
            module_logger.warning(
                'Unable to delete PVF output files from previous batch.')
            raise OverloadError(
                'Unable to delete output files from previous batch.')
    elif agent in ('sel', 'acq'):
        # remove mrc extension if exists
        tail = os.path.split(files[0])[1]
        if tail[-4:] == '.mrc':
            tail = tail[:-4]
        tail = '{}.PRC-0.mrc'.format(tail)
        fh = os.path.join(output_directory, tail)

        # delete existing files to start over from scratch
        if not remove_files([fh]):
            module_logger.warning(
                'Unable to delete PVF output files from previous batch.')
            raise OverloadError(
                'Unable to delete output files from previous batch.')

    # create reference index
    module_logger.debug('Creating vendor index data for {}-{}'.format(
        system, agent))
    if agent == 'cat':
        rules = './rules/cat_rules.xml'
        vx = vendor_index(rules, system)  # wrap in exception?
    elif agent in ('sel', 'acq'):
        if system == 'nypl':
            query_matchpoints = dict()
            try:
                if template.match1st == 'sierra_id':
                    query_matchpoints['primary'] = (
                        'id', template.match1st)
                else:
                    query_matchpoints['primary'] = (
                        'tag', template.match1st)
                if template.match2nd is not None:
                    if template.match2nd == 'sierra_id':
                        query_matchpoints['secondary'] = (
                            'id', template.match2nd)
                    else:
                        query_matchpoints['secondary'] = (
                            'tag', template.match2nd)
                if template.match3rd is not None:
                    if template.match3rd == 'sierra_id':
                        query_matchpoints['tertiary'] = (
                            'id', template.match3rd)
                    else:
                        query_matchpoints['tertiary'] = (
                            'tag', template.match3rd)
            except NoResultFound:
                raise OverloadError('Unable to find template {}.\n'
                                    'Please verify it exists.'.format(
                                        template.tName))
            except AttributeError:
                raise OverloadError('Error while applying order template.')
        else:
            raise OverloadError(
                'selection workflow for BPL not implemented yet')

    # run queries and results analysis for each bib in each file
    n = 0
    f = 0
    for file in files:
        f += 1
        module_logger.debug(
            'Opening new MARC reader for file: {}'.format(file))
        reader = read_marc21(file)

        current_process_label.set('querying...')
        for bib in reader:
            n += 1

            if agent == 'cat':
                vendor = identify_vendor(bib, vx)

                try:
                    query_matchpoints = get_query_matchpoint(vendor, vx)
                    module_logger.debug(
                        'Cat vendor index has following query matchpoints: '
                        '{} for vendor {}.'.format(query_matchpoints, vendor))
                except KeyError:
                    module_logger.critical(
                        'Unable to match vendor {} with data '
                        'in cat vendor index'.format(vendor))
            elif agent in ('sel', 'acq'):
                # vendor code
                if system == 'nypl':
                    vendor = template.vendor
                    if vendor is None:
                        # do not apply but keep for stats
                        vendor = 'UNKNOWN'

            if vendor == 'UNKNOWN':
                module_logger.debug(
                    'Encountered unidentified vendor in record # : {} '
                    'in file {} (system={}, library={}, agent={})'.format(
                        n, file, system, library, agent))

            # determine vendor bib meta
            meta_in = VendorBibMeta(bib, vendor=vendor, dstLibrary=library)
            module_logger.info('Vendor bib meta: {}'.format(str(meta_in)))

            # store barcodes found in vendor files for verification
            module_logger.debug('Storing barcodes for verification.')
            with open(BARCODES, 'a') as barcodes_file:
                for b in meta_in.barcodes:
                    barcodes_file.write(b + '\n')

            # Platform API workflow
            if api_type == 'Platform API':
                matchpoint = query_matchpoints['primary'][1]
                module_logger.debug(
                    'Using primary matchpoint: {}.'.format(matchpoint))
                try:
                    result = run_platform_queries(
                        api_type, session, meta_in, matchpoint)
                except APITokenExpiredError:
                    module_logger.info(
                        'Platform token expired. '
                        'Requesting new one and opening new session.')
                    session = open_platform_session(api_name)
                    result = platform_queries_manager(
                        api_type, session, meta_in, matchpoint)

                # run_platform_queries returns tuple (status, response in json)
                meta_out = []

                if result[0] == 'hit':
                    meta_out = platform2meta(result[1])
                elif result[0] == 'nohit':
                    # requery with alternative matchpoint
                    if 'secondary' in query_matchpoints:
                        matchpoint = query_matchpoints['secondary'][1]
                        module_logger.debug(
                            'Using secondary matchpoint: {}.'.format(
                                matchpoint))

                        # run platform request for the secondary matchpoint
                        try:
                            result = run_platform_queries(
                                api_type, session, meta_in, matchpoint)
                        except APITokenExpiredError:
                            module_logger.info(
                                'Requesting new Platform token. '
                                'Opening new session.')
                            session = open_platform_session(api_name)
                            result = run_platform_queries(
                                api_type, session, meta_in, matchpoint)

                        # other exceptions raised in run_platform_queries
                        if result[0] == 'hit':
                            meta_out = platform2meta(result[1])
                        elif result[0] == 'nohit':
                            # run query for the 3rd matchpoint
                            if 'tertiary' in query_matchpoints:
                                matchpoint = query_matchpoints['tertiary'][1]
                                module_logger.debug(
                                    'Using tertiary matchpoint: {}.'.format(
                                        matchpoint))

                                # run platform request for the tertiary
                                # matchpoint
                                try:
                                    result = run_platform_queries(
                                        api_type, session, meta_in, matchpoint)
                                except APITokenExpiredError:
                                    module_logger.info(
                                        'Requesting new Platform token. '
                                        'Opening new session.')
                                    session = open_platform_session(api_name)
                                    result = run_platform_queries(
                                        api_type, session, meta_in, matchpoint)

                                if result[0] == 'hit':
                                    meta_out = platform2meta(result[1])
                                elif result[0] == 'error':
                                    raise OverloadError(
                                        'Platform server error.')
                        elif result[0] == 'error':
                            raise OverloadError('Platform server error.')
                    else:
                        module_logger.debug(
                            'No secondary matchpoint specified. '
                            'Ending queries.')
                elif result[0] == 'error':
                    raise OverloadError('Platform server error.')

            # queries performed via Z3950
            elif api_type == 'Z3950':
                meta_out = []
                matchpoint = query_matchpoints['primary'][1]
                module_logger.debug(
                    'Using primary matchpoint: {}.'.format(matchpoint))
                status, bibs = z3950_query_manager(
                    target, meta_in, matchpoint)
                if status == 'hit':
                    meta_out = bibs2meta(bibs)
                elif status == 'nohit' and \
                        'secondary' in query_matchpoints:
                    matchpoint = query_matchpoints['secondary'][1]
                    module_logger.debug(
                        'Using secondary matchpoint: {}'.format(matchpoint))
                    status, bibs = z3950_query_manager(
                        target, meta_in, matchpoint)
                    if status == 'hit':
                        meta_out = bibs2meta(bibs)
                    elif status == 'nohit' and \
                            'tertiary' in query_matchpoints:
                        matchpoint = query_matchpoints['tertiary'][1]
                        module_logger.debug(
                            'Using tertiary matchpoint: {}'.format(matchpoint))
                        status, bibs = z3950_query_manager(
                            target, meta_in, matchpoint)
                        if status == 'hit':
                            meta_out = bibs2meta(bibs)
                module_logger.info('Retrieved bibs meta: {}'.format(meta_out))

            # queries performed via Sierra API
            elif api_type == 'Sierra API':
                module_logger.error('Sierra API is not implemented yet.')
                raise OverloadError('Sierra API is not implemented yet.')
            else:
                module_logger.error('Invalid api_type')
                raise OverloadError('Invalid api_type encountered.')

            if system == 'nypl':
                analysis = PVR_NYPLReport(agent, meta_in, meta_out)
            elif system == 'bpl':
                analysis = PVR_BPLReport(agent, meta_in, meta_out)

            module_logger.debug('Analyzing query results and vendor bib')
            analysis = analysis.to_dict()

            # apply patches if needed
            try:
                bib = patches.bib_patches(system, library, agent, vendor, bib)
            except AssertionError as e:
                module_logger.warning(
                    'Unable to patch bib. Error: {}'.format(e))
                analysis['callNo_match'] = False

            module_logger.info('PVF analysis results: {}'.format(analysis))

            # save analysis to shelf for statistical purposes
            stats[str(n)] = analysis

            # output processed records according to analysis
            # add Sierra bib id if matched

            # enforce utf-8 encoding in MARC leader
            bib.leader = bib.leader[:9] + 'a' + bib.leader[10:]

            sierra_id_present = check_sierra_id_presence(system, bib)
            module_logger.debug(
                'Checking if vendor bib has Sierra ID provided: '
                '{}'.format(sierra_id_present))

            if not sierra_id_present and \
                    analysis['target_sierraId'] is not None:
                try:
                    module_logger.info(
                        'Adding target Sierra id ({}) MARC field '
                        'to vendor record {}.'.format(
                            analysis['vendor_id'],
                            analysis['target_sierraId']))
                    bib.add_field(
                        create_target_id_field(
                            system, analysis['target_sierraId']))
                except ValueError as e:
                    module_logger.error(e)
                    raise OverloadError(e)

            # add fields from bib & order templates
            module_logger.debug(
                'Adding template field(s) to the vendor record.')

            if agent == 'cat':
                templates = vx[vendor].get('bib_template')
                module_logger.debug(
                    'Selected CAT templates for {}: {}'.format(
                        vendor, templates))
                for catTemp in templates:
                    # skip if present or always add
                    if catTemp['tag'] == '949' and \
                            analysis['action'] == 'attach':
                        pass
                    elif catTemp['option'] == 'skip':
                        if catTemp['tag'] not in bib:
                            module_logger.debug(
                                'Field {} not present, adding '
                                'from template'.format(catTemp['tag']))
                            new_field = create_field_from_template(catTemp)
                            bib.add_field(new_field)
                        else:
                            module_logger.debug(
                                'Field {} found. Skipping.'.format(
                                    catTemp['tag']))
                    elif catTemp['option'] == 'add':
                        module_logger.debug(
                            'Field {} being added without checking '
                            'if already present'.format(catTemp['tag']))
                        new_field = create_field_from_template(catTemp)
                        bib.add_field(new_field)

            elif agent in ('sel', 'acq'):
                # batch template details should be retrieved instead for the
                # whole batch = no need to pull it for each bib

                new_fields = []
                if '960' in bib:
                    for t960 in bib.get_fields('960'):
                        new_field = db_template_to_960(template, t960)
                        if new_field:
                            new_fields.append(new_field)
                    bib.remove_fields('960')
                else:
                    new_field = db_template_to_960(template, None)
                    if new_field:
                        new_fields.append(new_field)

                # add modified fields back to record
                for field in new_fields:
                    bib.add_field(field)

                new_fields = []
                if '961' in bib:
                    for t961 in bib.get_fields('961'):
                        new_field = db_template_to_961(template, t961)
                        if new_field:
                            new_fields.append(new_field)
                    # remove existing fields
                    # (will be replaced by modified ones)
                    bib.remove_fields('961')
                else:
                    new_field = db_template_to_961(template, None)
                    if new_field:
                        new_fields.append(new_field)

                # add modified fields to bib
                for field in new_fields:
                    bib.add_field(field)

                if template.bibFormat and \
                        not sierra_command_tag(bib) and \
                        agent == 'sel':
                    new_field = db_template_to_949(template.bibFormat)
                    bib.add_field(new_field)
                # it's safer for acquisition to skip command in 949 -
                # there are conflicts with Import Invoices load table

            # apply bibliographic default location to NYPL brief records
            if system == 'nypl' and agent == 'sel':
                bib = set_nypl_sierra_bib_default_location(library, bib)

            # append to appropriate output file
            if agent == 'cat':
                if analysis['action'] == 'attach':
                    module_logger.debug(
                        'Appending vendor record to the dup file.')
                    write_marc21(fh_dups, bib)
                else:
                    module_logger.debug(
                        'Appending vendor record to the new file.')
                    write_marc21(fh_new, bib)
            else:
                module_logger.debug('Appending vendor record to a prc file.')
                write_marc21(fh, bib)

            # update progbar
            progbar['value'] = n
            progbar.update()

    # dedup new cataloging file
    if agent == 'cat' and os.path.isfile(fh_new):
        current_process_label.set('deduping...')
        dups, combined_count, deduped_fh = dedup_marc_file(fh_new, progbar)
        batch = shelve.open(BATCH_META, writeback=True)
        batch['duplicate_bibs'] = '{} dups merged into {} bibs'.format(
            dups, combined_count)
        batch.close()

        # delete original file and rename deduped
        if deduped_fh is not None:
            try:
                os.remove(fh_new)
                os.rename(deduped_fh, fh_new)
            except WindowsError:
                raise OverloadError('Unable to manipulate deduped file')

    # validate integrity of processed files for cataloging
    files_out = []
    if agent == 'cat':
        if os.path.isfile(fh_dups):
            files_out.append(fh_dups)
        if os.path.isfile(fh_new):
            files_out.append(fh_new)
        valid, missing_barcodes = validate_processed_files_integrity(
            files_out, BARCODES)
        module_logger.debug(
            'Integrity validation: {}, missing_barcodes: {}'.format(
                valid, missing_barcodes))
        if not valid:
            module_logger.error(
                'Barcodes integrity error: {}'.format(missing_barcodes))

    batch = shelve.open(BATCH_META, writeback=True)
    processing_time = datetime.now() - batch['timestamp']

    module_logger.info(
        'Batch processing stats: system={}, library={}, agent={}, user={}, '
        'used template={}, file count={}, files={}, record count={}, '
        'processing time={}'.format(
            system, library, agent, USER_NAME, template_name, f,
            [os.path.split(file)[1] for file in files], n, processing_time))

    batch['processing_time'] = processing_time
    batch['processed_files'] = f
    batch['processed_bibs'] = n
    if agent == 'cat':
        batch['processed_integrity'] = valid
        batch['missing_barcodes'] = missing_barcodes
    batch.close()
    stats.close()

    # clean-up
    # close any open session if Platform or Sierra API has been used
    if api_type in ('Platform API', 'Sierra API') and session is not None:
        session.close()
        module_logger.debug('Closing API session.')

    if agent == 'cat' and not valid:
        raise OverloadError(
            'Duplicate or missing barcodes found in processed files.')
def read_ftp_content(ftp, host):
    module_logger.debug(
        'Accessing FTP ({}) directory & file listing'.format(host))

    # create a list of directories and files
    ls = []
    try:
        ftp.retrlines('LIST', ls.append)
    except all_errors as e:
        module_logger.error(
            'Unable to retrieve file & directory list on FTP host {}. '
            'Error: {}'.format(host, e))
        raise OverloadError('Encountered error while retrieving\n'
                            'content of the FTP server.')

    # load available FTP parsing methods
    try:
        ftp_settings = open('./rules/ftp_parsing.json', 'r')
        fs = json.load(ftp_settings)
    except ValueError as e:
        module_logger.error(
            'FTP settings JSON file malformed. Error: {}'.format(e))
        raise OverloadError('Unable to access FTP parsing methods')
    finally:
        ftp_settings.close()

    # determine FTP server response parsing
    try:
        m = fs[host]
    except KeyError:
        module_logger.error(
            'Accessing parsing info for unidentified FTP host: {}'.format(
                host))
        raise OverloadError('Unidentified FTP host.')

    if m:
        dirs = []
        files = []
        try:
            for l in ls:
                if l[m['dir_mark'][1]:m['dir_mark'][2] + 1] == \
                        m['dir_mark'][0]:
                    d = l[m['dir_handle']:].strip()
                    dirs.append(d)
                elif l[m['file_mark'][1]:m['file_mark'][2] + 1] == \
                        m['file_mark'][0]:
                    f = l[m['file_handle']:].strip()
                    s = l[m['file_size_pos'][0]:
                          m['file_size_pos'][1] + 1].strip()
                    # timestamp
                    t = l[m['file_time_pos'][0]:
                          m['file_time_pos'][1] + 1].strip()
                    patterns = m['time_patterns']
                    for p in patterns:
                        try:
                            t = convert2date_obj(t, p)
                            break
                        except ValueError:
                            pass
                    files.append((f, s, t))
            return (dirs, files)
        except KeyError as e:
            module_logger.error(
                'FTP parsing settings for {} are malformed. Error: {}'.format(
                    host, e))
            raise OverloadError('FTP parsing settings error.')
        except IndexError as e:
            module_logger.error(
                'FTP parsing settings for {} are incorrect. Error: {}'.format(
                    host, e))
            raise OverloadError('FTP parsing settings error.')
    else:
        module_logger.error(
            'Unable to parse FTP response to LIST cmd on host {}'.format(host))
        raise OverloadError('Unable to parse FTP response.')
def launch_process(
    system,
    library,
    target,
    id_type,
    action,
    source_fh,
    dst_fh,
    progbar,
    hit_counter,
    nohit_counter,
):
    """
    manages retrieval of bibs or bib numbers based on a list of IDs

    args:
        system: str, NYPL or BPL
        library: str, branches or research
        target: dict, keys = name, method
        id_type: str, one of ISBN, ISSN, LCCN, OCLC number, or UPC
        action: str, MARC or bib #
        source_fh: str, path to the source file with IDs
        dst_fh: str, path to destination file
        progbar: tkinter widget
        hit_counter: tkinter IntVar, hits counter
        nohit_counter: tkinter IntVar, failed search counter
    """
    # temp report
    timestamp_start = datetime.now()
    try:
        os.remove(GETBIB_REP)
    except WindowsError:
        pass
    header = None

    # calc progbar maximum and dedup
    ids = []
    dups = set()
    with open(source_fh) as source:
        reader = csv.reader(source)
        # skip header
        reader.next()
        c = 0
        d = 0
        for row in reader:
            rid = row[0].strip()
            if rid:
                c += 1
                if rid in ids:
                    d += 1
                    dups.add(rid)
                else:
                    ids.append(rid)
    progbar["maximum"] = c

    # determine correct matchpoint based on id_type
    if id_type == "ISBN":
        matchpoint = "020"
    elif id_type == "ISSN":
        matchpoint = "022"
    elif id_type == "UPC":
        matchpoint = "024"
    elif id_type == "OCLC #":
        matchpoint = "001"
    else:
        raise OverloadError("Query by {} not yet implemented".format(id_type))

    # determine destination API
    if target["method"] == "Platform API":
        module_logger.debug("Creating Platform API session.")
        try:
            session = open_platform_session(target["name"])
        except OverloadError:
            raise
    elif target["method"] == "Z3950":
        module_logger.debug("retrieving Z3950 settings for {}".format(
            target["name"]))
        user_data = shelve.open(USER_DATA)
        target = user_data["Z3950s"][target["name"]]
        user_data.close()

    for i in ids:
        meta_in = BibOrderMeta(system=system, dstLibrary=library)
        # like vendor meta in PVR
        meta_in.dstLibrary = library
        if id_type == "ISBN":
            meta_in.t020 = [i]
        elif id_type == "ISSN":
            meta_in.t022 = [i]
        elif id_type == "UPC":
            meta_in.t024 = [i]
        elif id_type == "OCLC #":
            meta_in.t001 = i
        module_logger.debug(str(meta_in))

        # query NYPL Platform
        if target["method"] == "Platform API":
            try:
                result = platform_queries_manager(
                    target["method"], session, meta_in, matchpoint)
            except APITokenExpiredError:
                module_logger.info(
                    "Platform token expired. "
                    "Requesting new one and opening new session.")
                session = open_platform_session(target["name"])
                result = platform_queries_manager(
                    target["method"], session, meta_in, matchpoint)

            meta_out = []
            if result[0] == "hit":
                hit_counter.set(hit_counter.get() + 1)
                meta_out = platform2meta(result[1])
            elif result[0] == "nohit":
                nohit_counter.set(nohit_counter.get() + 1)

        elif target["method"] == "Z3950":
            meta_out = []
            status, bibs = z3950_query_manager(target, meta_in, matchpoint)
            if status == "hit":
                hit_counter.set(hit_counter.get() + 1)
                meta_out = bibs2meta(bibs)
            elif status == "nohit":
                nohit_counter.set(nohit_counter.get() + 1)

        if system == "NYPL":
            analysis = PVR_NYPLReport("cat", meta_in, meta_out)
        elif system == "BPL":
            analysis = PVR_BPLReport("cat", meta_in, meta_out)

        module_logger.debug(str(analysis))

        if not header:
            header = analysis.to_dict().keys()
            header.insert(0, "pos")
            save2csv(GETBIB_REP, header)

        if analysis.target_sierraId:
            analysis.target_sierraId = "b{}a".format(analysis.target_sierraId)

        row = analysis.to_dict().values()
        row.insert(0, progbar["value"])
        save2csv(GETBIB_REP, row)

        progbar["value"] += 1
        progbar.update()

    # record data about the batch
    timestamp_end = datetime.now()
    user_data = shelve.open(USER_DATA)
    user_data["getbib_batch"] = {
        "timestamp": timestamp_start,
        "time_elapsed": timestamp_end - timestamp_start,
        "total_queries": c,
        "target": target,
        "hits": hit_counter.get(),
        "misses": nohit_counter.get(),
        "dup_count": d,
        "dups": dups,
    }
    user_data.close()

    progbar["value"] = progbar["maximum"]
def validate_files(system, agent, files, marcval=False, locval=False):
    valid_files = True

    # mandatory, default validation
    try:
        dup_barcodes = default.barcode_duplicates(files, system)
        if dup_barcodes != {}:
            valid_files = False
        default.save_report(dup_barcodes, DVAL_REP)
    except OverloadError as e:
        module_logger.error('Unable to create default validation report. '
                            'Error: {}'.format(e))
        raise OverloadError(e)

    # MARCEdit MARC syntax validation
    if marcval:
        module_logger.debug('Running MARCEdit validation.')
        # make sure MARCEdit is installed on the machine
        val_engine = marcedit.get_engine()
        if val_engine is None:
            # display error message
            raise OverloadError(
                'Failed to locate cmarcedit.exe or marcrules.txt\n'
                'files of MARCEdit program. Unable to complete\n'
                'MARC validation. Please uncheck the box if you\n'
                'still want to proceed.')
        else:
            cme = val_engine[0]
            rules = val_engine[1]
            report_q = MVAL_REP
            overwrite = True
            for file in files:
                file_q = file
                success_process = marcedit.validate(
                    cme, file_q, report_q, rules, overwrite)
                overwrite = False
                if success_process:
                    result = marcedit.validation_check(MVAL_REP)
                    if not result[0]:
                        valid_files = False
                else:
                    valid_files = False
                    raise OverloadError(
                        'Encountered a problem with the file:\n'
                        '{}.\nNot able to validate in MARCEdit'.format(file))

    # delete previous local spec report
    if not remove_files([LSPEC_REP]):
        module_logger.error(
            'Unable to delete previous local spec validation report.')
        raise OverloadError(
            'Unable to remove previous local spec validation report.')

    # local specification validation
    if locval:
        module_logger.debug('Local specs validation launch.')

        # define local specs rules for each system, agent, and vendor
        try:
            rules = './rules/vendor_specs.xml'
            specs = local_specs.local_specs(system, agent, rules)
        except AttributeError as e:
            module_logger.error('Unable to parse local specs rules. '
                                'Error: {}'.format(e))
            raise OverloadError(e)

        # run the local specs validation
        locval_passed, report = local_specs.local_specs_validation(
            system, files, specs)
        if not locval_passed:
            valid_files = False

        # save the report to a file so the last batch is always remembered
        try:
            with open(LSPEC_REP, 'w') as file:
                file.write(report)
        except IOError as e:
            module_logger.error(
                'Encountered error while creating local specs validation '
                'report. Error: {}'.format(e))
            raise OverloadError(
                'Unable to create local spec validation\nreport.')

    return valid_files