def load_message_table(): ret_val = None fname = "get_user_message" ocd = opasCentralDBLib.opasCentralDB() ocd.open_connection(caller_name=fname) # make sure connection is open sql = f"SELECT * from vw_api_messages;" if ocd.db is not None: with closing(ocd.db.cursor(buffered=True, dictionary=True)) as curs: try: curs.execute(sql) warnings = curs.fetchwarnings() if warnings: for warning in warnings: logger.warning(warning) if curs.rowcount >= 1: try: ret_val = curs.fetchall() except Exception as e: print(f"Error: {e}") ret_val = None except Exception as e: print(f"Error: {e}") ret_val = None ocd.close_connection(caller_name=fname) return ret_val
def main(): # open databases try: stage_ocd = opasCentralDBLib.opasCentralDB(host=STAGE_DB_HOST, password=STAGE_PW, user=STAGE_USER) production_ocd = opasCentralDBLib.opasCentralDB(host=PRODUCTION_DB_HOST, password=PROD_PW, user=PROD_USER) except Exception as e: logger.error(f"Cannot open stage or production databases: {e}. Terminating without changes") else: client_id = "2" if not stage_ocd.open_connection(caller_name="opasPushSettings"): raise Exception(f"Stage DB Connection Error: {STAGE_DB_HOST}") if not production_ocd.open_connection(caller_name="opasPushSettings"): raise Exception(f"Production DB Connection Error: {PRODUCTION_DB_HOST}") for config in CLIENT_CONFIGS: logger.info(f"Reading '{config}' from Staging DB") client_config = stage_ocd.get_client_config(client_id=client_id, client_config_name=config) # get old production config as backup production_config = production_ocd.get_client_config(client_id=client_id, client_config_name=config) if client_config is not None: curr_client_config_list_item = client_config.configList[0] client_config_settings = curr_client_config_list_item.configSettings logger.info(f"{config}: {client_config_settings}") if options.preview_bool: print(f"Preview mode...when non-preview it would copy `{config}` to Production DB") else: try: print(f"Replacing prod {config}:{production_config.configList[0].configSettings}") production_ocd.save_client_config_item(session_id=curr_client_config_list_item.session_id, client_id=curr_client_config_list_item.api_client_id, client_configuration_item=curr_client_config_list_item, replace=True) except Exception as e: logger.error(f"Cannot write to production database {e}. Terminating without changes") else: logger.info(f"Wrote `{config}` to Production DB ({PRODUCTION_DB_HOST})") else: logger.error(f"Cannot find settings for {config} for client: {client_id}.") stage_ocd.close_connection(caller_name="opasPushSettings") production_ocd.close_connection(caller_name="opasPushSettings") print ("Run terminating")
def test_opasdb_getsources(self): ocd = opasCentralDB() sources = ocd.get_sources() assert (sources[0] > 100) sources = ocd.get_sources(src_code="IJP") assert (sources[0] == 1) sources = ocd.get_sources(src_type="journal") assert (sources[0] > 70) sources = ocd.get_sources(src_type="videos") assert (sources[0] >= 12)
def test_opasdb_getsourcetypes(self): ocd = opasCentralDB() sources = ocd.get_sources() assert (sources[0] > 100) # unlike the API higher level function, both of these src_types--videos and stream--return streams on this direct call # because the database vw_api_productbase_instance_counts view only has the stream information. # see testAPIMetadata.test_3B_meta_videostreams to see the difference which is driven by # query parameter streams sources = ocd.get_sources(src_type="videos") assert (sources[0] >= 12) sources = ocd.get_sources(src_type="stream") assert (sources[0] >= 12)
def test_1_testviews(self): ocd = opasCentralDBLib.opasCentralDB() dbok = ocd.open_connection( caller_name="test_views") # make sure connection is open assert (dbok == True) tables = [ "vw_active_sessions", "vw_api_productbase", "vw_api_productbase_instance_counts", "vw_api_sourceinfodb", "vw_api_volume_limits", "vw_instance_counts_books", "vw_instance_counts_src", "vw_latest_session_activity", "vw_reports_document_activity", "vw_reports_document_views", "vw_reports_session_activity", "vw_reports_user_searches", "vw_stat_cited_crosstab", "vw_stat_cited_crosstab_with_details", "vw_stat_cited_in_all_years", "vw_stat_cited_in_last_5_years", "vw_stat_cited_in_last_10_years", "vw_stat_cited_in_last_20_years", "vw_stat_docviews_crosstab", "vw_stat_docviews_last12months", "vw_stat_docviews_lastmonth", "vw_stat_docviews_lastsixmonths", "vw_stat_docviews_lastweek", "vw_stat_most_viewed", "vw_stat_to_update_solr_docviews", # "vw_stat_docviews_lastcalyear" # for now, nothing from last year ] for table in tables: curs = ocd.db.cursor(buffered=True, dictionary=True) sql = f"SELECT * from {table} LIMIT 10;" try: curs.execute(sql) row_count = curs.rowcount print(f"Found {row_count} rows (limit was 10)") sourceData = curs.fetchall() assert (len(sourceData) >= 1) except: print(f"Exception: can't query table {table}") assert (False) ocd.close_connection( caller_name="test_views") # make sure connection is closed
def test_count_open_sessions(self): ocd = opasCentralDB() count = ocd.count_open_sessions() assert (count > 0)
def test_0_art_year(self): ocd = opasCentralDB() year = ocd.get_article_year("FD.026.0007A") assert (year == 2020)
#!/usr/bin/env python # -*- coding: utf-8 -*- import unittest import opasXMLHelper import opasCentralDBLib from unitTestConfig import base_api, base_plus_endpoint_encoded ocd = opasCentralDBLib.opasCentralDB() class TestStandalonePageExtractFunctions(unittest.TestCase): """ Tests Note: tests are performed in alphabetical order, hence the function naming with forced order in the names. """ def test_0a_get_article_first_page_data_html_INCOMPLETE(self): """ Retrieve an article; extract first page """ # This old function wasn't used by the code otherwise so removed this call # it retrieves an article but doesn't include search highlighting. # data = opasAPISupportLib.get_article_data("ANIJP-DE.009.0189A", fields=None) # this newer function includes the search parameters if there were some xmlstr = opasXMLHelper.xml_file_to_xmlstr(r"X:\_PEPA1\_PEPa1v\_PEPArchive\AOP\014\AOP.014.0125A(bEXP_ARCH1).XML", dtd_validations=False) htmlpages = opasXMLHelper.xml_get_pages_html(xmlstr, offset=0, limit=1, inside="div[@id='body']", env="body", pagebrk="div[@class='pagebreak']", pagenbr="p[@class='pagenumber']") extract_size = len(htmlpages) orig_size = len(xmlstr)
def end_test_session(header): from opasCentralDBLib import opasCentralDB ocd = opasCentralDB() ocd.end_session(session_id=headers["client-session"]) print("Session End")
def test_0_min_max_from_db(self): ocd = opasCentralDB() ret_val = ocd.get_min_max_volumes("FD") print (ret_val)
def main(): global options # so the information can be used in support functions cumulative_file_time_start = time.time() randomizer_seed = None # scriptSourcePath = os.path.dirname(os.path.realpath(__file__)) processed_files_count = 0 ocd = opasCentralDBLib.opasCentralDB() fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY, secret=localsecrets.S3_SECRET, root="pep-web-xml") # set toplevel logger to specified loglevel logger = logging.getLogger() logger.setLevel(options.logLevel) # get local logger logger = logging.getLogger(programNameShort) logger.info('Started at %s', datetime.today().strftime('%Y-%m-%d %H:%M:%S"')) # logging.basicConfig(filename=logFilename, level=options.logLevel) solrurl_docs = None #solrurl_refs = None solrurl_authors = None solrurl_glossary = None if options.rootFolder == localsecrets.XML_ORIGINALS_PATH or options.rootFolder == None: start_folder = pathlib.Path(localsecrets.XML_ORIGINALS_PATH) else: start_folder = pathlib.Path(options.rootFolder) if 1: # (options.biblio_update or options.fulltext_core_update or options.glossary_core_update) == True: try: solrurl_docs = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_DOCS # e.g., http://localhost:8983/solr/ + pepwebdocs' solrurl_authors = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_AUTHORS solrurl_glossary = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_GLOSSARY # print("Logfile: ", logFilename) print("Messaging verbose: ", options.display_verbose) print("Input data Root: ", start_folder) print("Input data Subfolder: ", options.subFolder) print("Reset Core Data: ", options.resetCoreData) if options.forceRebuildAllFiles == True: msg = "Forced Rebuild - All files added, regardless of whether they are the same as in Solr." logger.info(msg) print(msg) print(80 * "*") print(f"Database will be updated. Location: {localsecrets.DBHOST}") if not options.glossary_only: # options.fulltext_core_update: print("Solr Full-Text Core will be updated: ", solrurl_docs) print("Solr Authors Core will be updated: ", solrurl_authors) if 1: # options.glossary_core_update: print("Solr Glossary Core will be updated: ", solrurl_glossary) print(80 * "*") if options.include_paras: print( "--includeparas option selected. Each paragraph will also be stored individually for *Docs* core. Increases core size markedly!" ) else: try: print( f"Paragraphs only stored for sources indicated in loaderConfig. Currently: [{', '.join(loaderConfig.src_codes_to_include_paras)}]" ) except: print( "Paragraphs only stored for sources indicated in loaderConfig." ) if options.halfway: print( "--halfway option selected. Processing approximately one-half of the files that match." ) if options.run_in_reverse: print( "--reverse option selected. Running the files found in reverse order." ) if options.file_key: print( f"--key supplied. Running for files matching the article id {options.file_key}" ) print(80 * "*") if not options.no_check: cont = input( "The above databases will be updated. Do you want to continue (y/n)?" ) if cont.lower() == "n": print("User requested exit. No data changed.") sys.exit(0) except Exception as e: msg = f"cores specification error ({e})." print((len(msg) * "-")) print(msg) print((len(msg) * "-")) sys.exit(0) # import data about the PEP codes for journals and books. # Codes are like APA, PAH, ... and special codes like ZBK000 for a particular book sourceDB = opasCentralDBLib.SourceInfoDB() solr_docs2 = None # The connection call is to solrpy (import was just solr) if localsecrets.SOLRUSER is not None and localsecrets.SOLRPW is not None: if 1: # options.fulltext_core_update: solr_docs2 = pysolr.Solr(solrurl_docs, auth=(localsecrets.SOLRUSER, localsecrets.SOLRPW)) else: # no user and password needed solr_docs2 = pysolr.Solr(solrurl_docs) # Reset core's data if requested (mainly for early development) if options.resetCoreData: if not options.glossary_only: # options.fulltext_core_update: msg = "*** Deleting all data from the docs and author cores and database tables ***" logger.warning(msg) print(msg) msg2 = "Biblio and Articles table contents will be reset" logger.info(msg2) print(msg2) ocd.delete_all_article_data() solr_docs2.delete(q='*:*') solr_docs2.commit() solr_authors2.delete(q="*:*") solr_authors2.commit() # reset glossary core when others are reset, or when --resetcore is selected with --glossaryonly if 1: # options.glossary_core_update: msg = "*** Deleting all data from the Glossary core ***" logger.warning(msg) print(msg) solr_gloss2.delete(q="*:*") solr_gloss2.commit() else: # check for missing files and delete them from the core, since we didn't empty the core above pass # Go through a set of XML files bib_total_reference_count = 0 # zero this here, it's checked at the end whether references are processed or not # ######################################################################## # Get list of files to process # ######################################################################## new_files = 0 total_files = 0 if options.subFolder is not None: start_folder = start_folder / pathlib.Path(options.subFolder) print( f"Locating files for processing at {start_folder} with pattern {loaderConfig.file_match_pattern}. Started at ({time.ctime()})." ) if options.file_key is not None: # print (f"File Key Specified: {options.file_key}") pat = fr"({options.file_key}.*){loaderConfig.file_match_pattern}" filenames = fs.get_matching_filelist(filespec_regex=pat, path=start_folder, max_items=1000) if len(filenames) is None: msg = f"File {pat} not found. Exiting." logger.warning(msg) print(msg) exit(0) else: options.forceRebuildAllFiles = True elif options.file_only is not None: fileinfo = FileInfo() filespec = options.file_only fileinfo.mapLocalFS(filespec) filenames = [fileinfo] else: pat = fr"(.*?){loaderConfig.file_match_pattern}" filenames = [] if filenames != []: total_files = len(filenames) new_files = len(filenames) else: # get a list of all the XML files that are new if options.forceRebuildAllFiles: # get a complete list of filenames for start_folder tree filenames = fs.get_matching_filelist(filespec_regex=pat, path=start_folder) else: filenames = fs.get_matching_filelist( filespec_regex=pat, path=start_folder, revised_after_date=options.created_after) print((80 * "-")) files_found = len(filenames) if options.forceRebuildAllFiles: #maybe do this only during core resets? #print ("Clearing database tables...") #ocd.delete_all_article_data() print( f"Ready to import records from {files_found} files at path {start_folder}" ) else: print( f"Ready to import {files_found} files *if modified* at path: {start_folder}" ) timeStart = time.time() print(f"Processing started at ({time.ctime()}).") print((80 * "-")) precommit_file_count = 0 skipped_files = 0 stop_after = 0 cumulative_file_time_start = time.time() issue_updates = {} if files_found > 0: if options.halfway: stop_after = round(files_found / 2) + 5 # go a bit further if options.run_in_reverse: filenames.reverse() # ---------------------------------------------------------------------- # Now walk through all the filenames selected # ---------------------------------------------------------------------- print(f"Load process started ({time.ctime()}). Examining files.") for n in filenames: fileTimeStart = time.time() file_updated = False if not options.forceRebuildAllFiles: if not options.display_verbose and processed_files_count % 100 == 0 and processed_files_count != 0: print( f"Processed Files ...loaded {processed_files_count} out of {files_found} possible." ) if not options.display_verbose and skipped_files % 100 == 0 and skipped_files != 0: print( f"Skipped {skipped_files} so far...loaded {processed_files_count} out of {files_found} possible." ) if file_is_same_as_in_solr(solr_docs2, filename=n.basename, timestamp_str=n.timestamp_str): skipped_files += 1 if options.display_verbose: print(f"Skipped - No refresh needed for {n.basename}") continue else: file_updated = True # get mod date/time, filesize, etc. for mysql database insert/update processed_files_count += 1 if stop_after > 0: if processed_files_count > stop_after: print( f"Halfway mark reached on file list ({stop_after})...file processing stopped per halfway option" ) break fileXMLContents = fs.get_file_contents(n.filespec) # get file basename without build (which is in paren) base = n.basename artID = os.path.splitext(base)[0] # watch out for comments in file name, like: # JICAP.018.0307A updated but no page breaks (bEXP_ARCH1).XML # so skip data after a space m = re.match(r"([^ ]*).*\(.*\)", artID) # Note: We could also get the artID from the XML, but since it's also important # the file names are correct, we'll do it here. Also, it "could" have been left out # of the artinfo (attribute), whereas the filename is always there. artID = m.group(1) # all IDs to upper case. artID = artID.upper() msg = "Processing file #%s of %s: %s (%s bytes). Art-ID:%s" % ( processed_files_count, files_found, base, n.filesize, artID) logger.info(msg) if options.display_verbose: print(msg) # import into lxml parser = lxml.etree.XMLParser(encoding='utf-8', recover=True, resolve_entities=True, load_dtd=True) root = etree.fromstring( opasxmllib.remove_encoding_string(fileXMLContents), parser) pepxml = root # save common document (article) field values into artInfo instance for both databases artInfo = opasSolrLoadSupport.ArticleInfo(sourceDB.sourceData, pepxml, artID, logger) artInfo.filedatetime = n.timestamp_str artInfo.filename = base artInfo.file_size = n.filesize artInfo.file_updated = file_updated # not a new journal, see if it's a new article. if opasSolrLoadSupport.add_to_tracker_table( ocd, artInfo.art_id): # if true, added successfully, so new! # don't log to issue updates for journals that are new sources added during the annual update if artInfo.src_code not in loaderConfig.DATA_UPDATE_PREPUBLICATION_CODES_TO_IGNORE: art = f"<article id='{artInfo.art_id}'>{artInfo.art_citeas_xml}</article>" try: issue_updates[artInfo.issue_id_str].append(art) except Exception as e: issue_updates[artInfo.issue_id_str] = [art] try: artInfo.file_classification = re.search( "(?P<class>current|archive|future|free|special|offsite)", str(n.filespec), re.IGNORECASE).group("class") # set it to lowercase for ease of matching later if artInfo.file_classification is not None: artInfo.file_classification = artInfo.file_classification.lower( ) except Exception as e: logger.warning( "Could not determine file classification for %s (%s)" % (n.filespec, e)) # walk through bib section and add to refs core database precommit_file_count += 1 if precommit_file_count > configLib.opasCoreConfig.COMMITLIMIT: print( f"Committing info for {configLib.opasCoreConfig.COMMITLIMIT} documents/articles" ) # input to the glossary if 1: # options.glossary_core_update: # load the glossary core if this is a glossary item glossary_file_pattern = r"ZBK.069(.*)\(bEXP_ARCH1\)\.(xml|XML)$" if re.match(glossary_file_pattern, n.basename): opasSolrLoadSupport.process_article_for_glossary_core( pepxml, artInfo, solr_gloss2, fileXMLContents, verbose=options.display_verbose) # input to the full-text and authors cores if not options.glossary_only: # options.fulltext_core_update: # load the docs (pepwebdocs) core opasSolrLoadSupport.process_article_for_doc_core( pepxml, artInfo, solr_docs2, fileXMLContents, include_paras=options.include_paras, verbose=options.display_verbose) # load the authors (pepwebauthors) core. opasSolrLoadSupport.process_info_for_author_core( pepxml, artInfo, solr_authors2, verbose=options.display_verbose) # load the database opasSolrLoadSupport.add_article_to_api_articles_table( ocd, artInfo, verbose=options.display_verbose) if precommit_file_count > configLib.opasCoreConfig.COMMITLIMIT: precommit_file_count = 0 solr_docs2.commit() solr_authors2.commit() # input to the references core if 1: # options.biblio_update: if artInfo.ref_count > 0: bibReferences = pepxml.xpath( "/pepkbd3//be" ) # this is the second time we do this (also in artinfo, but not sure or which is better per space vs time considerations) if options.display_verbose: print(( " ...Processing %s references for the references database." % (artInfo.ref_count))) #processedFilesCount += 1 bib_total_reference_count = 0 ocd.open_connection(caller_name="processBibliographies") for ref in bibReferences: bib_total_reference_count += 1 bib_entry = opasSolrLoadSupport.BiblioEntry( artInfo, ref) opasSolrLoadSupport.add_reference_to_biblioxml_table( ocd, artInfo, bib_entry) try: ocd.db.commit() except mysql.connector.Error as e: print("SQL Database -- Biblio Commit failed!", e) ocd.close_connection(caller_name="processBibliographies") # close the file, and do the next if options.display_verbose: print((" ...Time: %s seconds." % (time.time() - fileTimeStart))) print(f"Load process complete ({time.ctime()}).") if processed_files_count > 0: try: print("Performing final commit.") if not options.glossary_only: # options.fulltext_core_update: solr_docs2.commit() solr_authors2.commit() # fileTracker.commit() if 1: # options.glossary_core_update: solr_gloss2.commit() except Exception as e: print(("Exception: ", e)) else: # Use date time as seed, hoping multiple instances don't get here at the same time # but only if caller did not specify if randomizer_seed is None: randomizer_seed = int(datetime.utcnow().timestamp()) # end of docs, authors, and/or references Adds # write updated file if issue_updates != {}: # now seed randomizer, hopefull all instances have a different seed (or caller must force) random.seed(randomizer_seed) try: # temp exception block just until localsecrets has been updated with DATA_UPDATE_LOG_DIR try: fname = f"{localsecrets.DATA_UPDATE_LOG_DIR}/updated_issues_{dtime.datetime.now().strftime('%Y%m%d_%H%M%S')}({random.randint(1000,9999)}).xml" except Exception as e: fname = f"updated_issues_{dtime.datetime.now().strftime('%Y%m%d_%H%M%S')}({random.randint(1000,9999)}).xml" print(f"Issue updates. Writing file {fname}") with open(fname, 'w', encoding="utf8") as fo: fo.write(f'<?xml version="1.0" encoding="UTF-8"?>\n') fo.write('<issue_updates>\n') for k, a in issue_updates.items(): fo.write(f"\n\t<issue>\n\t\t{str(k)}\n\t\t<articles>\n") for ref in a: try: #ref = re.sub(ref, "([Q ])&([ A])", r"\1&\2", flags=re.IGNORECASE) fo.write(f"\t\t\t{ref}\n") except Exception as e: print(f"Issue Update Article Write Error: ({e})") fo.write("\t\t</articles>\n\t</issue>") fo.write('\n</issue_updates>') except Exception as e: print(f"Issue Update File Write Error: ({e})") # --------------------------------------------------------- # Closing time # --------------------------------------------------------- timeEnd = time.time() #currentfile_info.close() # for logging if 1: # (options.biblio_update or options.fulltext_core_update) == True: elapsed_seconds = timeEnd - cumulative_file_time_start # actual processing time going through files elapsed_minutes = elapsed_seconds / 60 if bib_total_reference_count > 0: msg = f"Finished! Imported {processed_files_count} documents and {bib_total_reference_count} references. Total file inspection/load time: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.) " logger.info(msg) print(msg) else: msg = f"Finished! Imported {processed_files_count} documents. Total file load time: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.)" logger.info(msg) print(msg) if processed_files_count > 0: msg = f"...Files loaded per Min: {processed_files_count/elapsed_minutes:.4f}" logger.info(msg) print(msg) msg = f"...Files evaluated per Min (includes skipped files): {len(filenames)/elapsed_minutes:.4f}" logger.info(msg) print(msg) elapsed_seconds = timeEnd - timeStart # actual processing time going through files elapsed_minutes = elapsed_seconds / 60 msg = f"Note: File load time is not total elapsed time. Total elapsed time is: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.)" logger.info(msg) print(msg) if processed_files_count > 0: msg = f"Files per elapsed min: {processed_files_count/elapsed_minutes:.4f}" logger.info(msg) print(msg)