def test_01(self):
    """fullfilespec() should prepend the image root plus the path separator."""
    for filespec in ('pep.css', 'IJAPS.016.0181A.FIG002.jpg'):
        fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                            secret=localsecrets.S3_SECRET,
                                            root=localsecrets.IMAGE_SOURCE_PATH)
        spec = fs.fullfilespec(filespec=filespec)
        expected = localsecrets.IMAGE_SOURCE_PATH + localsecrets.PATH_SEPARATOR + filespec
        assert spec == expected, spec
def test_7_get_matching_filenames(self):
    """Exercise get_matching_filelist() date filtering and max_items capping.

    Fix: the original ran the identical max_items=20 call and assertion twice
    (copy-paste duplication); the duplicate is removed.
    """
    pat = r"(.*?)\((bEXP_ARCH1|bSeriesTOC)\)\.(xml|XML)$"
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root=localsecrets.XML_ORIGINALS_PATH)
    root = pathlib.Path(localsecrets.XML_ORIGINALS_PATH)
    testsubpath = "_PEPCurrent/IJP/"
    testfullpath = root / testsubpath
    # cutoff date expected to exclude everything in this subtree
    # NOTE(review): data-dependent — assumes no file revised after 2022-09-04; confirm fixture
    matchlist = fs.get_matching_filelist(path=testfullpath, filespec_regex=pat,
                                         revised_after_date="2022-09-04")
    print(len(matchlist))
    assert (len(matchlist) == 0)
    # unfiltered: assumes the current IJP subtree holds at least 100 build files
    matchlist = fs.get_matching_filelist(path=testfullpath, filespec_regex=pat)
    print(len(matchlist))
    assert (len(matchlist) >= 100)
    # max_items should cap the returned list exactly
    matchlist = fs.get_matching_filelist(path=testfullpath, filespec_regex=pat, max_items=20)
    print(len(matchlist))
    assert (len(matchlist) == 20)
def test_06(self):
    """get_image_filename() should locate a known image under the image root.

    Fixes: removed a dead FlexFileSystem construction that was immediately
    overwritten, and repaired the success message, which was an f-string with
    no placeholder printing the literal text "(unknown)".
    """
    # must be for the image if not the root
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root=localsecrets.IMAGE_SOURCE_PATH)
    document_id = "IJAPS.016.0181A.FIG002.jpg"
    filename = fs.get_image_filename(filespec=document_id, path=localsecrets.IMAGE_SOURCE_PATH)
    if filename is None:
        print(f"file {document_id} doesn't exist")
    else:
        print(f"file {document_id} exists")
    assert (filename is not None)
def test_04(self):
    """find() should locate a known build file under the XML originals root.

    Fix: PEP 8 idiom — identity comparison with None (`is not None`)
    instead of `!= None`; dropped a stray, inaccurate "# create text file"
    comment left over from another test.
    """
    filespec = "PEPTOPAUTHVS.001.0021A(bEXP_ARCH1).XML"
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root=localsecrets.XML_ORIGINALS_PATH)
    filefound = fs.find(filespec)
    assert filefound is not None
def test_02(self):
    """fileinfo() reports the expected basename and filesize for two known files."""
    # XML originals root: a free-subscription test document with a known size
    xml_fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                            secret=localsecrets.S3_SECRET,
                                            root=localsecrets.XML_ORIGINALS_PATH)
    xml_name = "PEPTOPAUTHVS.001.0021A(bEXP_ARCH1).XML"
    info = xml_fs.fileinfo(xml_name, path="_PEPFree/PEPTOPAUTHVS")
    assert info.basename == xml_name
    assert info.filesize == 16719
    # permission problems when trying to open on stage
    img_fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                            secret=localsecrets.S3_SECRET,
                                            root=localsecrets.IMAGE_SOURCE_PATH)
    img_name = r"IJAPS.016.0181A.FIG002.jpg"
    info = img_fs.fileinfo(img_name)
    assert info.basename == img_name, info.basename
    assert info.filesize == 21064, info.filesize
def test_2_exists(self):
    """exists() is truthy for a present image and falsy for a missing one.

    Fix: PEP 8 idiom — assert truthiness directly instead of comparing the
    result with `== True` / `== False`.
    """
    fs = opasFileSupport.FlexFileSystem(root=localsecrets.IMAGE_SOURCE_PATH)
    ret = fs.exists(filespec="IJAPS.016.0181A.FIG002.jpg", path=localsecrets.IMAGE_SOURCE_PATH)
    assert ret
    ret = fs.exists(filespec="IJAPS.016.0181A.FIG002B.jpg", path=localsecrets.IMAGE_SOURCE_PATH)
    assert not ret
def test_3_get_download_filename(self):
    """get_download_filename() returns a path containing the requested PDF name."""
    fs = opasFileSupport.FlexFileSystem(root=localsecrets.PDF_ORIGINALS_PATH)
    pdf_name = "AIM.026.0021A.pdf"
    download_path = fs.get_download_filename(filespec=pdf_name,
                                             path=localsecrets.PDF_ORIGINALS_PATH)
    print(download_path)
    assert (pdf_name in download_path)
def test_1_fetch_file_info(self):
    """find() then fileinfo() reports at least the known size for a test file."""
    # get from s3 if localsecrets set to use it
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root=localsecrets.XML_ORIGINALS_PATH)
    target = "PEPTOPAUTHVS.001.0021A(bEXP_ARCH1).XML"
    found_spec = fs.find(target)
    info = fs.fileinfo(filespec=found_spec)
    assert (info.filesize >= 16719)
def test_0_get_filespec(self):
    """fullfilespec() resolution differs between the S3 and local back ends."""
    # get from s3 if localsecrets set to use it
    if localsecrets.S3_KEY is not None:
        # test AWS
        print("S3 FS tests")
        s3_fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                               secret=localsecrets.S3_SECRET,
                                               root=localsecrets.IMAGE_SOURCE_PATH)
        resolved = s3_fs.fullfilespec(filespec="IJAPS.016.0181A.FIG002.jpg",
                                      path=localsecrets.IMAGE_SOURCE_PATH)
        assert (resolved == 'pep-web-files/doc/g/IJAPS.016.0181A.FIG002.jpg')
    else:
        print("Local FS tests")
        local_fs = opasFileSupport.FlexFileSystem(root=localsecrets.XML_ORIGINALS_PATH)
        # example: fs.fullfilespec(filespec="pep.css", path="embedded-graphics")
        # -> 'pep-graphics/embedded-graphics/pep.css'
        resolved = local_fs.fullfilespec(filespec="IJAPS.016.0181A.FIG002.jpg",
                                         path=localsecrets.IMAGE_SOURCE_PATH)
        # NOTE(review): expected value hard-codes a Windows drive mapping — environment-specific
        assert (resolved == 'X:\\_PEPA1\\g\\IJAPS.016.0181A.FIG002.jpg')
def test_4_get_image_filename(self):
    """get_image_filename() resolves both with and without an explicit path."""
    # must be for the image if not the root
    fs = opasFileSupport.FlexFileSystem(root=localsecrets.IMAGE_SOURCE_PATH)
    image_id = "AIM.036.0275A.FIG001"
    resolved = fs.get_image_filename(filespec=image_id, path=localsecrets.IMAGE_SOURCE_PATH)
    print(resolved)
    assert (image_id in resolved)
    resolved = fs.get_image_filename(filespec=image_id)
    print(resolved)
    assert (image_id in resolved)
def test_0_exists(self):
    """find() should locate a known archive file (transparently local or S3)."""
    # This should work whether on local or S3
    if localsecrets.S3_KEY is not None:
        # test AWS
        print("S3 FS tests")
        fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                            secret=localsecrets.S3_SECRET,
                                            root=localsecrets.XML_ORIGINALS_PATH)
        target = "ADPSA.001.0007A(bEXP_ARCH1).XML"
        located = fs.find(target)
        print(located)
        try:
            assert (target in located)
        except Exception as e:
            # report the failure detail before failing the test
            print(f"Except: {e}")
            assert (False)
def test_5_get_image_len(self):
    """
    >>> fs = FlexFileSystem(key=localsecrets.S3_KEY, secret=localsecrets.S3_SECRET)
    >>> binimg = fs.get_image_binary(filespec="AIM.036.0275A.FIG001", path=localsecrets.IMAGE_SOURCE_PATH)
    >>> len(binimg)
    26038
    """
    # must be for the image if not the root
    fs = opasFileSupport.FlexFileSystem(root=localsecrets.IMAGE_SOURCE_PATH)
    image_id = "AIM.036.0275A.FIG001"
    binary = fs.get_image_binary(filespec=image_id, path=localsecrets.IMAGE_SOURCE_PATH)
    byte_count = len(binary)
    print(byte_count)
    assert (byte_count >= 26038)
def test_03(self):
    """Round-trip create / rename / delete of a scratch text file.

    Fixes: removed the unused `res` binding and replaced `== True` /
    `== False` comparisons with direct truthiness asserts (PEP 8).
    """
    filespec = 'test-file.txt'
    filespec2 = 'test-file.txt'
    # NOTE(review): filespec2 equals filespec, so the rename below is a
    # same-name no-op — probably intended to be a different name; confirm.
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root=localsecrets.XML_ORIGINALS_PATH)
    if fs.exists(filespec):
        # delete in case it exists from a prior (failed) run
        fs.delete(filespec=filespec, path=localsecrets.XML_ORIGINALS_PATH)
    # now create it
    fs.create_text_file(filespec=filespec)
    assert fs.exists(filespec)
    fs.rename(filespec, filespec2)
    assert fs.exists(filespec2)
    fs.delete(filespec=filespec2, path=localsecrets.XML_ORIGINALS_PATH)
    assert not fs.exists(filespec2)
def test_6_get_file_contents(self):
    """
    # left in for an example
    >> fs = FlexFileSystem(key=localsecrets.S3_KEY, secret=localsecrets.S3_SECRET)
    >> file_content = fs.get_file_contents(filespec='pep-web-xml/_PEPArchive/ADPSA/001.1926/ADPSA.001.0007A(bEXP_ARCH1).XML', path=None)
    >> a = len(file_content)
    >> print (a)
    692
    """
    # must be for the image if not the root
    fs = opasFileSupport.FlexFileSystem(root=localsecrets.XML_ORIGINALS_PATH)
    target = "ADPSA.001.0007A(bEXP_ARCH1).XML"
    text = fs.get_file_contents(filespec=target, path=localsecrets.XML_ORIGINALS_PATH)
    text_len = len(text)
    print(text_len)
    assert (text_len >= 691)
def test_05(self):
    """get_download_filename() should return None for a nonexistent PDF.

    Fixes: the success-branch message was an f-string with no placeholder
    printing the literal "(unknown)"; the final `== False` comparison is
    replaced with `not` (PEP 8).
    """
    # important to use this path, not the XML one!
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root=localsecrets.PDF_ORIGINALS_PATH)
    document_id = "RPSA.047.0605B"
    filename = fs.get_download_filename(filespec=document_id,
                                        path=localsecrets.PDF_ORIGINALS_PATH,
                                        year="2001",
                                        ext=".PDF")
    if filename is None:
        print(f"file {document_id} doesn't exist")
    else:
        print(f"file {document_id} exists")
    assert (filename is None)
    # the module-level helper should agree the file is absent
    assert not opasFileSupport.file_exists(document_id, year="2001", ext=".PDF")
def main():
    """Drive a full load of PEP XML build files into Solr and MySQL.

    Reads command-line settings from the module-global `options`, selects the
    set of (bEXP_ARCH1/bSeriesTOC) XML files to process (optionally filtered
    by key, subfolder, or modification date), then for each file: parses it
    with lxml, extracts article metadata, and feeds the docs, authors, and
    glossary Solr cores plus the api_articles and biblioxml database tables.
    Finally writes an issue-updates XML log and prints timing statistics.

    NOTE(review): relies on several module-level names not visible here —
    `options`, `solr_authors2`, `solr_gloss2`, `programNameShort`,
    `FileInfo`, `file_is_same_as_in_solr` — confirm they are defined at
    module scope.
    """
    global options  # so the information can be used in support functions
    cumulative_file_time_start = time.time()
    randomizer_seed = None
    # scriptSourcePath = os.path.dirname(os.path.realpath(__file__))
    processed_files_count = 0
    ocd = opasCentralDBLib.opasCentralDB()
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY, secret=localsecrets.S3_SECRET, root="pep-web-xml")
    # set toplevel logger to specified loglevel
    logger = logging.getLogger()
    logger.setLevel(options.logLevel)
    # get local logger
    logger = logging.getLogger(programNameShort)
    logger.info('Started at %s', datetime.today().strftime('%Y-%m-%d %H:%M:%S"'))
    # logging.basicConfig(filename=logFilename, level=options.logLevel)
    solrurl_docs = None
    #solrurl_refs = None
    solrurl_authors = None
    solrurl_glossary = None
    # Resolve the folder tree to scan: default to the configured XML originals path
    if options.rootFolder == localsecrets.XML_ORIGINALS_PATH or options.rootFolder == None:
        start_folder = pathlib.Path(localsecrets.XML_ORIGINALS_PATH)
    else:
        start_folder = pathlib.Path(options.rootFolder)
    # Announce the run configuration and (unless --no_check) ask for confirmation
    if 1:  # (options.biblio_update or options.fulltext_core_update or options.glossary_core_update) == True:
        try:
            solrurl_docs = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_DOCS  # e.g., http://localhost:8983/solr/ + pepwebdocs'
            solrurl_authors = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_AUTHORS
            solrurl_glossary = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_GLOSSARY
            # print("Logfile: ", logFilename)
            print("Messaging verbose: ", options.display_verbose)
            print("Input data Root: ", start_folder)
            print("Input data Subfolder: ", options.subFolder)
            print("Reset Core Data: ", options.resetCoreData)
            if options.forceRebuildAllFiles == True:
                msg = "Forced Rebuild - All files added, regardless of whether they are the same as in Solr."
                logger.info(msg)
                print(msg)
            print(80 * "*")
            print(f"Database will be updated. Location: {localsecrets.DBHOST}")
            if not options.glossary_only:  # options.fulltext_core_update:
                print("Solr Full-Text Core will be updated: ", solrurl_docs)
                print("Solr Authors Core will be updated: ", solrurl_authors)
            if 1:  # options.glossary_core_update:
                print("Solr Glossary Core will be updated: ", solrurl_glossary)
            print(80 * "*")
            if options.include_paras:
                print("--includeparas option selected. Each paragraph will also be stored individually for *Docs* core. Increases core size markedly!")
            else:
                try:
                    print(f"Paragraphs only stored for sources indicated in loaderConfig. Currently: [{', '.join(loaderConfig.src_codes_to_include_paras)}]")
                except:
                    print("Paragraphs only stored for sources indicated in loaderConfig.")
            if options.halfway:
                print("--halfway option selected. Processing approximately one-half of the files that match.")
            if options.run_in_reverse:
                print("--reverse option selected. Running the files found in reverse order.")
            if options.file_key:
                print(f"--key supplied. Running for files matching the article id {options.file_key}")
            print(80 * "*")
            if not options.no_check:
                cont = input("The above databases will be updated. Do you want to continue (y/n)?")
                if cont.lower() == "n":
                    print("User requested exit. No data changed.")
                    sys.exit(0)
        except Exception as e:
            msg = f"cores specification error ({e})."
            print((len(msg) * "-"))
            print(msg)
            print((len(msg) * "-"))
            sys.exit(0)
    # import data about the PEP codes for journals and books.
    # Codes are like APA, PAH, ... and special codes like ZBK000 for a particular book
    sourceDB = opasCentralDBLib.SourceInfoDB()
    solr_docs2 = None
    # The connection call is to solrpy (import was just solr)
    if localsecrets.SOLRUSER is not None and localsecrets.SOLRPW is not None:
        if 1:  # options.fulltext_core_update:
            solr_docs2 = pysolr.Solr(solrurl_docs, auth=(localsecrets.SOLRUSER, localsecrets.SOLRPW))
    else:
        # no user and password needed
        solr_docs2 = pysolr.Solr(solrurl_docs)
    # Reset core's data if requested (mainly for early development)
    if options.resetCoreData:
        if not options.glossary_only:  # options.fulltext_core_update:
            msg = "*** Deleting all data from the docs and author cores and database tables ***"
            logger.warning(msg)
            print(msg)
            msg2 = "Biblio and Articles table contents will be reset"
            logger.info(msg2)
            print(msg2)
            ocd.delete_all_article_data()
            solr_docs2.delete(q='*:*')
            solr_docs2.commit()
            solr_authors2.delete(q="*:*")
            solr_authors2.commit()
        # reset glossary core when others are reset, or when --resetcore is selected with --glossaryonly
        if 1:  # options.glossary_core_update:
            msg = "*** Deleting all data from the Glossary core ***"
            logger.warning(msg)
            print(msg)
            solr_gloss2.delete(q="*:*")
            solr_gloss2.commit()
    else:
        # check for missing files and delete them from the core, since we didn't empty the core above
        pass
    # Go through a set of XML files
    bib_total_reference_count = 0  # zero this here, it's checked at the end whether references are processed or not
    # ########################################################################
    # Get list of files to process
    # ########################################################################
    new_files = 0
    total_files = 0
    if options.subFolder is not None:
        start_folder = start_folder / pathlib.Path(options.subFolder)
    print(f"Locating files for processing at {start_folder} with pattern {loaderConfig.file_match_pattern}. Started at ({time.ctime()}).")
    if options.file_key is not None:
        # print (f"File Key Specified: {options.file_key}")
        pat = fr"({options.file_key}.*){loaderConfig.file_match_pattern}"
        filenames = fs.get_matching_filelist(filespec_regex=pat, path=start_folder, max_items=1000)
        # NOTE(review): len() never returns None, so this "not found" branch is
        # unreachable — likely meant `if len(filenames) == 0:`; confirm and fix.
        if len(filenames) is None:
            msg = f"File {pat} not found. Exiting."
            logger.warning(msg)
            print(msg)
            exit(0)
        else:
            options.forceRebuildAllFiles = True
    elif options.file_only is not None:
        # single explicit file: wrap it in a FileInfo for the loop below
        fileinfo = FileInfo()
        filespec = options.file_only
        fileinfo.mapLocalFS(filespec)
        filenames = [fileinfo]
    else:
        pat = fr"(.*?){loaderConfig.file_match_pattern}"
        filenames = []
    if filenames != []:
        total_files = len(filenames)
        new_files = len(filenames)
    else:
        # get a list of all the XML files that are new
        if options.forceRebuildAllFiles:
            # get a complete list of filenames for start_folder tree
            filenames = fs.get_matching_filelist(filespec_regex=pat, path=start_folder)
        else:
            filenames = fs.get_matching_filelist(filespec_regex=pat, path=start_folder, revised_after_date=options.created_after)
    print((80 * "-"))
    files_found = len(filenames)
    if options.forceRebuildAllFiles:
        #maybe do this only during core resets?
        #print ("Clearing database tables...")
        #ocd.delete_all_article_data()
        print(f"Ready to import records from {files_found} files at path {start_folder}")
    else:
        print(f"Ready to import {files_found} files *if modified* at path: {start_folder}")
    timeStart = time.time()
    print(f"Processing started at ({time.ctime()}).")
    print((80 * "-"))
    precommit_file_count = 0
    skipped_files = 0
    stop_after = 0
    cumulative_file_time_start = time.time()
    issue_updates = {}
    if files_found > 0:
        if options.halfway:
            stop_after = round(files_found / 2) + 5  # go a bit further
        if options.run_in_reverse:
            filenames.reverse()
        # ----------------------------------------------------------------------
        # Now walk through all the filenames selected
        # ----------------------------------------------------------------------
        print(f"Load process started ({time.ctime()}). Examining files.")
        for n in filenames:
            fileTimeStart = time.time()
            file_updated = False
            if not options.forceRebuildAllFiles:
                # periodic progress messages when not in verbose mode
                if not options.display_verbose and processed_files_count % 100 == 0 and processed_files_count != 0:
                    print(f"Processed Files ...loaded {processed_files_count} out of {files_found} possible.")
                if not options.display_verbose and skipped_files % 100 == 0 and skipped_files != 0:
                    print(f"Skipped {skipped_files} so far...loaded {processed_files_count} out of {files_found} possible.")
                # skip files whose Solr copy is already current
                if file_is_same_as_in_solr(solr_docs2, filename=n.basename, timestamp_str=n.timestamp_str):
                    skipped_files += 1
                    if options.display_verbose:
                        print(f"Skipped - No refresh needed for {n.basename}")
                    continue
                else:
                    file_updated = True
            # get mod date/time, filesize, etc. for mysql database insert/update
            processed_files_count += 1
            if stop_after > 0:
                if processed_files_count > stop_after:
                    print(f"Halfway mark reached on file list ({stop_after})...file processing stopped per halfway option")
                    break
            fileXMLContents = fs.get_file_contents(n.filespec)
            # get file basename without build (which is in paren)
            base = n.basename
            artID = os.path.splitext(base)[0]
            # watch out for comments in file name, like:
            #   JICAP.018.0307A updated but no page breaks (bEXP_ARCH1).XML
            # so skip data after a space
            m = re.match(r"([^ ]*).*\(.*\)", artID)
            # Note: We could also get the artID from the XML, but since it's also important
            # the file names are correct, we'll do it here.  Also, it "could" have been left out
            # of the artinfo (attribute), whereas the filename is always there.
            artID = m.group(1)
            # all IDs to upper case.
            artID = artID.upper()
            msg = "Processing file #%s of %s: %s (%s bytes). Art-ID:%s" % (processed_files_count, files_found, base, n.filesize, artID)
            logger.info(msg)
            if options.display_verbose:
                print(msg)
            # import into lxml
            parser = lxml.etree.XMLParser(encoding='utf-8', recover=True, resolve_entities=True, load_dtd=True)
            root = etree.fromstring(opasxmllib.remove_encoding_string(fileXMLContents), parser)
            pepxml = root
            # save common document (article) field values into artInfo instance for both databases
            artInfo = opasSolrLoadSupport.ArticleInfo(sourceDB.sourceData, pepxml, artID, logger)
            artInfo.filedatetime = n.timestamp_str
            artInfo.filename = base
            artInfo.file_size = n.filesize
            artInfo.file_updated = file_updated
            # not a new journal, see if it's a new article.
            if opasSolrLoadSupport.add_to_tracker_table(ocd, artInfo.art_id):  # if true, added successfully, so new!
                # don't log to issue updates for journals that are new sources added during the annual update
                if artInfo.src_code not in loaderConfig.DATA_UPDATE_PREPUBLICATION_CODES_TO_IGNORE:
                    art = f"<article id='{artInfo.art_id}'>{artInfo.art_citeas_xml}</article>"
                    # EAFP: append to the issue's list, creating it on first use
                    try:
                        issue_updates[artInfo.issue_id_str].append(art)
                    except Exception as e:
                        issue_updates[artInfo.issue_id_str] = [art]
            # classify the file (current/archive/...) from its path
            try:
                artInfo.file_classification = re.search("(?P<class>current|archive|future|free|special|offsite)", str(n.filespec), re.IGNORECASE).group("class")
                # set it to lowercase for ease of matching later
                if artInfo.file_classification is not None:
                    artInfo.file_classification = artInfo.file_classification.lower()
            except Exception as e:
                logger.warning("Could not determine file classification for %s (%s)" % (n.filespec, e))
            # walk through bib section and add to refs core database
            precommit_file_count += 1
            if precommit_file_count > configLib.opasCoreConfig.COMMITLIMIT:
                print(f"Committing info for {configLib.opasCoreConfig.COMMITLIMIT} documents/articles")
            # input to the glossary
            if 1:  # options.glossary_core_update:
                # load the glossary core if this is a glossary item
                glossary_file_pattern = r"ZBK.069(.*)\(bEXP_ARCH1\)\.(xml|XML)$"
                if re.match(glossary_file_pattern, n.basename):
                    opasSolrLoadSupport.process_article_for_glossary_core(pepxml, artInfo, solr_gloss2, fileXMLContents, verbose=options.display_verbose)
            # input to the full-text and authors cores
            if not options.glossary_only:  # options.fulltext_core_update:
                # load the docs (pepwebdocs) core
                opasSolrLoadSupport.process_article_for_doc_core(pepxml, artInfo, solr_docs2, fileXMLContents, include_paras=options.include_paras, verbose=options.display_verbose)
                # load the authors (pepwebauthors) core.
                opasSolrLoadSupport.process_info_for_author_core(pepxml, artInfo, solr_authors2, verbose=options.display_verbose)
                # load the database
                opasSolrLoadSupport.add_article_to_api_articles_table(ocd, artInfo, verbose=options.display_verbose)
                if precommit_file_count > configLib.opasCoreConfig.COMMITLIMIT:
                    precommit_file_count = 0
                    solr_docs2.commit()
                    solr_authors2.commit()
            # input to the references core
            if 1:  # options.biblio_update:
                if artInfo.ref_count > 0:
                    bibReferences = pepxml.xpath("/pepkbd3//be")  # this is the second time we do this (also in artinfo, but not sure or which is better per space vs time considerations)
                    if options.display_verbose:
                        print((" ...Processing %s references for the references database." % (artInfo.ref_count)))
                    #processedFilesCount += 1
                    # NOTE(review): resetting the *total* inside the per-file loop means the
                    # end-of-run summary reports only the last article's reference count —
                    # probably should be initialized once before the loop; confirm intent.
                    bib_total_reference_count = 0
                    ocd.open_connection(caller_name="processBibliographies")
                    for ref in bibReferences:
                        bib_total_reference_count += 1
                        bib_entry = opasSolrLoadSupport.BiblioEntry(artInfo, ref)
                        opasSolrLoadSupport.add_reference_to_biblioxml_table(ocd, artInfo, bib_entry)
                    try:
                        ocd.db.commit()
                    except mysql.connector.Error as e:
                        print("SQL Database -- Biblio Commit failed!", e)
                    ocd.close_connection(caller_name="processBibliographies")
            # close the file, and do the next
            if options.display_verbose:
                print((" ...Time: %s seconds." % (time.time() - fileTimeStart)))
        print(f"Load process complete ({time.ctime()}).")
        if processed_files_count > 0:
            try:
                print("Performing final commit.")
                if not options.glossary_only:  # options.fulltext_core_update:
                    solr_docs2.commit()
                    solr_authors2.commit()
                    # fileTracker.commit()
                if 1:  # options.glossary_core_update:
                    solr_gloss2.commit()
            except Exception as e:
                print(("Exception: ", e))
    else:
        # Use date time as seed, hoping multiple instances don't get here at the same time
        # but only if caller did not specify
        if randomizer_seed is None:
            randomizer_seed = int(datetime.utcnow().timestamp())
    # end of docs, authors, and/or references Adds
    # write updated file
    if issue_updates != {}:
        # now seed randomizer, hopefull all instances have a different seed (or caller must force)
        random.seed(randomizer_seed)
        try:
            # temp exception block just until localsecrets has been updated with DATA_UPDATE_LOG_DIR
            try:
                fname = f"{localsecrets.DATA_UPDATE_LOG_DIR}/updated_issues_{dtime.datetime.now().strftime('%Y%m%d_%H%M%S')}({random.randint(1000,9999)}).xml"
            except Exception as e:
                fname = f"updated_issues_{dtime.datetime.now().strftime('%Y%m%d_%H%M%S')}({random.randint(1000,9999)}).xml"
            print(f"Issue updates. Writing file {fname}")
            with open(fname, 'w', encoding="utf8") as fo:
                fo.write(f'<?xml version="1.0" encoding="UTF-8"?>\n')
                fo.write('<issue_updates>\n')
                for k, a in issue_updates.items():
                    fo.write(f"\n\t<issue>\n\t\t{str(k)}\n\t\t<articles>\n")
                    for ref in a:
                        try:
                            #ref = re.sub(ref, "([Q ])&([ A])", r"\1&\2", flags=re.IGNORECASE)
                            fo.write(f"\t\t\t{ref}\n")
                        except Exception as e:
                            print(f"Issue Update Article Write Error: ({e})")
                    fo.write("\t\t</articles>\n\t</issue>")
                fo.write('\n</issue_updates>')
        except Exception as e:
            print(f"Issue Update File Write Error: ({e})")
    # ---------------------------------------------------------
    # Closing time
    # ---------------------------------------------------------
    timeEnd = time.time()
    #currentfile_info.close()  # for logging
    if 1:  # (options.biblio_update or options.fulltext_core_update) == True:
        elapsed_seconds = timeEnd - cumulative_file_time_start  # actual processing time going through files
        elapsed_minutes = elapsed_seconds / 60
        if bib_total_reference_count > 0:
            msg = f"Finished! Imported {processed_files_count} documents and {bib_total_reference_count} references. Total file inspection/load time: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.) "
            logger.info(msg)
            print(msg)
        else:
            msg = f"Finished! Imported {processed_files_count} documents. Total file load time: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.)"
            logger.info(msg)
            print(msg)
        if processed_files_count > 0:
            msg = f"...Files loaded per Min: {processed_files_count/elapsed_minutes:.4f}"
            logger.info(msg)
            print(msg)
            msg = f"...Files evaluated per Min (includes skipped files): {len(filenames)/elapsed_minutes:.4f}"
            logger.info(msg)
            print(msg)
    elapsed_seconds = timeEnd - timeStart  # actual processing time going through files
    elapsed_minutes = elapsed_seconds / 60
    msg = f"Note: File load time is not total elapsed time. Total elapsed time is: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.)"
    logger.info(msg)
    print(msg)
    if processed_files_count > 0:
        msg = f"Files per elapsed min: {processed_files_count/elapsed_minutes:.4f}"
        logger.info(msg)
        print(msg)
import localsecrets
import opasFileSupport
import time

# Control files dropped into the XML originals tree trigger loader actions.
CONTROL_FILE_PATH = localsecrets.XML_ORIGINALS_PATH
FILE_RUN_FULL_UPDATE = "run-full-update.txt"
FILE_RUN_CURRENT_UPDATE = "run-current-update.txt"
FILE_RUN_FULL_REBUILD = "run-full-rebuild.txt"
FILE_GO_LIVE = "run-send-to-production.txt"
FILE_STOP = "run-stop-monitoring.txt"
INTERVAL = 9
ACTION = 2

# One shared filesystem handle (local or S3, per localsecrets configuration).
flex_fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                         secret=localsecrets.S3_SECRET,
                                         root=CONTROL_FILE_PATH)


def file_exists(document_id, path=CONTROL_FILE_PATH):
    """Return whether `document_id` exists under `path` on the flexible filesystem."""
    return flex_fs.exists(filespec=document_id, path=path)


if file_exists(FILE_RUN_CURRENT_UPDATE, path=CONTROL_FILE_PATH):
    # check current and free folder subtree for new or updated data, process all the way to stage
    print("Run Update (current) and free subtrees")
    # mark the control file as in-progress so a second monitor doesn't re-trigger
    flex_fs.rename(FILE_RUN_CURRENT_UPDATE, FILE_RUN_CURRENT_UPDATE + "-running.txt")
    time.sleep(ACTION)  # Placeholder for action call
def sitemapper(
        path: str = localsecrets.SITEMAP_PATH,  # local path or bucket for AWS
        size: int = 8000,        # records per file
        max_records: int = 200000,  # max records
        clear_sitemap: bool = False):
    """
    ## Function
       ### Generate a Sitemap for Google.

    ## Return Type
       Dictionary or SiteMapInfo model pointing to sitemapindex and list of files, e.g.,
           {
              "siteMapIndex": 'pep-web-google/sitemapindex.xml',
              "siteMapList": [
                   "pep-web-google/sitemap1.xml",
                   "pep-web-google/sitemap2.xml",
                   "pep-web-google/sitemap3.xml",
                   "pep-web-google/sitemap4.xml"
              ]
           }

    >>> ret = sitemapper(size=10, max_records=200)
    >>> ret["siteMapIndexFile"]
    'pep-web-google/sitemapindex.xml'
    """
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root="pep-web-xml")
    import opasSiteMap
    ret_val = {"siteMapIndexFile": "", "siteMapList": []}

    try:
        # base name only; the exporter appends numbering and the .xml extension
        sitemap_base = path + localsecrets.PATH_SEPARATOR + "sitemap"  # don't include xml extension here, it's added
        sitemap_index = path + localsecrets.PATH_SEPARATOR + "sitemapindex.xml"
    except Exception as e:
        raise Exception(f"Error {e}.")

    if clear_sitemap:
        # best-effort removal of prior sitemap files, capped at MAX_FILES_TO_DELETE
        try:
            prior_files = fs.get_matching_filelist(path=path, filespec_regex="sitemap.*", max_items=200)
            for file_number, prior in enumerate(prior_files, start=1):
                if file_number > MAX_FILES_TO_DELETE:  # most files it will delete, just a precaution.
                    break
                fs.delete(filespec=prior.filespec)
                print(f"Deleted prior sitemap file: {prior.filespec}")
        except Exception as e:
            logger.error(f"File cleanup error {e}")

    try:
        # returns a list of the sitemap files (since split)
        sitemap_list = opasSiteMap.metadata_export(sitemap_base,
                                                   total_records=max_records,
                                                   records_per_file=size)
        opasSiteMap.opas_sitemap_index(output_file=sitemap_index, sitemap_list=sitemap_list)
        ret_val["siteMapIndexFile"] = sitemap_index
        ret_val["siteMapList"] = sitemap_list
    except Exception as e:
        ret_val = f"Sitemap Error: {e}"
        logger.error(ret_val)
        raise Exception(ret_val)

    return ret_val