def test_01(self):
        filespec = 'pep.css'
        fs = opasFileSupport.FlexFileSystem(
            key=localsecrets.S3_KEY,
            secret=localsecrets.S3_SECRET,
            root=localsecrets.IMAGE_SOURCE_PATH)
        spec = fs.fullfilespec(filespec=filespec)
        assert spec == localsecrets.IMAGE_SOURCE_PATH + localsecrets.PATH_SEPARATOR + filespec, spec

        filespec = 'IJAPS.016.0181A.FIG002.jpg'
        fs = opasFileSupport.FlexFileSystem(
            key=localsecrets.S3_KEY,
            secret=localsecrets.S3_SECRET,
            root=localsecrets.IMAGE_SOURCE_PATH)
        spec = fs.fullfilespec(filespec=filespec)
        assert spec == localsecrets.IMAGE_SOURCE_PATH + localsecrets.PATH_SEPARATOR + filespec, spec
Example #2
    def test_7_get_matching_filenames(self):

        pat = r"(.*?)\((bEXP_ARCH1|bSeriesTOC)\)\.(xml|XML)$"
        fs = opasFileSupport.FlexFileSystem(
            key=localsecrets.S3_KEY,
            secret=localsecrets.S3_SECRET,
            root=localsecrets.XML_ORIGINALS_PATH)

        root = pathlib.Path(localsecrets.XML_ORIGINALS_PATH)
        testsubpath = "_PEPCurrent/IJP/"
        testfullpath = root / testsubpath

        matchlist = fs.get_matching_filelist(path=testfullpath,
                                             filespec_regex=pat,
                                             revised_after_date="2022-09-04")
        print(len(matchlist))
        assert (len(matchlist) == 0)

        matchlist = fs.get_matching_filelist(path=testfullpath,
                                             filespec_regex=pat)
        print(len(matchlist))
        assert (len(matchlist) >= 100)

        matchlist = fs.get_matching_filelist(path=testfullpath,
                                             filespec_regex=pat,
                                             max_items=20)
        print(len(matchlist))
        assert (len(matchlist) == 20)

        matchlist = fs.get_matching_filelist(path=testfullpath,
                                             filespec_regex=pat,
                                             max_items=20)
        print(len(matchlist))
        assert (len(matchlist) == 20)
    def test_06(self):
        fs = opasFileSupport.FlexFileSystem(
            root=localsecrets.IMAGE_SOURCE_PATH
        )  # must be for the image if not the root
        fs = opasFileSupport.FlexFileSystem(
            key=localsecrets.S3_KEY,
            secret=localsecrets.S3_SECRET,
            root=localsecrets.IMAGE_SOURCE_PATH)
        document_id = "IJAPS.016.0181A.FIG002.jpg"
        filename = fs.get_image_filename(filespec=document_id,
                                         path=localsecrets.IMAGE_SOURCE_PATH)
        if filename is None:
            print(f"file {document_id} doesn't exist")
        else:
            print(f"file {filename} exists")

        assert (filename is not None)
 def test_04(self):
     filespec = "PEPTOPAUTHVS.001.0021A(bEXP_ARCH1).XML"
     # create text file
     fs = opasFileSupport.FlexFileSystem(
         key=localsecrets.S3_KEY,
         secret=localsecrets.S3_SECRET,
         root=localsecrets.XML_ORIGINALS_PATH)
     filefound = fs.find(filespec)
     assert filefound is not None
 def test_02(self):
     fs = opasFileSupport.FlexFileSystem(
         key=localsecrets.S3_KEY,
         secret=localsecrets.S3_SECRET,
         root=localsecrets.XML_ORIGINALS_PATH)
     filespec = "PEPTOPAUTHVS.001.0021A(bEXP_ARCH1).XML"
     fileinfo = fs.fileinfo(filespec, path="_PEPFree/PEPTOPAUTHVS")
     assert fileinfo.basename == filespec
     assert fileinfo.filesize == 16719
     #  permission problems when trying to open on stage
     fs = opasFileSupport.FlexFileSystem(
         key=localsecrets.S3_KEY,
         secret=localsecrets.S3_SECRET,
         root=localsecrets.IMAGE_SOURCE_PATH)
     filespec = r"IJAPS.016.0181A.FIG002.jpg"
     fileinfo = fs.fileinfo(filespec)
     assert fileinfo.basename == filespec, fileinfo.basename
     assert fileinfo.filesize == 21064, fileinfo.filesize
Example #6
 def test_2_exists(self):
     fs = opasFileSupport.FlexFileSystem(
         root=localsecrets.IMAGE_SOURCE_PATH)
     ret = fs.exists(filespec="IJAPS.016.0181A.FIG002.jpg",
                     path=localsecrets.IMAGE_SOURCE_PATH)
     assert (ret == True)
     ret = fs.exists(filespec="IJAPS.016.0181A.FIG002B.jpg",
                     path=localsecrets.IMAGE_SOURCE_PATH)
     assert (ret == False)
Example #7
 def test_3_get_download_filename(self):
     """
     """
     fs = opasFileSupport.FlexFileSystem(
         root=localsecrets.PDF_ORIGINALS_PATH)
     filespec = "AIM.026.0021A.pdf"
     ret = fs.get_download_filename(filespec=filespec,
                                    path=localsecrets.PDF_ORIGINALS_PATH)
     print(ret)
     assert (filespec in ret)
Example #8
    def test_1_fetch_file_info(self):
        # get from s3 if localsecrets set to use it
        fs = opasFileSupport.FlexFileSystem(
            key=localsecrets.S3_KEY,
            secret=localsecrets.S3_SECRET,
            root=localsecrets.XML_ORIGINALS_PATH)

        filename = "PEPTOPAUTHVS.001.0021A(bEXP_ARCH1).XML"
        filespec = fs.find(filename)
        ret = fs.fileinfo(filespec=filespec)
        assert (ret.filesize >= 16719)
Example #9
 def test_0_get_filespec(self):
     # get from s3 if localsecrets set to use it
     if localsecrets.S3_KEY is not None:  #  test AWS
         print("S3 FS tests")
         fs = opasFileSupport.FlexFileSystem(
             key=localsecrets.S3_KEY,
             secret=localsecrets.S3_SECRET,
             root=localsecrets.IMAGE_SOURCE_PATH)
         ret = fs.fullfilespec(filespec="IJAPS.016.0181A.FIG002.jpg",
                               path=localsecrets.IMAGE_SOURCE_PATH)
         assert (ret == 'pep-web-files/doc/g/IJAPS.016.0181A.FIG002.jpg')
     else:
         print("Local FS tests")
         fs = opasFileSupport.FlexFileSystem(
             root=localsecrets.XML_ORIGINALS_PATH)
         # >>> fs.fullfilespec(filespec="pep.css", path="embedded-graphics")
         # 'pep-graphics/embedded-graphics/pep.css'
         ret = fs.fullfilespec(filespec="IJAPS.016.0181A.FIG002.jpg",
                               path=localsecrets.IMAGE_SOURCE_PATH)
         assert (ret == 'X:\\_PEPA1\\g\\IJAPS.016.0181A.FIG002.jpg')
Example #10
 def test_4_get_image_filename(self):
     """
     """
     fs = opasFileSupport.FlexFileSystem(
         root=localsecrets.IMAGE_SOURCE_PATH
     )  # must be for the image if not the root
     filespec = "AIM.036.0275A.FIG001"
     ret = fs.get_image_filename(filespec=filespec,
                                 path=localsecrets.IMAGE_SOURCE_PATH)
     print(ret)
     assert (filespec in ret)
     ret = fs.get_image_filename(filespec=filespec)
     print(ret)
     assert (filespec in ret)
Example #11
 def test_0_exists(self):
     # This should work whether on local or S3
     if localsecrets.S3_KEY is not None:  #  test AWS
         print("S3 FS tests")
     fs = opasFileSupport.FlexFileSystem(
         key=localsecrets.S3_KEY,
         secret=localsecrets.S3_SECRET,
         root=localsecrets.XML_ORIGINALS_PATH)
     filename = "ADPSA.001.0007A(bEXP_ARCH1).XML"
     ret_val = fs.find(filename)
     print(ret_val)
     try:
         assert (filename in ret_val)
     except Exception as e:
         print(f"Except: {e}")
         assert (False)
Example #12
 def test_5_get_image_len(self):
     """
     >>> fs = FlexFileSystem(key=localsecrets.S3_KEY, secret=localsecrets.S3_SECRET)
     >>> binimg = fs.get_image_binary(filespec="AIM.036.0275A.FIG001", path=localsecrets.IMAGE_SOURCE_PATH)
     >>> len(binimg)
     26038
     
     """
     fs = opasFileSupport.FlexFileSystem(
         root=localsecrets.IMAGE_SOURCE_PATH
     )  # must be for the image if not the root
     filespec = "AIM.036.0275A.FIG001"
     img_bin = fs.get_image_binary(filespec=filespec,
                                   path=localsecrets.IMAGE_SOURCE_PATH)
     image_len = len(img_bin)
     print(image_len)
     assert (image_len >= 26038)
 def test_03(self):
     filespec = 'test-file.txt'
     filespec2 = 'test-file2.txt'  # distinct name so the rename below is actually exercised
     # create text file
     fs = opasFileSupport.FlexFileSystem(
         key=localsecrets.S3_KEY,
         secret=localsecrets.S3_SECRET,
         root=localsecrets.XML_ORIGINALS_PATH)
     if fs.exists(filespec):
         # delete in case it exists
         res = fs.delete(filespec=filespec,
                         path=localsecrets.XML_ORIGINALS_PATH)
     # now create it
     fs.create_text_file(filespec=filespec)
     assert fs.exists(filespec) == True
     fs.rename(filespec, filespec2)
     assert fs.exists(filespec2) == True
     fs.delete(filespec=filespec2, path=localsecrets.XML_ORIGINALS_PATH)
     assert fs.exists(filespec2) == False
Example #14
 def test_6_get_file_contents(self):
     """
     # left in for an example
     >> fs = FlexFileSystem(key=localsecrets.S3_KEY, secret=localsecrets.S3_SECRET)
     >> file_content = fs.get_file_contents(filespec='pep-web-xml/_PEPArchive/ADPSA/001.1926/ADPSA.001.0007A(bEXP_ARCH1).XML', path=None)
     >> a = len(file_content)
     >> print (a)
     692
     
     """
     fs = opasFileSupport.FlexFileSystem(
         root=localsecrets.XML_ORIGINALS_PATH
     )  # must be for the image if not the root
     filespec = "ADPSA.001.0007A(bEXP_ARCH1).XML"
     content = fs.get_file_contents(filespec=filespec,
                                    path=localsecrets.XML_ORIGINALS_PATH)
     content_len = len(content)
     print(content_len)
     assert (content_len >= 691)
    def test_05(self):
        fs = opasFileSupport.FlexFileSystem(
            key=localsecrets.S3_KEY,
            secret=localsecrets.S3_SECRET,
            root=localsecrets.PDF_ORIGINALS_PATH
        )  # important to use this path, not the XML one!
        document_id = "RPSA.047.0605B"
        filename = fs.get_download_filename(
            filespec=document_id,
            path=localsecrets.PDF_ORIGINALS_PATH,
            year="2001",
            ext=".PDF")
        if filename is None:
            print(f"file {document_id} doesn't exist")
        else:
            print(f"file {filename} exists")

        assert (filename is None)
        assert (opasFileSupport.file_exists(document_id,
                                            year="2001",
                                            ext=".PDF") == False)
Example #16
def main():

    global options  # so the information can be used in support functions

    cumulative_file_time_start = time.time()
    randomizer_seed = None

    # scriptSourcePath = os.path.dirname(os.path.realpath(__file__))

    processed_files_count = 0
    ocd = opasCentralDBLib.opasCentralDB()
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root="pep-web-xml")

    # set toplevel logger to specified loglevel
    logger = logging.getLogger()
    logger.setLevel(options.logLevel)
    # get local logger
    logger = logging.getLogger(programNameShort)

    logger.info('Started at %s',
                datetime.today().strftime('%Y-%m-%d %H:%M:%S'))
    # logging.basicConfig(filename=logFilename, level=options.logLevel)

    solrurl_docs = None
    #solrurl_refs = None
    solrurl_authors = None
    solrurl_glossary = None
    if options.rootFolder == localsecrets.XML_ORIGINALS_PATH or options.rootFolder is None:
        start_folder = pathlib.Path(localsecrets.XML_ORIGINALS_PATH)
    else:
        start_folder = pathlib.Path(options.rootFolder)

    if 1:  # (options.biblio_update or options.fulltext_core_update or options.glossary_core_update) == True:
        try:
            solrurl_docs = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_DOCS  # e.g., http://localhost:8983/solr/    + pepwebdocs'
            solrurl_authors = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_AUTHORS
            solrurl_glossary = localsecrets.SOLRURL + configLib.opasCoreConfig.SOLR_GLOSSARY
            # print("Logfile: ", logFilename)
            print("Messaging verbose: ", options.display_verbose)
            print("Input data Root: ", start_folder)
            print("Input data Subfolder: ", options.subFolder)
            print("Reset Core Data: ", options.resetCoreData)
            if options.forceRebuildAllFiles == True:
                msg = "Forced Rebuild - All files added, regardless of whether they are the same as in Solr."
                logger.info(msg)
                print(msg)

            print(80 * "*")
            print(f"Database will be updated. Location: {localsecrets.DBHOST}")
            if not options.glossary_only:  # options.fulltext_core_update:
                print("Solr Full-Text Core will be updated: ", solrurl_docs)
                print("Solr Authors Core will be updated: ", solrurl_authors)
            if 1:  # options.glossary_core_update:
                print("Solr Glossary Core will be updated: ", solrurl_glossary)

            print(80 * "*")
            if options.include_paras:
                print(
                    "--includeparas option selected. Each paragraph will also be stored individually for *Docs* core. Increases core size markedly!"
                )
            else:
                try:
                    print(
                        f"Paragraphs only stored for sources indicated in loaderConfig. Currently: [{', '.join(loaderConfig.src_codes_to_include_paras)}]"
                    )
                except Exception:
                    print(
                        "Paragraphs only stored for sources indicated in loaderConfig."
                    )

            if options.halfway:
                print(
                    "--halfway option selected.  Processing approximately one-half of the files that match."
                )

            if options.run_in_reverse:
                print(
                    "--reverse option selected.  Running the files found in reverse order."
                )

            if options.file_key:
                print(
                    f"--key supplied.  Running for files matching the article id {options.file_key}"
                )

            print(80 * "*")
            if not options.no_check:
                cont = input(
                    "The above databases will be updated.  Do you want to continue (y/n)?"
                )
                if cont.lower() == "n":
                    print("User requested exit.  No data changed.")
                    sys.exit(0)

        except Exception as e:
            msg = f"cores specification error ({e})."
            print((len(msg) * "-"))
            print(msg)
            print((len(msg) * "-"))
            sys.exit(0)

    # import data about the PEP codes for journals and books.
    #  Codes are like APA, PAH, ... and special codes like ZBK000 for a particular book
    sourceDB = opasCentralDBLib.SourceInfoDB()
    solr_docs2 = None
    # The connection call is to solrpy (import was just solr)
    if localsecrets.SOLRUSER is not None and localsecrets.SOLRPW is not None:
        if 1:  # options.fulltext_core_update:
            solr_docs2 = pysolr.Solr(solrurl_docs,
                                     auth=(localsecrets.SOLRUSER,
                                           localsecrets.SOLRPW))
    else:  #  no user and password needed
        solr_docs2 = pysolr.Solr(solrurl_docs)

    # Reset core's data if requested (mainly for early development)
    if options.resetCoreData:
        if not options.glossary_only:  # options.fulltext_core_update:
            msg = "*** Deleting all data from the docs and author cores and database tables ***"
            logger.warning(msg)
            print(msg)
            msg2 = "Biblio and Articles table contents will be reset"
            logger.info(msg2)
            print(msg2)
            ocd.delete_all_article_data()
            solr_docs2.delete(q='*:*')
            solr_docs2.commit()
            solr_authors2.delete(q="*:*")
            solr_authors2.commit()

        # reset glossary core when others are reset, or when --resetcore is selected with --glossaryonly
        if 1:  # options.glossary_core_update:
            msg = "*** Deleting all data from the Glossary core ***"
            logger.warning(msg)
            print(msg)
            solr_gloss2.delete(q="*:*")
            solr_gloss2.commit()
    else:
        # check for missing files and delete them from the core, since we didn't empty the core above
        pass

    # Go through a set of XML files
    bib_total_reference_count = 0  # zero this here, it's checked at the end whether references are processed or not

    # ########################################################################
    # Get list of files to process
    # ########################################################################
    new_files = 0
    total_files = 0

    if options.subFolder is not None:
        start_folder = start_folder / pathlib.Path(options.subFolder)

    print(
        f"Locating files for processing at {start_folder} with pattern {loaderConfig.file_match_pattern}. Started at ({time.ctime()})."
    )
    if options.file_key is not None:
        # print (f"File Key Specified: {options.file_key}")
        pat = fr"({options.file_key}.*){loaderConfig.file_match_pattern}"
        filenames = fs.get_matching_filelist(filespec_regex=pat,
                                             path=start_folder,
                                             max_items=1000)
        if len(filenames) == 0:
            msg = f"File {pat} not found.  Exiting."
            logger.warning(msg)
            print(msg)
            exit(0)
        else:
            options.forceRebuildAllFiles = True
    elif options.file_only is not None:
        fileinfo = FileInfo()
        filespec = options.file_only
        fileinfo.mapLocalFS(filespec)
        filenames = [fileinfo]
    else:
        pat = fr"(.*?){loaderConfig.file_match_pattern}"
        filenames = []

    if filenames != []:
        total_files = len(filenames)
        new_files = len(filenames)
    else:
        # get a list of all the XML files that are new
        if options.forceRebuildAllFiles:
            # get a complete list of filenames for start_folder tree
            filenames = fs.get_matching_filelist(filespec_regex=pat,
                                                 path=start_folder)
        else:
            filenames = fs.get_matching_filelist(
                filespec_regex=pat,
                path=start_folder,
                revised_after_date=options.created_after)

    print((80 * "-"))
    files_found = len(filenames)
    if options.forceRebuildAllFiles:
        #maybe do this only during core resets?
        #print ("Clearing database tables...")
        #ocd.delete_all_article_data()
        print(
            f"Ready to import records from {files_found} files at path {start_folder}"
        )
    else:
        print(
            f"Ready to import {files_found} files *if modified* at path: {start_folder}"
        )

    timeStart = time.time()
    print(f"Processing started at ({time.ctime()}).")

    print((80 * "-"))
    precommit_file_count = 0
    skipped_files = 0
    stop_after = 0
    cumulative_file_time_start = time.time()
    issue_updates = {}
    if files_found > 0:
        if options.halfway:
            stop_after = round(files_found / 2) + 5  # go a bit further

        if options.run_in_reverse:
            filenames.reverse()

        # ----------------------------------------------------------------------
        # Now walk through all the filenames selected
        # ----------------------------------------------------------------------
        print(f"Load process started ({time.ctime()}).  Examining files.")

        for n in filenames:
            fileTimeStart = time.time()
            file_updated = False
            if not options.forceRebuildAllFiles:
                if not options.display_verbose and processed_files_count % 100 == 0 and processed_files_count != 0:
                    print(
                        f"Processed Files ...loaded {processed_files_count} out of {files_found} possible."
                    )

                if not options.display_verbose and skipped_files % 100 == 0 and skipped_files != 0:
                    print(
                        f"Skipped {skipped_files} so far...loaded {processed_files_count} out of {files_found} possible."
                    )

                if file_is_same_as_in_solr(solr_docs2,
                                           filename=n.basename,
                                           timestamp_str=n.timestamp_str):
                    skipped_files += 1
                    if options.display_verbose:
                        print(f"Skipped - No refresh needed for {n.basename}")
                    continue
                else:
                    file_updated = True

            # get mod date/time, filesize, etc. for mysql database insert/update
            processed_files_count += 1
            if stop_after > 0:
                if processed_files_count > stop_after:
                    print(
                        f"Halfway mark reached on file list ({stop_after})...file processing stopped per halfway option"
                    )
                    break

            fileXMLContents = fs.get_file_contents(n.filespec)

            # get file basename without build (which is in paren)
            base = n.basename
            artID = os.path.splitext(base)[0]
            # watch out for comments in file name, like:
            #   JICAP.018.0307A updated but no page breaks (bEXP_ARCH1).XML
            #   so skip data after a space
            m = re.match(r"([^ ]*).*\(.*\)", artID)
            # Note: We could also get the artID from the XML, but since it's also important
            # that the file names are correct, we'll do it here.  Also, it "could" have been left out
            # of the artinfo (attribute), whereas the filename is always there.
            artID = m.group(1)
            # all IDs to upper case.
            artID = artID.upper()
            msg = "Processing file #%s of %s: %s (%s bytes). Art-ID:%s" % (
                processed_files_count, files_found, base, n.filesize, artID)
            logger.info(msg)
            if options.display_verbose:
                print(msg)

            # import into lxml
            parser = lxml.etree.XMLParser(encoding='utf-8',
                                          recover=True,
                                          resolve_entities=True,
                                          load_dtd=True)
            root = etree.fromstring(
                opasxmllib.remove_encoding_string(fileXMLContents), parser)
            pepxml = root

            # save common document (article) field values into artInfo instance for both databases
            artInfo = opasSolrLoadSupport.ArticleInfo(sourceDB.sourceData,
                                                      pepxml, artID, logger)
            artInfo.filedatetime = n.timestamp_str
            artInfo.filename = base
            artInfo.file_size = n.filesize
            artInfo.file_updated = file_updated
            # not a new journal, see if it's a new article.
            if opasSolrLoadSupport.add_to_tracker_table(
                    ocd,
                    artInfo.art_id):  # if true, added successfully, so new!
                # don't log to issue updates for journals that are new sources added during the annual update
                if artInfo.src_code not in loaderConfig.DATA_UPDATE_PREPUBLICATION_CODES_TO_IGNORE:
                    art = f"<article id='{artInfo.art_id}'>{artInfo.art_citeas_xml}</article>"
                    try:
                        issue_updates[artInfo.issue_id_str].append(art)
                    except Exception as e:
                        issue_updates[artInfo.issue_id_str] = [art]

            try:
                artInfo.file_classification = re.search(
                    "(?P<class>current|archive|future|free|special|offsite)",
                    str(n.filespec), re.IGNORECASE).group("class")
                # set it to lowercase for ease of matching later
                if artInfo.file_classification is not None:
                    artInfo.file_classification = artInfo.file_classification.lower(
                    )
            except Exception as e:
                logger.warning(
                    "Could not determine file classification for %s (%s)" %
                    (n.filespec, e))

            # walk through bib section and add to refs core database

            precommit_file_count += 1
            if precommit_file_count > configLib.opasCoreConfig.COMMITLIMIT:
                print(
                    f"Committing info for {configLib.opasCoreConfig.COMMITLIMIT} documents/articles"
                )

            # input to the glossary
            if 1:  # options.glossary_core_update:
                # load the glossary core if this is a glossary item
                glossary_file_pattern = r"ZBK.069(.*)\(bEXP_ARCH1\)\.(xml|XML)$"
                if re.match(glossary_file_pattern, n.basename):
                    opasSolrLoadSupport.process_article_for_glossary_core(
                        pepxml,
                        artInfo,
                        solr_gloss2,
                        fileXMLContents,
                        verbose=options.display_verbose)

            # input to the full-text and authors cores
            if not options.glossary_only:  # options.fulltext_core_update:
                # load the docs (pepwebdocs) core
                opasSolrLoadSupport.process_article_for_doc_core(
                    pepxml,
                    artInfo,
                    solr_docs2,
                    fileXMLContents,
                    include_paras=options.include_paras,
                    verbose=options.display_verbose)
                # load the authors (pepwebauthors) core.
                opasSolrLoadSupport.process_info_for_author_core(
                    pepxml,
                    artInfo,
                    solr_authors2,
                    verbose=options.display_verbose)
                # load the database
                opasSolrLoadSupport.add_article_to_api_articles_table(
                    ocd, artInfo, verbose=options.display_verbose)

                if precommit_file_count > configLib.opasCoreConfig.COMMITLIMIT:
                    precommit_file_count = 0
                    solr_docs2.commit()
                    solr_authors2.commit()

            # input to the references core
            if 1:  # options.biblio_update:
                if artInfo.ref_count > 0:
                    bibReferences = pepxml.xpath(
                        "/pepkbd3//be"
                    )  # this is the second time we do this (also in artinfo, but not sure which is better per space vs. time considerations)
                    if options.display_verbose:
                        print((
                            "   ...Processing %s references for the references database."
                            % (artInfo.ref_count)))

                    #processedFilesCount += 1
                    bib_total_reference_count = 0
                    ocd.open_connection(caller_name="processBibliographies")
                    for ref in bibReferences:
                        bib_total_reference_count += 1
                        bib_entry = opasSolrLoadSupport.BiblioEntry(
                            artInfo, ref)
                        opasSolrLoadSupport.add_reference_to_biblioxml_table(
                            ocd, artInfo, bib_entry)

                    try:
                        ocd.db.commit()
                    except mysql.connector.Error as e:
                        print("SQL Database -- Biblio Commit failed!", e)

                    ocd.close_connection(caller_name="processBibliographies")

            # close the file, and do the next
            if options.display_verbose:
                print(("   ...Time: %s seconds." %
                       (time.time() - fileTimeStart)))

        print(f"Load process complete ({time.ctime()}).")
        if processed_files_count > 0:
            try:
                print("Performing final commit.")
                if not options.glossary_only:  # options.fulltext_core_update:
                    solr_docs2.commit()
                    solr_authors2.commit()
                    # fileTracker.commit()
                if 1:  # options.glossary_core_update:
                    solr_gloss2.commit()
            except Exception as e:
                print(("Exception: ", e))
            else:
                # Use date time as seed, hoping multiple instances don't get here at the same time
                # but only if caller did not specify
                if randomizer_seed is None:
                    randomizer_seed = int(datetime.utcnow().timestamp())

    # end of docs, authors, and/or references Adds

    # write updated file
    if issue_updates != {}:
        # now seed randomizer; hopefully all instances have a different seed (or the caller must force one)
        random.seed(randomizer_seed)
        try:
            # temp exception block just until localsecrets has been updated with DATA_UPDATE_LOG_DIR
            try:
                fname = f"{localsecrets.DATA_UPDATE_LOG_DIR}/updated_issues_{dtime.datetime.now().strftime('%Y%m%d_%H%M%S')}({random.randint(1000,9999)}).xml"
            except Exception as e:
                fname = f"updated_issues_{dtime.datetime.now().strftime('%Y%m%d_%H%M%S')}({random.randint(1000,9999)}).xml"

            print(f"Issue updates.  Writing file {fname}")
            with open(fname, 'w', encoding="utf8") as fo:
                fo.write(f'<?xml version="1.0" encoding="UTF-8"?>\n')
                fo.write('<issue_updates>\n')
                for k, a in issue_updates.items():
                    fo.write(f"\n\t<issue>\n\t\t{str(k)}\n\t\t<articles>\n")
                    for ref in a:
                        try:
                            #ref = re.sub(ref, "([Q ])&([ A])", r"\1&amp;\2", flags=re.IGNORECASE)
                            fo.write(f"\t\t\t{ref}\n")
                        except Exception as e:
                            print(f"Issue Update Article Write Error: ({e})")

                    fo.write("\t\t</articles>\n\t</issue>")
                fo.write('\n</issue_updates>')

        except Exception as e:
            print(f"Issue Update File Write Error: ({e})")

    # ---------------------------------------------------------
    # Closing time
    # ---------------------------------------------------------
    timeEnd = time.time()
    #currentfile_info.close()

    # for logging
    if 1:  # (options.biblio_update or options.fulltext_core_update) == True:
        elapsed_seconds = timeEnd - cumulative_file_time_start  # actual processing time going through files
        elapsed_minutes = elapsed_seconds / 60
        if bib_total_reference_count > 0:
            msg = f"Finished! Imported {processed_files_count} documents and {bib_total_reference_count} references. Total file inspection/load time: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.) "
            logger.info(msg)
            print(msg)
        else:
            msg = f"Finished! Imported {processed_files_count} documents. Total file load time: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.)"
            logger.info(msg)
            print(msg)
        if processed_files_count > 0:
            msg = f"...Files loaded per Min: {processed_files_count/elapsed_minutes:.4f}"
            logger.info(msg)
            print(msg)
            msg = f"...Files evaluated per Min (includes skipped files): {len(filenames)/elapsed_minutes:.4f}"
            logger.info(msg)
            print(msg)

    elapsed_seconds = timeEnd - timeStart  # actual processing time going through files
    elapsed_minutes = elapsed_seconds / 60
    msg = f"Note: File load time is not total elapsed time. Total elapsed time is: {elapsed_seconds:.2f} secs ({elapsed_minutes:.2f} minutes.)"
    logger.info(msg)
    print(msg)
    if processed_files_count > 0:
        msg = f"Files per elapsed min: {processed_files_count/elapsed_minutes:.4f}"
        logger.info(msg)
        print(msg)
import localsecrets
import opasFileSupport
import time
CONTROL_FILE_PATH = localsecrets.XML_ORIGINALS_PATH

FILE_RUN_FULL_UPDATE = "run-full-update.txt"
FILE_RUN_CURRENT_UPDATE = "run-current-update.txt"
FILE_RUN_FULL_REBUILD = "run-full-rebuild.txt"
FILE_GO_LIVE = "run-send-to-production.txt"
FILE_STOP = "run-stop-monitoring.txt"
INTERVAL = 9  # presumably the polling interval in seconds (not used in this excerpt)
ACTION = 2  # placeholder duration (seconds) passed to time.sleep in place of the real action

flex_fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                         secret=localsecrets.S3_SECRET,
                                         root=CONTROL_FILE_PATH)


def file_exists(document_id, path=CONTROL_FILE_PATH):
    ret_val = flex_fs.exists(filespec=document_id, path=path)
    return ret_val


if file_exists(FILE_RUN_CURRENT_UPDATE, path=CONTROL_FILE_PATH):
    # check current and free folder subtree for new or updated data, process all the way to stage
    print("Run Update (current) and free subtrees")
    flex_fs.rename(FILE_RUN_CURRENT_UPDATE,
                   FILE_RUN_CURRENT_UPDATE + "-running.txt")

    time.sleep(ACTION)  # Placeholder for action call
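
The excerpt above handles only the FILE_RUN_CURRENT_UPDATE control file, once. Below is a minimal, hypothetical sketch of how the monitor could poll for all of the control files in a loop, reusing the file_exists helper, the flex_fs.rename "-running.txt" convention, and the INTERVAL/ACTION constants defined above; the monitor_control_files name and the loop structure are assumptions, not part of the original script.

def monitor_control_files():
    # Hypothetical polling loop (assumption, not in the original excerpt):
    # repeatedly check for the control files and stop when FILE_STOP appears.
    while True:
        if file_exists(FILE_STOP, path=CONTROL_FILE_PATH):
            print("Stop file found; ending monitoring")
            break
        for control_file in (FILE_RUN_FULL_UPDATE,
                             FILE_RUN_CURRENT_UPDATE,
                             FILE_RUN_FULL_REBUILD,
                             FILE_GO_LIVE):
            if file_exists(control_file, path=CONTROL_FILE_PATH):
                print(f"Found control file: {control_file}")
                # mark the file as in progress so a second monitor won't pick it up
                flex_fs.rename(control_file, control_file + "-running.txt")
                time.sleep(ACTION)  # placeholder for the real action call
        time.sleep(INTERVAL)  # wait before polling again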
Example #18
def sitemapper(
        path: str = localsecrets.SITEMAP_PATH,  # local path or bucket for AWS
        size: int = 8000,  # records per file
        max_records: int = 200000,  # max records
        clear_sitemap: bool = False):
    """
    ## Function
       ### Generate a Sitemap for Google.

    ## Return Type
       Dictionary or SiteMapInfo model pointing to sitemapindex and list of files,
       e.g.,
       
            {
               "siteMapIndex": 'pep-web-google/sitemapindex.xml',
               "siteMapList": [
                 "pep-web-google/sitemap1.xml",
                 "pep-web-google/sitemap2.xml",
                 "pep-web-google/sitemap3.xml",
                 "pep-web-google/sitemap4.xml"
               ]
            }

    >>> ret = sitemapper(size=10, max_records=200)
    >>> ret["siteMapIndexFile"]
    'pep-web-google/sitemapindex.xml'
  
    """
    fs = opasFileSupport.FlexFileSystem(key=localsecrets.S3_KEY,
                                        secret=localsecrets.S3_SECRET,
                                        root="pep-web-xml")
    import opasSiteMap
    ret_val = {"siteMapIndexFile": "", "siteMapList": []}

    try:
        SITEMAP_OUTPUT_FILE = path + localsecrets.PATH_SEPARATOR + "sitemap"  # don't include xml extension here, it's added
        SITEMAP_INDEX_FILE = path + localsecrets.PATH_SEPARATOR + "sitemapindex.xml"
    except Exception as e:
        raise Exception(f"Error {e}.")

    if clear_sitemap:
        try:
            matchlist = fs.get_matching_filelist(path=path,
                                                 filespec_regex="sitemap.*",
                                                 max_items=200)
            count = 0
            for n in matchlist:
                count += 1
                if count > MAX_FILES_TO_DELETE:  # max number of files it will delete; just a precaution
                    break
                else:
                    fs.delete(filespec=n.filespec)
                    print(f"Deleted prior sitemap file: {n.filespec}")
        except Exception as e:
            logger.error(f"File cleanup error {e}")

    try:
        # returns a list of the sitemap files (since split)
        sitemap_list = opasSiteMap.metadata_export(SITEMAP_OUTPUT_FILE,
                                                   total_records=max_records,
                                                   records_per_file=size)
        opasSiteMap.opas_sitemap_index(output_file=SITEMAP_INDEX_FILE,
                                       sitemap_list=sitemap_list)
        ret_val["siteMapIndexFile"] = SITEMAP_INDEX_FILE
        ret_val["siteMapList"] = sitemap_list

    except Exception as e:
        ret_val = f"Sitemap Error: {e}"
        logger.error(ret_val)
        raise Exception(ret_val)

    return ret_val
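
A brief usage sketch (an assumption, not taken from the original source): calling sitemapper with clear_sitemap=True removes any prior sitemap* files before regenerating, and the returned dictionary carries the index file and per-file list described in the docstring. The parameter values below are illustrative only.

if __name__ == "__main__":
    # Hypothetical invocation: regenerate the sitemap from scratch.
    result = sitemapper(size=8000, max_records=200000, clear_sitemap=True)
    print("Sitemap index:", result["siteMapIndexFile"])
    print(f"{len(result['siteMapList'])} sitemap files written")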