コード例 #1
0
def processAllRecordsSTR(pathToMemex):
    """OCR the publications that have a ``.bib`` file under ``pathToMemex``.

    The ``.bib`` files are collected with ``functions.dicOfRelevantFiles``
    (a mapping of citation key -> file path), visited in random order, and
    each one is loaded with ``functions.loadBib``. A record whose
    ``pagetotal`` field is a number larger than ``settings["page_limit"]``
    is skipped; every other record is handed to ``ocrPublication``.
    Afterwards ``functions.memexStatusUpdates`` is called once for each of
    the ``.pdf``, ``.bib``, ``.png`` and ``.json`` extensions.

    Args:
        pathToMemex: Root folder that is searched for ``.bib`` files.
    """
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    citeKeys = list(files.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        print(citeKey)
        bibData = functions.loadBib(files[citeKey])
        # loadBib's result is indexed by citation key, so the bibliographic
        # fields (including "pagetotal") live one level down, on the record.
        bibRecord = bibData[citeKey]

        if "pagetotal" in bibRecord:
            try:
                pageTotal = int(bibRecord["pagetotal"])
            except ValueError:
                # A non-numeric page count is treated like a missing one:
                # the record is still passed on together with the limit.
                pageTotal = None
            if pageTotal is not None and pageTotal > int(settings["page_limit"]):
                continue

        language = functions.identifyLanguage(bibRecord, "eng")
        ocrPublication(citeKey, language, settings["page_limit"])

    # NOTE(review): the status updates use settings["path_to_memex"], not the
    # pathToMemex argument — confirm the two are always the same folder.
    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
コード例 #2
0
def processAllRecords(bibDataFile):
    """Process and OCR every record of a bibliography file in random order.

    The file is loaded with ``functions.loadBib``. For each record,
    ``functions.processBibRecord`` is called with the memex path from
    ``settings``, a language is obtained from ``functions.identifyLanguage``
    (second argument ``"eng"``), and ``ocrPublication`` is run for the
    record's ``"rCite"`` value. Finally ``functions.memexStatusUpdates`` is
    called for the ``.pdf``, ``.bib``, ``.png`` and ``.json`` extensions.

    Args:
        bibDataFile: Bibliography file passed to ``functions.loadBib``.
    """
    records = functions.loadBib(bibDataFile)
    shuffledKeys = list(records.keys())
    random.shuffle(shuffledKeys)

    for recordKey in shuffledKeys:
        record = records[recordKey]
        functions.processBibRecord(settings["path_to_memex"], record)

        citeKey = record["rCite"]
        # NOTE(review): identifyLanguage receives the record's "rCite" value
        # here rather than the record itself — confirm that is what the
        # helper expects.
        ocrLanguage = functions.identifyLanguage(citeKey, "eng")
        ocrPublication(citeKey, ocrLanguage)

    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)
コード例 #3
0
def processAllRecords(bibDataFile):
    """Process and OCR every record of a bibliography file in random order.

    The file is loaded with ``functions.loadBib`` and the shuffled list of
    its keys is printed. For each record, ``functions.processBibRecord`` is
    called with the memex path from ``settings``, a language is obtained
    from ``functions.identifyLanguage`` (second argument ``"eng"``), and
    ``ocrPublication`` is run for the record's ``"rCite"`` value.

    Args:
        bibDataFile: Bibliography file passed to ``functions.loadBib``.
    """
    bibData = functions.loadBib(bibDataFile)

    # Visit the records in random order.
    keys = list(bibData.keys())
    random.shuffle(keys)
    print(keys)

    for key in keys:
        bibRecord = bibData[key]
        functions.processBibRecord(settings["path_to_memex"], bibRecord)
        language = functions.identifyLanguage(bibRecord, "eng")
        ocrPublication(bibRecord["rCite"], language)
コード例 #4
0
ファイル: 2_OCR.py プロジェクト: Smrcekd/Memex_Sandbox_G
def processAllRecords(bibDataFile):
    """Process and OCR every record of a bibliography file in random order.

    The file is loaded with ``functions.loadBib``. For each record,
    ``functions.processBibRecord`` is called with the memex path from
    ``settings``, a language is obtained from ``functions.identifyLanguage``
    (second argument ``"eng"``), and ``ocrPublication`` is run with the
    record's ``"rCite"`` value, that language and ``settings["page_limit"]``
    converted to ``int``. Finally ``functions.memexStatusUpdates`` is called
    for the ``.pdf``, ``.bib``, ``.png`` and ``.json`` extensions.

    Args:
        bibDataFile: Bibliography file passed to ``functions.loadBib``.
    """
    records = functions.loadBib(bibDataFile)

    # Shuffle so the records are not always visited in the same order.
    shuffledKeys = list(records.keys())
    random.shuffle(shuffledKeys)

    for recordKey in shuffledKeys:
        record = records[recordKey]
        functions.processBibRecord(settings["path_to_memex"], record)

        citeKey = record["rCite"]
        # NOTE(review): identifyLanguage receives the record's "rCite" value
        # here rather than the record itself — confirm that is what the
        # helper expects.
        ocrLanguage = functions.identifyLanguage(citeKey, "eng")
        ocrPublication(citeKey, ocrLanguage, int(settings["page_limit"]))

    for extension in (".pdf", ".bib", ".png", ".json"):
        functions.memexStatusUpdates(settings["path_to_memex"], extension)