Exemple #1
0
def ocrPublication(pathToMemex, citationKey, language):
    """OCR a publication's PDF page by page and cache the text as JSON.

    For every page the binarized image is written to
    ``<publPath>/pages/NNNN.png`` and the recognized text is stored under the
    key ``"NNNN"`` in ``<publPath>/<citationKey>.json``. If that JSON file
    already exists, the publication is treated as processed and nothing is done.

    Args:
        pathToMemex: root folder of the memex; passed to
            ``functions.generatePublPath``.
        citationKey: citation key of the publication; also the base name of
            the PDF and JSON files.
        language: language code handed to ``pytesseract.image_to_string``.
    """
    # generate necessary paths
    publPath = functions.generatePublPath(pathToMemex, citationKey)
    pdfFile = os.path.join(publPath, citationKey + ".pdf")
    jsonFile = os.path.join(publPath, citationKey + ".json")  # OCR results will be saved here
    saveToPath = os.path.join(publPath, "pages")  # we will save processed images here

    # check first whether this publication has been already processed, so that
    # no temporary PDF is generated for nothing
    if os.path.isfile(jsonFile):
        print("\t>>> %s has already been OCR-ed..." % citationKey)
        return

    # generate CLEAN pdf (necessary if you added highlights and comments to your PDFs)
    # NOTE(review): removeCommentsFromPDF is defined elsewhere; it is assumed to
    # return the path of a temporary copy that is safe to delete afterwards.
    pdfFileTemp = removeCommentsFromPDF(pdfFile)
    try:
        # let's make sure that saveToPath also exists
        if not os.path.exists(saveToPath):
            os.makedirs(saveToPath)

        # start processing images and extracting text
        print("\t>>> OCR-ing: %s" % citationKey)

        textResults = {}
        images = pdf2image.convert_from_path(pdfFileTemp)
        pageTotal = len(images)
        for pageCount, image in enumerate(images, start=1):
            image = image.convert('1')  # binarizes image, reducing its size
            finalPath = os.path.join(saveToPath, "%04d.png" % pageCount)
            image.save(finalPath, optimize=True, quality=10)

            # the text is extracted from the binarized image
            text = pytesseract.image_to_string(image, lang=language)
            textResults["%04d" % pageCount] = text

            # report progress to the terminal
            print("\t\t%04d/%04d pages" % (pageCount, pageTotal))

        with open(jsonFile, 'w', encoding='utf8') as f9:
            json.dump(textResults, f9, sort_keys=True, indent=4, ensure_ascii=False)
    finally:
        # delete the temporary pdf file even when conversion or OCR failed
        os.remove(pdfFileTemp)
Exemple #2
0
def createIndex(pathToMemex):
    """Write ``searchesInterface.html`` listing all publications and searches.

    Bibliographic records are loaded from ``settings["bib_all"]``; one table
    row is generated per record, the rows are sorted, and the result is merged
    into the index template together with the output of ``formatSearches``.

    Args:
        pathToMemex: folder the HTML file is written to; also passed to
            ``formatSearches``.
    """
    bibData = functions.loadBib(settings["bib_all"])
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # The href value is quoted so that paths containing spaces still produce
    # a working link.
    entryTemplate = (
        '<tr><td><li><a href="@PATHTOPUBL@/pages/DETAILS.html">'
        "[@CITEKEY@]</a></td><td> @AUTHOR@</td> <td>(@DATE@)</td>"
        " - <td><i>@TITLE@</i></td></li></tr>"
    )
    # placeholder -> bib field; filled in this order, "MISSING" when absent
    fieldPlaceholders = (
        ("@AUTHOR@", "author"),
        ("@DATE@", "year"),
        ("@TITLE@", "title"),
    )

    completeList = []
    for k, v in bibData.items():
        # NOTE(review): uses the module-level memexPath rather than the
        # pathToMemex argument - confirm both always refer to the same folder.
        path = functions.generatePublPath(memexPath, k)
        entry = entryTemplate.replace("@PATHTOPUBL@", path)
        entry = entry.replace("@CITEKEY@", k)
        for placeholder, field in fieldPlaceholders:
            entry = entry.replace(placeholder, v.get(field, "MISSING"))
        completeList.append(entry)

    content = "\n<ul>\n%s\n</ul>" % "\n".join(sorted(completeList))
    # strip the curly brackets coming from the BibTeX values
    content = content.replace("{", "").replace("}", "")
    toc = formatSearches(pathToMemex)

    template = template.replace("@SEARCHES@", toc)
    template = template.replace(
        "@PUBLICATIONS@", publTemplate.replace("@TABLECONTENTS@", content))
    with open(os.path.join(pathToMemex, "searchesInterface.html"),
              "w",
              encoding="utf8") as f9:
        f9.write(template)
Exemple #3
0
def generateMemexStartingPages(pathToMemex):
    """Generate ``index.html`` and ``contents.html`` in the memex folder.

    ``index.html`` is the index template with ``settings["content_index"]``
    inserted. ``contents.html`` lists every record found in any ``.bib`` file
    below ``pathToMemex``, sorted, with a link to its DETAILS page.

    Args:
        pathToMemex: root folder of the memex; it is walked for ``.bib`` files
            and both HTML files are written into it.

    Raises:
        KeyError: if a record has no ``title`` field.
    """
    # load index template
    with open(settings["template_index"], "r", encoding="utf8") as ft:
        template = ft.read()

    # add index.html
    with open(settings["content_index"], "r", encoding="utf8") as fi:
        indexData = fi.read()
    with open(os.path.join(pathToMemex, "index.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", indexData))

    # load bibliographical data for processing
    publicationDic = {}  # key = citationKey; value = recordDic

    for subdir, _dirs, files in os.walk(pathToMemex):
        for file in files:
            if file.endswith(".bib"):
                pathWhereBibIs = os.path.join(subdir, file)
                publicationDic.update(functions.loadBib(pathWhereBibIs))

    # generate data for the main CONTENTS
    singleItemTemplate = '<li><a href="@RELATIVEPATH@/pages/DETAILS.html">[@CITATIONKEY@]</a> @AUTHOROREDITOR@ (@DATE@) - <i>@TITLE@</i></li>'
    contentsList = []

    for citeKey, bibRecord in publicationDic.items():
        relativePath = functions.generatePublPath(pathToMemex, citeKey).replace(pathToMemex, "")

        # author takes precedence over editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        # only the first four characters of the date are shown; a record
        # without a date gets a placeholder instead of raising KeyError
        if "date" in bibRecord:
            date = bibRecord["date"][:4]
        else:
            print("nodate")
            date = "nodate"

        title = bibRecord["title"]

        # forming a record
        recordToAdd = singleItemTemplate
        recordToAdd = recordToAdd.replace("@RELATIVEPATH@", relativePath)
        recordToAdd = recordToAdd.replace("@CITATIONKEY@", citeKey)
        recordToAdd = recordToAdd.replace("@AUTHOROREDITOR@", authorOrEditor)
        recordToAdd = recordToAdd.replace("@DATE@", date)
        recordToAdd = recordToAdd.replace("@TITLE@", title)

        # strip the curly brackets coming from the BibTeX values
        recordToAdd = recordToAdd.replace("{", "").replace("}", "")

        contentsList.append(recordToAdd)

    contents = "\n<ul>\n%s\n</ul>" % "\n".join(sorted(contentsList))
    mainContent = "<h1>CONTENTS of MEMEX</h1>\n\n" + contents

    # save the CONTENTS page
    with open(os.path.join(pathToMemex, "contents.html"), "w", encoding="utf8") as f9:
        f9.write(template.replace("@MAINCONTENT@", mainContent))
Exemple #4
0
def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row linking to a publication or one of its pages.

    Args:
        bibTexCode: citation key; used to locate ``<key>.bib`` and as part of
            the ``data-order`` sort key.
        pageVal: ``0`` to link to the DETAILS page of the publication;
            otherwise a page number, in which case the link targets the page
            five pages earlier (but never before page 1).
        distance: numeric value rendered with ``%f`` in the second cell.

    Returns:
        The ``<tr>...</tr>`` string for this item.

    Raises:
        KeyError: if the record lacks a ``date`` or ``title`` field.
    """
    pathToPubl = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # author takes precedence over editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    reference = "%s (%s). <i>%s</i>" % (author, bib["date"][:4], bib["title"])
    # ASCII-folded copy of the reference for the hidden search text. It is
    # decoded back to str: formatting the bytes object directly would embed
    # its repr (b'...') in the page.
    search = unicodedata.normalize('NFKD', reference).encode('ascii', 'ignore').decode('ascii')
    search = " <div class='hidden'>%s</div>" % search

    relativeRoot = pathToPubl.replace(settings["path_to_memex"], "../../../../")
    if pageVal == 0:  # link to the start of the publication
        htmlLink = os.path.join(relativeRoot, "pages", "DETAILS.html")
        page = ""
        startPage = 0
    else:
        # clamp to the first page; pageVal below 5 would otherwise produce a
        # zero or negative page number
        startPage = max(1, pageVal - 5)
        endPage = pageVal
        htmlLink = os.path.join(relativeRoot, "pages", "%04d.html" % startPage)
        # NOTE(review): the trailing </i></a> has no matching opening tags in
        # this string; kept as in the original output.
        page = ", pdfPp. %d-%d</i></a>" % (startPage, endPage)
    htmlLink = "<a href='%s'><i>read</i></a>" % (htmlLink)

    publicationInfo = reference + page + search
    # strip the curly brackets coming from the BibTeX values
    publicationInfo = publicationInfo.replace("{", "").replace("}", "")
    singleItemTemplate = '<tr><td>%s</td><td>%f</td><td data-order="%s%05d">%s</td></tr>' % (htmlLink, distance, bibTexCode, startPage, publicationInfo)

    return singleItemTemplate
Exemple #5
0
def checkPageNumbers(bib, bibTexCode, startPage):
    """Translate a PDF page index into the page number given in the bib record.

    Args:
        bib: bibliographic record (dict); its optional ``pages`` field is
            expected to look like ``first--last`` after
            ``functions.prettifyBib``; a single page number is also accepted.
        bibTexCode: citation key, used to locate ``<key>.json`` with the OCR
            results.
        startPage: page index inside the PDF.

    Returns:
        ``startPage`` unchanged when the record has no ``pages`` field.
        Otherwise the index shifted by the first page of the range. When the
        OCR results hold more pages than the range covers, the shift is one
        smaller, and index 1 yields the string ``"TITLE"``.

    Raises:
        ValueError: if a part of the ``pages`` field is not an integer.
    """
    if "pages" not in bib:
        return startPage

    bibPages = functions.prettifyBib(bib["pages"])
    bibPagesList = [int(i) for i in bibPages.split("--")]
    firstPage = bibPagesList[0]
    # a single-page entry has no second element; treat it as a one-page range
    lastPage = bibPagesList[1] if len(bibPagesList) > 1 else firstPage

    pathToPubl = functions.generatePublPath(memexPath, bibTexCode)
    jsonFile = os.path.join(pathToPubl, "%s.json" % bibTexCode)

    # the OCR results are written as UTF-8 with ensure_ascii=False, so the
    # encoding must be given explicitly instead of relying on the locale
    with open(jsonFile, encoding="utf8") as jsonData:
        ocred = json.load(jsonData)

    # more OCRed pages than the range covers: the first PDF page is presumably
    # an extra (title) page - TODO confirm
    if len(ocred) > (lastPage - firstPage + 1):
        if startPage == 1:
            return "TITLE"
        return startPage + firstPage - 2
    return startPage + firstPage - 1
Exemple #6
0
def processAllClouds(filename):
    """Create a word cloud for every publication with data in ``filename``.

    Args:
        filename: path to a UTF-8 JSON file holding a dict keyed by citation
            key; entries with an empty value are skipped.
    """
    # the with-block closes the file handle after loading
    with open(filename, "r", encoding="utf8") as f:
        docData = json.load(f)

    for k, v in docData.items():
        # target: <publication folder>/<citation key>, built with the
        # platform's own path separator
        savePath = os.path.join(functions.generatePublPath(memexPath, k), k)
        if v:
            # NOTE(review): the key, not the value, is passed on here -
            # confirm against the signature of createwordCloud.
            createwordCloud(savePath, k)
def formatPublList(pathToMemex):
    """Return the publications table as HTML.

    One row is produced for every publication that has both OCR results and a
    ``.bib`` file; the sorted rows replace ``@TABLECONTENTS@`` in
    ``publicationsTemplate``.

    Args:
        pathToMemex: root folder of the memex, searched for the OCR result
            files and the ``.bib`` files.

    Returns:
        The filled-in template string.

    Raises:
        KeyError: if a record has no ``title`` field.
    """
    # NOTE(review): dicOfRelevantFiles presumably maps citation keys to file
    # paths - confirm in the functions module.
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []

    for key in ocrFiles:
        if key not in bibFiles:
            continue

        # load the bibliographical data for this item
        bibRecord = functions.loadBib(bibFiles[key])[key]

        # path to the publication relative to the memex root
        relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")

        # author takes precedence over editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        # a record without a year gets a placeholder instead of raising KeyError
        date = bibRecord["year"][:4] if "year" in bibRecord else "nodate"
        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
        # ASCII-folded copy for the hidden search text, decoded back to str so
        # that the bytes repr (b'...') does not end up in the page
        search = unicodedata.normalize('NFKD', publication).encode(
            'ascii', 'ignore').decode('ascii')
        publication += " <div class='hidden'>%s</div>" % search
        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath

        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (
            link, citeKey, publication)
        # strip the curly brackets coming from the BibTeX values
        contentsList.append(singleItemTemplate.replace("{", "").replace("}", ""))

    contents = "\n".join(sorted(contentsList))
    return publicationsTemplate.replace("@TABLECONTENTS@", contents)
Exemple #8
0
def ocrPublication(pathToMemex, citationKey, language):
    """Run OCR on a publication's PDF, once per publication.

    The recognized text of each page goes into ``<citationKey>.json`` under
    the key ``"NNNN"``; a binarized copy of each page is stored as
    ``pages/NNNN.png``. An existing JSON file means the publication was
    handled before, so it is skipped.

    Args:
        pathToMemex: root folder of the memex.
        citationKey: citation key, also the base name of the PDF/JSON files.
        language: language code for ``pytesseract.image_to_string``.
    """
    folder = functions.generatePublPath(pathToMemex, citationKey)
    sourcePdf = os.path.join(folder, citationKey + ".pdf")
    resultsJson = os.path.join(folder, citationKey + ".json")
    pagesDir = os.path.join(folder, "pages")

    # already processed: report and leave
    if os.path.isfile(resultsJson):
        print("\t>>> %s has already been OCR-ed..." % citationKey)
        return

    if not os.path.exists(pagesDir):
        os.makedirs(pagesDir)

    print("\t>>> OCR-ing: %s" % citationKey)

    pageImages = pdf2image.convert_from_path(sourcePdf)
    total = len(pageImages)
    ocrText = {}
    for number, pageImage in enumerate(pageImages, start=1):
        # the text is read from the untouched page image
        ocrText["%04d" % number] = pytesseract.image_to_string(
            pageImage, lang=language)

        # the stored copy is binarized to keep the file small
        binarized = pageImage.convert('1')
        target = os.path.join(pagesDir, "%04d.png" % number)
        binarized.save(target, optimize=True, quality=10)

        print("\t\t%04d/%04d pages" % (number, total))

    with open(resultsJson, 'w', encoding='utf8') as out:
        json.dump(ocrText, out, sort_keys=True, indent=4, ensure_ascii=False)
Exemple #9
0
def processAllclouds(filename):
    """Create a word cloud for every publication with data in ``filename``.

    Args:
        filename: path to a UTF-8 JSON file holding a dict keyed by citation
            key; each non-empty value is passed to ``createWordCloud``.
    """
    # the with-block closes the file handle after loading
    with open(filename, "r", encoding="utf8") as f:
        docData = json.load(f)

    for k, v in docData.items():
        # target: <publication folder>/<citation key>, built with the
        # platform's own path separator
        savePath = os.path.join(functions.generatePublPath(memexPath, k), k)
        if v:  # entries with an empty value are skipped
            createWordCloud(savePath, v)
Exemple #10
0
def ocrPublication(citationKey, language, pageLimit):
    """OCR a publication's PDF unless it is longer than ``pageLimit`` pages.

    The text of each page is stored under the key ``"NNNN"`` in
    ``<citationKey>.json`` and a binarized copy of each page is saved as
    ``pages/NNNN.png``. An existing JSON file means the publication was
    processed before. Publications exceeding the limit are reported and
    skipped without writing anything.

    Args:
        citationKey: citation key, also the base name of the PDF/JSON files.
        language: language code for ``pytesseract.image_to_string``.
        pageLimit: maximum number of pages to process; an int or a numeric
            string.

    Raises:
        ValueError: if ``pageLimit`` cannot be converted to an int.
    """
    # generate and create necessary paths
    publPath = functions.generatePublPath(settings["path_to_memex"],
                                          citationKey)
    pdfFile = os.path.join(publPath, citationKey + ".pdf")
    jsonFile = os.path.join(publPath, citationKey +
                            ".json")  # OCR results will be saved here
    saveToPath = os.path.join(publPath,
                              "pages")  # we will save processed images here

    # first we need to check whether this publication has been already processed
    if os.path.isfile(jsonFile):
        print("\t>>> %s has already been OCR-ed..." % citationKey)
        return

    # convert once: the limit is used both for the comparison and in the
    # %d-formatted message, which fails on a string value
    pageLimit = int(pageLimit)

    # let's make sure that saveToPath also exists
    if not os.path.exists(saveToPath):
        os.makedirs(saveToPath)

    # start process images and extract text
    print("\t>>> OCR-ing: %s" % citationKey)

    images = pdf2image.convert_from_path(pdfFile)
    pageTotal = len(images)
    if pageTotal > pageLimit:
        print(
            "\t%d: the length of the publication exceeds current limit (%d)"
            % (pageTotal, pageLimit))
        print(
            "\tIncrease `page_limit` in settings to process this publication."
        )
        return

    textResults = {}
    for pageCount, image in enumerate(images, start=1):
        text = pytesseract.image_to_string(image, lang=language)
        textResults["%04d" % pageCount] = text

        image = image.convert('1')  # binarizes image, reducing its size
        finalPath = os.path.join(saveToPath, "%04d.png" % pageCount)
        image.save(finalPath, optimize=True, quality=10)

        print("\t\t%04d/%04d pages" % (pageCount, pageTotal))

    with open(jsonFile, 'w', encoding='utf8') as f9:
        json.dump(textResults,
                  f9,
                  sort_keys=True,
                  indent=4,
                  ensure_ascii=False)
Exemple #11
0
def generateReferenceSimple(bibTexCode):
    """Return a short HTML reference ``Author (YYYY). <i>Title</i>``.

    The record is read from ``<bibTexCode>.bib`` in the publication's folder.
    The author is used when present, otherwise the editor, otherwise
    ``"N.d."``. Curly brackets are removed from the result.

    Args:
        bibTexCode: citation key of the publication.

    Raises:
        KeyError: if the record lacks a ``date`` or ``title`` field.
    """
    publFolder = functions.generatePublPath(settings["path_to_memex"], bibTexCode)
    bibFile = os.path.join(publFolder, "%s.bib" % bibTexCode)
    record = functions.loadBib(bibFile)[bibTexCode]

    # author wins over editor; "N.d." when neither is given
    if "author" in record:
        creator = record["author"]
    elif "editor" in record:
        creator = record["editor"]
    else:
        creator = "N.d."

    year = record["date"][:4]
    title = record["title"]
    text = "%s (%s). <i>%s</i>" % (creator, year, title)
    for bracket in ("{", "}"):
        text = text.replace(bracket, "")
    return text
Exemple #12
0
def genConnectedTexts(citeKey):
    """Return HTML table rows for the publications similar to ``citeKey``.

    The similarities are read from ``cosineTableDic_filtered.txt`` in the
    current working directory, a JSON dict of the form
    ``{citeKey: {otherKey: similarity}}``.

    Args:
        citeKey: citation key whose connected texts are listed.

    Returns:
        The concatenated ``<tr>...</tr>`` rows. An empty string is returned
        when the table is empty or has no entry for ``citeKey``.
    """
    # the with-block closes the file handle after loading
    with open("cosineTableDic_filtered.txt", "r", encoding="utf8") as f:
        similarities = json.load(f)
    contentTemp = "<tr><td><i><a href='@link@'>read</a></i></td><td>@Sim@</td><td>@Publication@</td></tr>"

    rows = []
    # .get() keeps a publication without connected texts from raising
    for k, v in similarities.get(citeKey, {}).items():
        # NOTE(review): the link is built with Windows-style backslashes -
        # confirm this is what the generated pages expect.
        link = "..\\..\\..\\..\\." + functions.generatePublPath(
            memexPath, k) + "\\pages\\DETAILS.html"
        row = contentTemp.replace("@Publication@", k)
        row = row.replace("@Sim@", str(v))
        row = row.replace("@link@", link)
        rows.append(row)
    return "".join(rows)
Exemple #13
0
def formatPublList(pathToMemex):
    """Return the publications table as HTML.

    One row is produced for every publication that has both OCR results and a
    ``.bib`` file; the sorted rows replace ``@TABLECONTENTS@`` in
    ``publicationsTemplate``.

    Args:
        pathToMemex: root folder of the memex, searched for the OCR result
            files and the ``.bib`` files.

    Returns:
        The filled-in template string.

    Raises:
        KeyError: if a record has no ``title`` field.
    """
    # NOTE(review): dicOfRelevantFiles presumably maps citation keys to file
    # paths - confirm in the functions module.
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex,
                                            settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []

    for key in ocrFiles:
        if key not in bibFiles:
            continue

        bibRecord = functions.loadBib(bibFiles[key])[key]

        # path to the publication relative to the memex root
        relativePath = functions.generatePublPath(pathToMemex,
                                                  key).replace(pathToMemex, "")

        # author takes precedence over editor
        authorOrEditor = "[No data]"
        if "editor" in bibRecord:
            authorOrEditor = bibRecord["editor"]
        if "author" in bibRecord:
            authorOrEditor = bibRecord["author"]

        date = bibRecord["year"] if "year" in bibRecord else "nodate"

        title = bibRecord["title"]

        # formatting template
        citeKey = '<div class="ID">[%s]</div>' % key
        publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
        # ASCII-folded copy for the hidden search text, decoded back to str so
        # that the bytes repr (b'...') does not end up in the page
        search = unicodedata.normalize('NFKD', publication).encode(
            'ascii', 'ignore').decode('ascii')
        publication += " <div class='hidden'>%s</div>" % search
        link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath

        singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (
            link, citeKey, publication)
        # strip the curly brackets coming from the BibTeX values
        contentsList.append(
            singleItemTemplate.replace("{", "").replace("}", ""))

    contents = "\n".join(sorted(contentsList))
    return publicationsTemplate.replace("@TABLECONTENTS@", contents)
Exemple #14
0
def generateDoclLink(bibTexCode, pageVal, distance):
    """Build one HTML table row linking to a publication or one of its pages.

    Args:
        bibTexCode: citation key; used to locate ``<key>.bib``, as link label
            and as part of the ``data-order`` sort key.
        pageVal: ``0`` to link to the DETAILS page of the publication;
            otherwise a PDF page number, in which case the link targets the
            page five pages earlier (but never before page 1).
        distance: numeric value rendered in fixed-point notation in the
            second cell.

    Returns:
        The ``<tr>...</tr>`` string for this item.

    Raises:
        KeyError: if the record lacks a ``date`` or ``title`` field.
    """
    pathToPubl = functions.generatePublPath(memexPath, bibTexCode)
    bib = functions.loadBib(os.path.join(pathToPubl, "%s.bib" % bibTexCode))
    bib = bib[bibTexCode]

    # author takes precedence over editor
    author = "N.d."
    if "editor" in bib:
        author = bib["editor"]
    if "author" in bib:
        author = bib["author"]

    reference = "%s (%s). <i>%s</i>" % (author, bib["date"][:4], bib["title"])
    # ASCII-folded copy of the reference for the hidden search text. It is
    # decoded back to str: formatting the bytes object directly would embed
    # its repr (b'...') in the page.
    search = unicodedata.normalize('NFKD', reference).encode(
        'ascii', 'ignore').decode('ascii')
    search = " <div class='hidden'>%s</div>" % search

    relativeRoot = pathToPubl.replace(memexPath, "../../../../")
    if pageVal == 0:  # link to the start of the publication
        htmlLink = os.path.join(relativeRoot, "pages", "DETAILS.html")
        htmlLink = "<a href='{0}'>[{1}]</a>".format(htmlLink, bibTexCode)
        page = ""
        startPage = 0
    else:
        # clamp to the first page; pageVal below 5 would otherwise produce a
        # zero or negative page number
        startPage = max(1, pageVal - 5)
        endPage = pageVal

        # page numbers as given in the bib record, shown to the reader; the
        # link itself still targets the PDF page index
        realStartPage = checkPageNumbers(bib, bibTexCode, startPage)
        realEndPage = checkPageNumbers(bib, bibTexCode, endPage)

        htmlLink = os.path.join(relativeRoot, "pages",
                                "%04d.html" % startPage)
        htmlLink = "<a href='{0}'>[{1},{2}]</a>".format(
            htmlLink, bibTexCode, realStartPage)
        # NOTE(review): the trailing </i></a> has no matching opening tags in
        # this string; kept as in the original output.
        page = ", pp. {0}-{1}</i></a>".format(realStartPage, realEndPage)

    publicationInfo = reference + page + search
    # strip the curly brackets coming from the BibTeX values
    publicationInfo = publicationInfo.replace("{", "").replace("}", "")
    singleItemTemplate = '<tr><td data-order="{1}{2:05d}"><div class="ID">{3}</div> {4}</td><td>{0:f}</td></tr>'.format(
        distance, bibTexCode, startPage, htmlLink, publicationInfo)

    return singleItemTemplate