def formatPublList(pathToMemex):  #define a function that formats the list of publications
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])  #take the files with the OCRed pages
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")  #take the bib files

    contentsList = []  #create an empty list

    for key, value in ocrFiles.items():  #loop through the OCRed publications
        if key in bibFiles:  #check whether the key also has a bib file
            bibRecord = functions.loadBib(bibFiles[key])  #load the bibliographical data for this item
            bibRecord = bibRecord[key]  #select the record for this citation key

            relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")  #take the relative path to the publication

            authorOrEditor = "[No data]"  #default value when there is no author or editor information
            if "editor" in bibRecord:  #check whether there is information about the editor
                authorOrEditor = bibRecord["editor"]  #insert it
            if "author" in bibRecord:  #check whether there is information about the author
                authorOrEditor = bibRecord["author"]  #insert it

            date = bibRecord["year"][:4]  #insert the year of publication
            title = bibRecord["title"]  #insert the title

            # formatting template
            citeKey = '<div class="ID">[%s]</div>' % key  #format the citation key
            publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)  #format the information about the publication
            search = unicodedata.normalize('NFKD', publication).encode('ascii', 'ignore')  #replace diacritical characters with their ASCII equivalents
            publication += " <div class='hidden'>%s</div>" % search  #repeat the information in a hidden element (used for searching)
            link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath  #add the link to the details page of the publication

            singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (link, citeKey, publication)  #collect the information in a single table row
            recordToAdd = singleItemTemplate.replace("{", "").replace("}", "")  #remove curly brackets

            contentsList.append(recordToAdd)  #add the single record to the contents list

    contents = "\n".join(sorted(contentsList))  #join the sorted contents list
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)  #replace the wildcard in the template with the actual content
    return final  #return the formatted page
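The function above returns a filled-in HTML template rather than writing a file itself. A possible way to call it and save the result is sketched below (the output file name "publications.html" is an assumption, not part of the example):

# hypothetical usage of formatPublList; the output file name is an assumption
pathToMemex = settings["path_to_memex"]
with open(os.path.join(pathToMemex, "publications.html"), "w", encoding="utf8") as f:
    f.write(formatPublList(pathToMemex))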
Example #2
def generateContentsList():
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    contentsList = []

    for k, v in relDic.items():
        k = k[:-1]
        bibDic = functions.loadBib(v)

        authorOrEditor = "[No data]"
        if "editor" in bibDic[k]:
            authorOrEditor = bibDic[k]["editor"]
        if "author" in bibDic[k]:
            authorOrEditor = bibDic[k]["author"]

        publication = "{0} ({1}) <i>{2}</i>".format(authorOrEditor,
                                                    bibDic[k]["date"],
                                                    bibDic[k]["title"])
        search = unicodedata.normalize('NFKD',
                                       publication).encode('ascii', 'ignore')
        publication += " <div class=\"hidden\">{0}</div>".format(search)
        contentsList.append(
            "<tr><td><div class=\"ID\"><a href=\"{0}/pages/DETAILS.html\">[{1}]</a></div> {2}</td></tr>"
            .format(os.path.join(k[0], k[:2], k), k, publication))

    contentsListSorted = sorted(contentsList)
    contentsList = "".join(contentsListSorted)
    mainElement = publicationsTemplate.replace("@TABLECONTENTS@", contentsList)
    return (mainElement)
Example #3
def processAll(path_to_memex):
    pathData = functions.dicOfRelevantFiles(path_to_memex, ".bib")
    print(pathData)
    #bibData = functions.loadBib(settings["bib_all"])
   
    for k, v in pathData.items():
        generatePublicationInterface(k, v)
Example #4
def search():

    ## load OCR results
    ocrFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    citeKeys = list(ocrFiles.keys())

    #word = input("Please enter a word: ")

    dicOfMatches = {}       # dictionary with citeKeys as keys and matches as values

    ## loop through OCR results

    for citeKey in citeKeys:
        val = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        #print(val)  # val is a dictionary with page numbers as keys and page content as values

        for pageNumber, pageContent in val.items():
            if "christianity" in pageContent:
                print("yes")
            else:
                print("not in the page")

    print(dicOfMatches)
def searchOCRresults(pathToMemex, searchString):
    print("SEARCHING FOR: `%s`" % searchString) # to keep track of what we are doing
    files = functions.dicOfRelevantFiles(pathToMemex, ".json") # takes every file with OCR results
    results = {}                      # create dictionary
    for citationKey, pathToJSON in files.items(): # loop through the dictionary, i.e. all the files
        data = json.load(open(pathToJSON)) # load results
        #print(citationKey)
        count = 0
        for pageNumber, pageText in data.items(): # loop through the pages of the current file
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE): # search; the flag tells re to ignore case
                if citationKey not in results:
                    results[citationKey] = {}
                # relative path
                a = citationKey.lower()
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber) # path to the HTML page; clicking the link leads to that specific page
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE)) # count how many matches are on the page; ignore case for better results
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE) # take the page and wrap each match into HTML with a class defined in the CSS file
                results[citationKey][pageNumber] = {} # create an empty dictionary for each page
                results[citationKey][pageNumber]["pathToPage"] = relPath # add the path to the page
                results[citationKey][pageNumber]["matches"] = countM # number of matches
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>") # and the formatted page as the result
                count += 1 # count the number of pages with matches
        if count > 0: # reformat the results - that is why we want the count; not strictly necessary, but it helps with organizing the search results
            print("\t", citationKey, " : ", count) # keep track of what is going on
            newKey = "%09d::::%s" % (count, citationKey) # create a new key for each publication that combines the frequency and the citation key
            results[newKey] = results.pop(citationKey) # pop removes the old item from the dictionary and creates the new one at the same time
            # add time stamp; requires the datetime library
            currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S') # format the current time as a string
            results["timestamp"] = currentTime # add the timestamp as an extra item to the dictionary
            # add search string (as submitted)
            results["searchString"] = searchString # add the search string to the dictionary
    saveWith = re.sub(r"\W+", "", searchString) # save the results: take the search string and remove all non-word characters
    saveTo = os.path.join(pathToMemex, "searches", "%s.searchResults" % saveWith) # create the save path, putting search results in a subfolder with a unique extension
    with open(saveTo, 'w', encoding='utf8') as f9c: # save results
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False) # json dump because it is a dictionary
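One caveat with the example above: searchString is interpolated directly into the regular expression, so a query containing regex metacharacters (for instance "C++") would raise an error. A small defensive sketch, not part of the original code, escapes the query first:

import re

def containsWord(pageText, searchString):
    # escape the query so regex metacharacters are matched literally
    pattern = r"\b%s\b" % re.escape(searchString)
    return re.search(pattern, pageText, flags=re.IGNORECASE) is not None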
Example #6
def processAllRecords():
    # call the function dicOfRelevantFiles with memexPath as input value and save the return value to relDic
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    # loop through all items of the dictionary relDic
    for k, v in relDic.items():
        # call the function generatePublicationInterface with k (removing the last character) and v as input values
        generatePublicationInterface(k[:-1], v)
Example #7
def search(searchArgument):
    targetFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    citeKeys = list(targetFiles.keys())

    #searchArgument = input("What are you looking for: ")

    results = {}
    for citeKey in citeKeys:  #loop through all the keys
        docData = json.load(open(targetFiles[citeKey], "r", encoding="utf8"))  #load the respective json file with the OCR results
        for k, v in docData.items():  #keys = page numbers, values = text
            if searchArgument in v:  #if the search argument is in the page
                matchCounter = len(re.findall(searchArgument, v))  #count how often
                if citeKey not in results:  #create an empty dict only if there is not already one
                    results[citeKey] = {}
                results[citeKey][k] = {}  #create a sub-dict with the page number as key
                results[citeKey][k]["matches"] = matchCounter  #store the number of matches under the key "matches"
                pagePath = os.path.join(functions.generatePublPath(memexPath, citeKey), "pages", k + ".html")  #create the path to the html file for the page
                results[citeKey][k]["pathToPage"] = pagePath
                results[citeKey][k]["result"] = v  #add the OCRed text to the dict
    with open("search.txt", 'w',
              encoding='utf8') as f9:  #saves it into a file too
        json.dump(results, f9, sort_keys=True, indent=4, ensure_ascii=False)

    return (results)
Example #8
def formatPublList(pathToMemex):
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex,
                                            settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")

    contentsList = []

    for key, value in ocrFiles.items():
        if key in bibFiles:
            bibRecord = functions.loadBib(bibFiles[key])
            bibRecord = bibRecord[key]

            relativePath = functions.generatePublPath(pathToMemex,
                                                      key).replace(
                                                          pathToMemex, "")

            authorOrEditor = "[No data]"
            if "editor" in bibRecord:
                authorOrEditor = bibRecord["editor"]
            if "author" in bibRecord:
                authorOrEditor = bibRecord["author"]

            date = "nodate"
            if "year" in bibRecord:
                date = bibRecord["year"]

            title = bibRecord["title"]

            # formatting template
            citeKey = '<div class="ID">[%s]</div>' % key
            publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
            search = unicodedata.normalize('NFKD', publication).encode(
                'ascii', 'ignore')
            publication += " <div class='hidden'>%s</div>" % search
            link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath

            singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (
                link, citeKey, publication)
            recordToAdd = singleItemTemplate.replace("{", "").replace("}", "")

            contentsList.append(recordToAdd)

    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)

    return (final)
def processAllRecords(pathToMemex):  #define the function that processes all records
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")  #take the bib files
    for citeKey, pathToBibFile in files.items():  #loop through them
        if os.path.exists(pathToBibFile.replace(".bib", ".json")):  #process only items that have both a .bib and a .json file
            generatePublicationInterface(citeKey, pathToBibFile)  #start the function
Example #10
def formatSearches(pathToMemex):
    with open(settings["template_search"], "r", encoding="utf8") as f1:
        indexTmpl = f1.read()
    dof = functions.dicOfRelevantFiles(
        pathToMemex, ".searchResults"
    )  #returns a dict of links with all files with the .searchResults ending
    # format individual search pages
    toc = []
    for file, pathToFile in dof.items():
        searchResults = []
        data = json.load(open((pathToFile), "r", encoding="utf8"))
        # collect toc
        template = "<tr> <td>%s</td> <td>%s</td> <td>%s</td> <td>%s</td></tr>"

        # variables
        linkToSearch = os.path.join("searches", file + ".html")
        pathToPage = '<a href="%s"><i>read</i></a>' % linkToSearch
        searchString = '<div class="searchString">%s</div>' % data.pop(
            "searchString")
        timeStamp = data.pop("timestamp")
        tocItem = template % (pathToPage, searchString, len(data), timeStamp)
        toc.append(tocItem)

        # generate the results page
        keys = sorted(data.keys(), reverse=True)
        for k in keys:
            searchResSingle = []
            results = data[k]
            temp = k.split("::::")
            header = "%s (pages with results: %d)" % (temp[1], int(temp[0]))
            #print(header)
            for page, excerpt in results.items():
                #print(excerpt["result"])
                pdfPage = int(page)
                linkToPage = '<a href="../%s"><i>go to the original page...</i></a>' % excerpt[
                    "pathToPage"]
                searchResSingle.append(
                    "<li><b><hr>(pdfPage: %d)</b><hr> %s <hr> %s </li>" %
                    (pdfPage, excerpt["result"], linkToPage))
            searchResSingle = "<ul>\n%s\n</ul>" % "\n".join(searchResSingle)
            searchResSingle = generalTemplate.replace("@ELEMENTHEADER@",
                                                      header).replace(
                                                          "@ELEMENTCONTENT@",
                                                          searchResSingle)
            searchResults.append(searchResSingle)

        searchResults = "<h2>SEARCH RESULTS FOR: <i>%s</i></h2>\n\n" % searchString + "\n\n".join(
            searchResults)
        with open(pathToFile.replace(".searchResults", ".html"),
                  "w",
                  encoding="utf8") as f9:
            f9.write(indexTmpl.replace("@MAINCONTENT@", searchResults))
        #os.remove(pathToFile)

    #input("\n".join(toc))
    toc = searchesTemplate.replace("@TABLECONTENTS@", "\n".join(toc))
    return (toc)
Example #11
def searchOCRresults(pathToMemex, searchString):
    print("SEARCHING FOR: `%s`" % searchString)
    # run the function (from previous step) that creates a dictionary "files" of the paths to the json files. Create an empty dictionary.
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")
    results = {}
    # run a loop for each item of the dictionary "files", so that a json file is created from each path
    for citationKey, pathToJSON in files.items():
        data = json.load(open(pathToJSON))
        #print(citationKey)
        count = 0
        # for each page, i.e. key "page number" in the json file, search for matches for the search string and save in a dictionary by the citation key and page number
        for pageNumber, pageText in data.items():
            if re.search(r"\b%s\b" % searchString,
                         pageText,
                         flags=re.IGNORECASE):
                if citationKey not in results:
                    results[citationKey] = {}

                # relative path
                a = citationKey.lower()
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages",
                                       "%s.html" % pageNumber)
                countM = len(
                    re.findall(r"\b%s\b" % searchString,
                               pageText,
                               flags=re.IGNORECASE))
                pageWithHighlights = re.sub(
                    r"\b(%s)\b" % searchString,
                    r"<span class='searchResult'>\1</span>",
                    pageText,
                    flags=re.IGNORECASE)

                results[citationKey][pageNumber] = {}
                results[citationKey][pageNumber]["pathToPage"] = relPath
                results[citationKey][pageNumber]["matches"] = countM
                results[citationKey][pageNumber][
                    "result"] = pageWithHighlights.replace("\n", "<br>")

                count += 1

        if count > 0:
            print("\t", citationKey, " : ", count)
            newKey = "%09d::::%s" % (count, citationKey)
            results[newKey] = results.pop(citationKey)

            # add time stamp
            currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            results["timestamp"] = currentTime
            # add search string (as submitted)
            results["searchString"] = searchString
    # save the search results under a sanitized file name (non-word characters removed), then dump them as JSON
    saveWith = re.sub(r"\W+", "", searchString)
    saveTo = os.path.join(pathToMemex, "searches",
                          "%s.searchResults" % saveWith)
    with open(saveTo, 'w', encoding='utf8') as f9c:
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)
def searchOCRresults(pathToResults, searchString):
    print("SEARCHING FOR: `%s`" % searchString)
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")
    results = {}

    for citationKey, pathToJSON in files.items():
        data = json.load(open(pathToJSON, "r", encoding="utf8"))
        #docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        #print(citationKey)
        count = 0

        for pageNumber, pageText in data.items():
            if re.search(r"\b%s\b" % searchString,
                         pageText,
                         flags=re.IGNORECASE):
                if citationKey not in results:
                    results[citationKey] = {}

                # relative path
                a = citationKey.lower()
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages",
                                       "%s.html" % pageNumber)
                countM = len(
                    re.findall(r"\b%s\b" % searchString,
                               pageText,
                               flags=re.IGNORECASE))
                pageWithHighlights = re.sub(
                    r"\b(%s)\b" % searchString,
                    r"<span class='searchResult'>\1</span>",
                    pageText,
                    flags=re.IGNORECASE)

                results[citationKey][pageNumber] = {}
                results[citationKey][pageNumber]["pathToPage"] = relPath
                results[citationKey][pageNumber]["matches"] = countM
                results[citationKey][pageNumber][
                    "result"] = pageWithHighlights.replace("\n", "<br>")

                count += 1

        if count > 0:
            print("\t", citationKey, " : ", count)
            newKey = "%09d::::%s" % (count, citationKey)
            results[newKey] = results.pop(citationKey)

            # add time stamp
            currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            results["timestamp"] = currentTime
            # add search string (as submitted)
            results["searchString"] = searchString

    saveWith = re.sub("\W+", "", searchString)
    saveTo = os.path.join(pathToResults, "%s.searchResults" % saveWith)
    print(saveTo)
    with open(saveTo, 'w', encoding='utf8') as f9c:
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)
Example #13
def processAllRecords(pathToMemex):  # The last function processes all of the records; it takes pathToMemex as an argument
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")  # Gets all of the bib files as a dictionary
    for citeKey, pathToBibFile in files.items():  # The citation key and the pathToBibFile are given
        #print(citeKey)
        generatePublicationInterface(citeKey, pathToBibFile)
    generateMemexStartingPages(pathToMemex)  # Generates the starting pages.
Example #14
def search():
    ## load OCR results
    ocrFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    citeKeys = list(ocrFiles.keys())
    word = input("Please enter a word: ")
    dicOfMatches = {}       # dictionary with citeKeys as keys and matches as values
    print(ocrFiles)
    for citeKey in citeKeys:
        val = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))  # load each json file
        dicOfPages = {}
        pagenumbers = list(val.keys())
        pagetext = list(val.values())

        #dicOfPages[keys] = pagenumbers # didn't work
        #dicOfPages[val] = pagetext

        print(dicOfPages)
Example #15
def formatSearches(pathToMemex): #defines function to format the searches
    with open(settings["template_search"], "r", encoding="utf8") as f1: #opens the file given in "template_search"
        indexTmpl = f1.read() #opens the searchTemplate
    dof = functions.dicOfRelevantFiles(pathToMemex, ".searchResults") #chooses the files with the search results
    # format individual search pages
    toc = [] #creates an empty list
    for file, pathToFile in dof.items(): #loops through all the files with searches
        searchResults = [] #creates an empty list
        data = json.load(open(pathToFile, "r", encoding="utf8")) #loads the files with the search results
        # collect toc
        template = "<tr> <td>%s</td> <td>%s</td> <td>%s</td> <td>%s</td></tr>" #creates the format of the table

        # variables
        linkToSearch = os.path.join("searches", file+".html") #adds the link to searches with .html
        pathToPage = '<a href="%s"><i>read</i></a>' % linkToSearch #creates the link in the table to the html-file with our search results
        searchString = '<div class="searchString">%s</div>' % data.pop("searchString") #take the searchstring from the files with our search results 
        timeStamp = data.pop("timestamp") #takes the timestamp
        tocItem = template % (pathToPage, searchString, len(data), timeStamp) #adds the variables to the template
        toc.append(tocItem) #adds the template to the table of contents

        # generate the results page
        keys = sorted(data.keys(), reverse=True) #sorts the citation keys with the number of pages, results in reverse order
        for k in keys: #loops through citation keys
            searchResSingle = [] #create an empty list
            results = data[k] #takes the search results for this citation key
            temp = k.split("::::") #splits the citation keys and the number of pages with results
            header = "%s (pages with results: %d)" % (temp[1], int(temp[0])) #creates a header for each publication with citation key and the number of pages with results
            #print(header)
            for page, excerpt in results.items(): #loops through the results
                #print(excerpt["result"])
                pdfPage = int(page) #takes the number of the page with the search result
                linkToPage = '<a href="../%s"><i>go to the original page...</i></a>' % excerpt["pathToPage"] #adds a link to the original page with the search result
                searchResSingle.append("<li><b><hr>(pdfPage: %d)</b><hr> %s <hr> %s </li>" % (pdfPage, excerpt["result"], linkToPage)) #adds the text and the link to the list
            searchResSingle = "<ul>\n%s\n</ul>" % "\n".join(searchResSingle) #joins the single pages together
            searchResSingle = generalTemplate.replace("@ELEMENTHEADER@", header).replace("@ELEMENTCONTENT@", searchResSingle) #replaces the wildcards in the headers
            searchResults.append(searchResSingle) #appends the results of the search
        
        searchResults = "<h2>SEARCH RESULTS FOR: <i>%s</i></h2>\n\n" % searchString + "\n\n".join(searchResults) #creates a header for the html-page and join the search results
        with open(pathToFile.replace(".searchResults", ".html"), "w", encoding="utf8") as f9:
            f9.write(indexTmpl.replace("@MAINCONTENT@", searchResults)) #creates the html-page
        #os.remove(pathToFile)
        
    #input("\n".join(toc))
    toc = searchesTemplate.replace("@TABLECONTENTS@", "\n".join(toc)) #replaces the wildcard in the table of contents
    return(toc) #returns it
Example #16
def searchOCRresults(pathToMemex, searchString):
    print("SEARCHING FOR: `%s`" % searchString)
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")  #returns a dict with all the json files: citeKeys as keys and the paths as values
    results = {}

    for citationKey, pathToJSON in files.items():          #loop through all of them
        data = json.load(open(pathToJSON))                 #load the current json file -> OCRed text
        #print(citationKey)
        count = 0                                           #counter for pages with matches

        for pageNumber, pageText in data.items():           #page number as key, text as value
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE):  #search each page for the searchString
                if citationKey not in results:
                    results[citationKey] = {}               #if there is no entry in the results dict yet, create an empty sub-dict with the citeKey as key

                # relative path
                a = citationKey.lower()                     #lowercased citeKey
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber)  #create the path to the html page with a match
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE))   #count all matches on the page
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE) #highlight the searchString in the results

                results[citationKey][pageNumber] = {}   #create an empty dict with the page number as key - all other data will go into this
                results[citationKey][pageNumber]["pathToPage"] = relPath    #add the path
                results[citationKey][pageNumber]["matches"] = countM        #add the count
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>")   #add the text

                count += 1 #count up the pages with results

        if count > 0:   #if there are results
            print("\t", citationKey, " : ", count)  #print how many pages with results this publication has
            newKey = "%09d::::%s" % (count, citationKey)
            results[newKey] = results.pop(citationKey)  #prepend the number of matching pages to the citeKey

            # add time stamp
            currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  #current time
            results["timestamp"] = currentTime      #added as a timestamp
            # add search string (as submitted)
            results["searchString"] = searchString  #add the search string

    saveWith = re.sub(r"\W+", "", searchString)
    saveTo = os.path.join(pathToMemex, "searches", "%s.searchResults" % saveWith)
    with open(saveTo, 'w', encoding='utf8') as f9c: #save the search with a recognizable name
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)
Example #17
def processAllRecordsSTR(pathToMemex):
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    citeKeys = list(files.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        print(citeKey)
        bibData = functions.loadBib(files[citeKey])
        if "pagetotal" in bibData:
            pageTotal = int(bibData["pagetotal"])
            if pageTotal <= int(settings["page_limit"]):
                language = functions.identifyLanguage(bibData[citeKey], "eng")
                ocrPublication(citeKey, language, settings["page_limit"])
        else:
            language = functions.identifyLanguage(bibData[citeKey], "eng")
            ocrPublication(citeKey, language, settings["page_limit"])

    functions.memexStatusUpdates(settings["path_to_memex"], ".pdf")
    functions.memexStatusUpdates(settings["path_to_memex"], ".bib")
    functions.memexStatusUpdates(settings["path_to_memex"], ".png")
    functions.memexStatusUpdates(settings["path_to_memex"], ".json")
Example #18
def generateSearchList():
    searchFiles = functions.dicOfRelevantFiles(memexPath, "searchResults")
    queryKeys = sorted(list(searchFiles.keys()))
    searchList = []

    for queryKey in queryKeys:
        docData = json.load(open(searchFiles[queryKey]))
        searchList.append(
            "<tr><td><div class=\"searchString\"><a href=\"search/{0}.html\">{1}</a></div></td><td>{2}</td><td>{3}</td></tr>"
            .format(re.sub(r"\W+", "", docData["searchString"]),
                    docData["searchString"],
                    len(docData) - 2, docData["timestamp"]))

    searchListSorted = sorted(searchList)
    searchList = "".join(searchListSorted)

    mainElement = searchesTemplate.replace("@TABLECONTENTS@", searchList)

    createSearchResultPages()

    return (mainElement)
Example #19
def createSearchResultPages():
    with open(settings["template_search"], "r", encoding="utf8") as ft:
        template = ft.read()
    dof = functions.dicOfRelevantFiles(memexPath, ".searchResults")

    for file, pathToFile in dof.items():
        data = json.load(open(pathToFile))
        contentsList = []
        searchString = data["searchString"]
        data.pop("timestamp")
        data.pop("searchString")
        keys = sorted(data.keys(), reverse=True)

        for citekey in keys:
            recordToAdd = generalTemplate
            temp = citekey.split("::::")
            buttonHeader = '<b>{0}</b> (pages with results: {1})'.format(
                temp[1], int(temp[0]))
            recordToAdd = recordToAdd.replace("@ELEMENTHEADER@", buttonHeader)
            linkList = []
            pages = data[citekey]

            for page, results in pages.items():
                itemToAdd = '<li><hr><b>(pdfPage: {0})</b><hr>{1}<hr> <a href="../{2}"><i>go to the original page...</i></a></li>'.format(
                    page, results["result"], results["pathToPage"])
                linkList.append(itemToAdd)

            listContent = "\n<ul>\n%s\n</ul>\n" % "\n".join(linkList)
            recordToAdd = recordToAdd.replace("@ELEMENTCONTENT@", listContent)
            contentsList.append(recordToAdd)

            contents = "".join(contentsList)
            mainContent = "<h1>SEARCH RESULTS FOR: <i><div class='searchString'>" + searchString + "</div></i></h1>\n\n" + contents

            saveWith = re.sub(r"\W+", "", searchString)
            directory = os.path.join(memexPath, "search", saveWith + ".html")
            with open(directory, "w", encoding="utf8") as f9:
                f9.write(template.replace("@MAINCONTENT@", mainContent))
Example #20
def generateContentsPage():
    # load contents template
    with open(settings["template_contents"], "r", encoding="utf8") as ft:
        template = ft.read()

    # call the function dicOfRelevantFiles with memexPath as input value and save the return value to relDic
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    # create the list linkList
    linkList = []

    # loop through all items of the dictionary relDic
    for k, v in relDic.items():
        # removing the last character
        k = k[:-1]
        # call the function loadBib with v as input value and save the return value to bibDic
        bibDic = functions.loadBib(v)
        # append an item (link) to the list linkList
        linkList.append(
            "<a href=\"{0}/pages/DETAILS.html\">[{1}]</a> {2} ({3}) - <i>{4}</i>"
            .format(os.path.join(k[0], k[:2], k), k, bibDic[k]["author"],
                    bibDic[k]["date"], bibDic[k]["title"]))
    # sort the list linkList
    linkListSorted = sorted(linkList)
    # join items of linkListSorted by </li><li> and store the result in a string
    linkList = "</li><li>".join(linkListSorted)

    # save template to pageTemp
    pageTemp = template
    # replace @MAINCONTENT@ with linkList and save it to pageTemp
    pageTemp = pageTemp.replace("@MAINCONTENT@", linkList)

    # path to contents.html
    directory = os.path.join(memexPath, "contents.html")
    # create the file contents.html
    with open(directory, "w", encoding="utf8") as f2:
        f2.write(pageTemp)
def tfidfPublications(pathToMemex, PageOrPubl):
    print("\tProcessing: %s" % PageOrPubl)
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  #[:500]

    print("\taggregating texts into documents...")
    corpusDic = {}
    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey]))
        for page, text in docData.items():
            # text as a document
            if PageOrPubl == "publications":
                if citeKey not in corpusDic:
                    corpusDic[citeKey] = []
                corpusDic[citeKey].append(text)

            # page cluster as a document
            elif PageOrPubl == "pages":
                pageNum = int(page)
                citeKeyNew = "%s_%05d" % (citeKey, roundUp(
                    pageNum, clusterSize))
                if citeKeyNew not in corpusDic:
                    corpusDic[citeKeyNew] = []
                corpusDic[citeKeyNew].append(text)

                # add the last page of cluster N to cluster N+1
                if pageNum % clusterSize == 0:
                    citeKeyNew = "%s_%05d" % (
                        citeKey, roundUp(pageNum + 1, clusterSize))
                    if citeKeyNew not in corpusDic:
                        corpusDic[citeKeyNew] = []
                    corpusDic[citeKeyNew].append(text)
            else:
                sys.exit(
                    "`PageOrPubl` parameter must be `publications` or `pages`")

    print("\t%d documents (%s) generated..." % (len(corpusDic), PageOrPubl))
    print("\tpreprocessing the corpus...")

    docList = []
    docIdList = []

    for docId, docText in corpusDic.items():
        if len(docText) > 2:  # a cluster of only two pages would mean dropping the last page
            doc = " ".join(docText)
            # clean doc
            doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
            doc = re.sub('\W+', ' ', doc)
            doc = re.sub('_+', ' ', doc)
            doc = re.sub('\d+', ' ', doc)
            doc = re.sub(' +', ' ', doc)
            # we can also drop documents with a small number of words
            # (for example, when there are many illustrations)
            # let's drop clusters that have less than 1,000 words (average for 6 pages ±2500-3000 words)
            if len(doc.split(" ")) > 1000:
                # update lists
                docList.append(doc)
                docIdList.append(docId)

    # PART 3: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    stopWords = functions.loadMultiLingualStopWords(
        ["eng", "deu", "fre", "spa"])
    vectorizer = CountVectorizer(ngram_range=(1, 1),
                                 min_df=5,
                                 max_df=0.5,
                                 stop_words=stopWords)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(
        countVectorized)  # generates a sparse matrix
    cosineMatrix = cosine_similarity(vectorized)

    # PART 4: saving TFIDF --- only for publications!
    if PageOrPubl == "publications":
        print("\tsaving tfidf data...")
        tfidfTable = pd.DataFrame(vectorized.toarray(),
                                  index=docIdList,
                                  columns=vectorizer.get_feature_names())
        tfidfTable = tfidfTable.transpose()
        print("\ttfidfTable Shape: ", tfidfTable.shape)
        tfidfTableDic = tfidfTable.to_dict()

        tfidfTableDicFilt = filterTfidfDictionary(tfidfTableDic, 0.05, "more")
        pathToSave = os.path.join(pathToMemex,
                                  "results_tfidf_%s.dataJson" % PageOrPubl)
        with open(pathToSave, 'w', encoding='utf8') as f9:
            json.dump(tfidfTableDicFilt,
                      f9,
                      sort_keys=True,
                      indent=4,
                      ensure_ascii=False)

    # PART 4: saving cosine distances --- for both publications and page clusters
    print("\tsaving cosine distances data...")
    cosineTable = pd.DataFrame(cosineMatrix)
    print("\tcosineTable Shape: ", cosineTable.shape)
    cosineTable.columns = docIdList
    cosineTable.index = docIdList
    cosineTableDic = cosineTable.to_dict()

    tfidfTableDicFilt = filterTfidfDictionary(cosineTableDic, 0.25, "more")
    pathToSave = os.path.join(pathToMemex,
                              "results_cosineDist_%s.dataJson" % PageOrPubl)
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(tfidfTableDicFilt,
                  f9,
                  sort_keys=True,
                  indent=4,
                  ensure_ascii=False)
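The page-cluster branch above relies on roundUp and clusterSize, which are not shown in this snippet. clusterSize would be an integer defined elsewhere in the script; one plausible implementation of roundUp (an assumption, not the original helper) is:

import math

def roundUp(value, base):
    # smallest multiple of `base` that is greater than or equal to `value`,
    # e.g. roundUp(7, 5) == 10 and roundUp(5, 5) == 5
    return int(math.ceil(value / base)) * base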
def tfidfPublications(pathToMemex):
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())#[:500]

    print("\taggregating texts into documents...")
    docList   = []
    docIdList = []

    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey]))
        # IF YOU ARE ON WINDOWS, THE LINE SHOULD BE:
        # docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        
        docId = citeKey
        doc   = " ".join(docData.values())

        # clean doc
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
        doc = re.sub('\W+', ' ', doc)
        doc = re.sub('_+', ' ', doc)
        doc = re.sub('\d+', ' ', doc)
        doc = re.sub(' +', ' ', doc)

        # update lists
        docList.append(doc)
        docIdList.append(docId)

    print("\t%d documents generated..." % len(docList))

    # PART 2: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    vectorizer = CountVectorizer(ngram_range=(1,1), min_df=5, max_df=0.5)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(countVectorized) # generates a sparse matrix
    cosineMatrix = cosine_similarity(vectorized)

    # PART 3: saving TFIDF
    print("\tsaving tfidf data...")
    tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())
    tfidfTable = tfidfTable.transpose()
    print("\ttfidfTable Shape: ", tfidfTable.shape)
    tfidfTableDic = tfidfTable.to_dict()

    tfidfTableDicFilt = filterTfidfDictionary(tfidfTableDic, 0.05, "more")
    pathToSave = os.path.join(pathToMemex, "results_tfidf.dataJson")
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(tfidfTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)

    # PART 3: saving cosine distances
    print("\tsaving cosine distances data...")
    cosineTable = pd.DataFrame(cosineMatrix)
    print("\tcosineTable Shape: ", cosineTable.shape)
    cosineTable.columns = docIdList
    cosineTable.index = docIdList
    cosineTableDic = cosineTable.to_dict()

    tfidfTableDicFilt = filterTfidfDictionary(cosineTableDic, 0.25, "more")
    pathToSave = os.path.join(pathToMemex, "results_cosineDist.dataJson")
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(tfidfTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)
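filterTfidfDictionary is called in the tfidf examples but not defined in any of the snippets. Judging from the calls (a nested dictionary, a threshold, and "more"), a minimal sketch of such a filter could look like this (an assumption, not the original function):

def filterTfidfDictionary(dictionary, threshold, lessOrMore):
    # keep only inner values above (or below) the threshold; drop self-matches and empty entries
    dictionaryFilt = {}
    for item1, innerDic in dictionary.items():
        dictionaryFilt[item1] = {}
        for item2, value in innerDic.items():
            keep = value >= threshold if lessOrMore == "more" else value <= threshold
            if keep and item1 != item2:
                dictionaryFilt[item1][item2] = value
        if not dictionaryFilt[item1]:
            dictionaryFilt.pop(item1)
    return dictionaryFilt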
def processAllRecords(pathToMemex): 
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib") #take the bibFiles
    for citeKey, pathToBibFile in files.items(): #loop through them
        if os.path.exists(pathToBibFile.replace(".bib", ".json")): #search for files with json extension
            generatePublicationInterface(citeKey, pathToBibFile) #execute the previous function
def generateTfIdfWordClouds(pathToMemex):
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  #[:500]

    print("\taggregating texts into documents...")
    docList = []
    docIdList = []

    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="UTF8"))

        docId = citeKey
        doc = " ".join(docData.values())

        # clean doc
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
        doc = re.sub('\W+', ' ', doc)
        doc = re.sub('_+', ' ', doc)
        doc = re.sub('\d+', ' ', doc)
        doc = re.sub(' +', ' ', doc)

        # update lists
        docList.append(doc)
        docIdList.append(docId)

    print("\t%d documents generated..." % len(docList))

    # PART 2: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")

    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=2, max_df=0.5)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(
        countVectorized)  # generates a sparse matrix

    print("\tconverting and filtering tfidf data...")
    tfidfTable = pd.DataFrame(vectorized.toarray(),
                              index=docIdList,
                              columns=vectorizer.get_feature_names())
    tfidfTable = tfidfTable.transpose()
    tfidfTableDic = tfidfTable.to_dict()
    tfidfTableDic = filterTfidfDictionary(tfidfTableDic, 0.02, "more")

    #tfidfTableDic = json.load(open("/Users/romanovienna/Dropbox/6.Teaching_New/BUILDING_MEMEX_COURSE/_memex_sandbox/_data/results_tfidf_publications.dataJson"))

    # PART 4: generating wordclouds
    print("\tgenerating wordclouds...")
    wc = WordCloud(
        width=1000,
        height=600,
        background_color="white",
        random_state=2,
        relative_scaling=0.5,
        #color_func=lambda *args, **kwargs: (179, 0, 0),  # single color
        #colormap="copper",  # Oranges, Reds, YlOrBr, YlOrRd, OrRd; copper
        colormap="gray")  # binary, gray
    # https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html

    counter = len(tfidfTableDic)
    citeKeys = list(tfidfTableDic.keys())
    random.shuffle(citeKeys)

    for citeKey in citeKeys:
        savePath = functions.generatePublPath(pathToMemex, citeKey)
        savePath = os.path.join(savePath, "%s_wCloud.jpg" % citeKey)

        if not os.path.isfile(savePath):
            wc.generate_from_frequencies(tfidfTableDic[citeKey])
            # plotting
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            #plt.show() # this line shows the plot
            plt.savefig(savePath, dpi=200, bbox_inches='tight')

            print("\t%s (%d left...)" % (citeKey, counter))
            counter -= 1

        else:
            print("\t%s --- already done" % (citeKey))
            counter -= 1
def searchOCRresults(pathToMemex, searchString):  #function to search Memex for specific keyword(s)
    print("SEARCHING FOR: `%s`" % searchString)  #print statement for convenience
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")  #use the pre-defined function to build a dictionary of all the OCRed files
    results = {}  #empty dict

    for citationKey, pathToJSON in files.items():  #loop through the OCRed files individually by citeKey
        data = json.load(open(pathToJSON))  #load the OCRed file
        #print(citationKey)
        count = 0  #count variable

        for pageNumber, pageText in data.items():  #loop through the loaded file by page and text
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE):  #search for the searchString in the text of the page; \b matches the empty string at the beginning or end of a word
                if citationKey not in results:  #create a new entry in the results dict if not already present
                    results[citationKey] = {}

                # relative path
                a = citationKey.lower()  #make the citeKey lowercase
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber)  #create the path to the publication's html page
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE))  #count the findings on the page
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE)  #change the html to highlight the searchword(s)

                results[citationKey][pageNumber] = {}
                results[citationKey][pageNumber]["pathToPage"] = relPath  #add the path to the html page with the match to the results dict
                results[citationKey][pageNumber]["matches"] = countM  #save the count of findings on the page
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>")  #save the highlighted html

                count += 1  #add 1 to the first count variable

        if count > 0:  #if at least 1 occurrence of the searchword(s) is found
            print("\t", citationKey, " : ", count)  #print the citeKey and the number of pages with findings in this publication
            newKey = "%09d::::%s" % (count, citationKey)  #new key composed of count and citeKey
            results[newKey] = results.pop(citationKey)  #replace the citeKey with the newKey variable

            # add time stamp
            currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  #get the current timestamp
            results["timestamp"] = currentTime  #add the time to the results dict
            results["searchString"] = searchString  # add the search string (as submitted)

    saveWith = re.sub(r"\W+", "", searchString)  #remove non-word characters (1+) from the searchword
    saveTo = os.path.join(pathToMemex, "searches", "%s.searchResults" % saveWith)  #create the path to the folder in which the searches are saved
    with open(saveTo, 'w', encoding='utf8') as f9c:
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)  #save the sorted search results in a new file
Example #26
def generatetfidfValues():

    #dictionary with keys
    ocrFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    #list with citekeys (in fixed order)
    citeKeys = list(ocrFiles.keys())

    docList   = []
    docIdList = []

    #print(ocrFiles)
    #print(citeKeys)
    #loop through list not dictionary to have sorted lists (for the corpusDic)
    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        #print(docData)
    
        docId = citeKey
        doc   = " ".join(docData.values())

        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
        doc = re.sub('\W+', ' ', doc)
        doc = re.sub('\d+', ' ', doc)
        doc = re.sub(' +', ' ', doc)

        docList.append(doc)
        docIdList.append(docId)
    
    #print(docList)
    #print(docIdList)
    vectorizer = CountVectorizer(ngram_range=(1,1), min_df=5, max_df=0.5, stop_words= stopwordsList)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(countVectorized) # https://en.wikipedia.org/wiki/Sparse_matrix
    cosineMatrix = cosine_similarity(vectorized)

    tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())
    print("tfidfTable Shape: ", tfidfTable.shape) # optional
    tfidfTable = tfidfTable.transpose()
    tfidfTableDic = tfidfTable.to_dict()
    

    cosineTable = pd.DataFrame(cosineMatrix)
    print("cosineTable Shape: ", cosineTable.shape) # optional
    cosineTable.columns = docIdList
    cosineTable.index = docIdList
    cosineTableDic = cosineTable.to_dict()
    
    #create empty dictionary
    #keywordsDic = {}
    #loop through dictionary
    #for docId in tfidfTableDic:
    #    for tfIdf in value:
            #check if tfidf value is above threshold
    #        if tfIdf >= 0.05:
    #            keywordsDic= keywordsDic.keys(docId)

    filteredDic = {}
    filteredDic = functions.filterDic(tfidfTableDic, 0.05)
    with open("tfidfTableDic_filtered.txt", 'w', encoding='utf8') as f9:
        json.dump(filteredDic, f9, sort_keys=True, indent=4, ensure_ascii=False)
    
    filteredDic = {}
    filteredDic = functions.filterDic(cosineTableDic, 0.25)
    with open("cosineTableDic_filtered.txt", 'w', encoding='utf8') as f9:
        json.dump(filteredDic, f9, sort_keys=True, indent=4, ensure_ascii=False)
Example #27
def processAll(pathToMemex):
    pathData = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    print(pathData)

    for k, v in pathData.items():
        generatePublicationInterface(k, v)
Example #28
def processAllRecords(pathToMemex):
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    for citeKey, pathToBibFile in files.items():
        if os.path.exists(pathToBibFile.replace(".bib", ".json")):
            generatePublicationInterface(citeKey, pathToBibFile)
def processAllRecords(pathToMemex):
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    for citeKey, pathToBibFile in files.items():
        #print(citeKey)
        generatePublicationInterface(citeKey, pathToBibFile)
    generateMemexStartingPages(pathToMemex)
def tfidfPublications(pathToMemex, PageOrPubl):  #create the tfidf dictionary
    print("\tProcessing: %s" % PageOrPubl)  #print whether publications or page clusters are processed
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")  #generates a dictionary with citeKeys as keys and paths to json files as values
    citeKeys = list(ocrFiles.keys())  #[:500] #creates a list with the citeKeys

    print("\taggregating texts into documents...")  #print to inform about the processing
    corpusDic = {}  #creates an empty dictionary
    for citeKey in citeKeys:  #loops through the citeKeys
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))  #loads the OCRed document
        for page, text in docData.items():  #loops through the pages of the OCRed document
            # text as a document
            if PageOrPubl == "publications":  #if we aggregate by publications
                if citeKey not in corpusDic:  #if the citeKey is not yet in corpusDic
                    corpusDic[citeKey] = []  #create an empty list for this citeKey
                corpusDic[citeKey].append(text)  #append the page text

            # page cluster as a document
            elif PageOrPubl == "pages":  #if we aggregate by page clusters
                pageNum = int(page)  #the page number as an integer
                citeKeyNew = "%s_%05d" % (citeKey, roundUp(pageNum, clusterSize))  #creates a new citeKey from the citeKey, the page number and the cluster size
                if citeKeyNew not in corpusDic:  #if it is not yet in the dictionary
                    corpusDic[citeKeyNew] = []  #create a new entry
                corpusDic[citeKeyNew].append(text)  #append the text to citeKeyNew

                # add the last page of cluster N to cluster N+1
                if pageNum % clusterSize == 0:  #if the page number is divisible by the cluster size
                    citeKeyNew = "%s_%05d" % (citeKey, roundUp(pageNum + 1, clusterSize))  #creates a new citeKey for the next cluster
                    if citeKeyNew not in corpusDic:  #if citeKeyNew is not yet in the dictionary
                        corpusDic[citeKeyNew] = []  #create a new entry in corpusDic
                    corpusDic[citeKeyNew].append(text)  #append the text to it
            else:
                sys.exit("`PageOrPubl` parameter must be `publications` or `pages`")  #otherwise, exit the program

    print("\t%d documents (%s) generated..." % (len(corpusDic), PageOrPubl)
          )  #print documents are generated and pages of the publication
    print("\tpreprocessing the corpus...")  #print processing of the corpus

    docList = []  #create an empty list for the document texts
    docIdList = []  #create an empty list for the citeKeys
    for docId, docText in corpusDic.items():
        if len(docText) > 2:  # a cluster of only two pages would mean dropping the last page
            doc = " ".join(docText)  #take the text of each document
            # clean doc
            doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
            doc = re.sub('\W+', ' ', doc)
            doc = re.sub('_+', ' ', doc)
            doc = re.sub('\d+', ' ', doc)
            doc = re.sub(' +', ' ', doc)  #clean the content with regular expressions, especially removing unnecessary blanks and characters
            # we can also drop documents with a small number of words
            # (for example, when there are many illustrations)
            # let's drop clusters that have less than 1,000 words (average for 6 pages ±2500-3000 words)
            if len(doc.split(" ")) > 1000:
                # update lists
                docList.append(doc)  #add the content of each document to the first list
                docIdList.append(docId)  #add the citeKey of each document to the second list

    # PART 3: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")  #print the phrase
    #stopWords = functions.loadMultiLingualStopWords(["eng", "deu", "fre", "spa"])
    vectorizer = CountVectorizer(
        ngram_range=(1, 1), min_df=5, max_df=0.5, stop_words=stopwordsList
    )  #create a vectorizer (use only unigrams, use only words that appear in at least five documents, use only words that appear in less than half of all documents)
    countVectorized = vectorizer.fit_transform(docList)  #create the vectors
    tfidfTransformer = TfidfTransformer(smooth_idf=True,
                                        use_idf=True)  #adjust the transformer
    vectorized = tfidfTransformer.fit_transform(
        countVectorized)  # generates a sparse matrix
    cosineMatrix = cosine_similarity(
        vectorized)  #generate a matrix with cosine distance values

    # PART 4: saving TFIDF --- only for publications!
    if PageOrPubl == "publications":  #only if PageOrPubl is "publications"
        print("\tsaving tfidf data...")  #report the next step
        tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())  #transform the matrix into a dataframe
        tfidfTable = tfidfTable.transpose()  #transpose rows and columns
        print("\ttfidfTable Shape: ", tfidfTable.shape)  #print the dataframe shape
        tfidfTableDic = tfidfTable.to_dict()  #create a dictionary with the tfidf values

        tfidfTableDicFilt = filterTfidfDictionary(tfidfTableDic, 0.05, "more")  #previously defined function that filters the tfidf dictionary, keeping only values higher than 0.05
        pathToSave = os.path.join(pathToMemex, "results_tfidf_%s.dataJson" % PageOrPubl)  #create the file path and file name
        with open(pathToSave, 'w', encoding='utf8') as f9:  #open pathToSave and write into it
            json.dump(tfidfTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)  #save the filtered tfidf dictionary as a json file

    # PART 4: saving cosine distances --- for both publications and page clusters
    print("\tsaving cosine distances data...")  #prints the phrase
    cosineTable = pd.DataFrame(
        cosineMatrix)  #the metrix transformed into dataframe
    print("\tcosineTable Shape: ",
          cosineTable.shape)  #prints the cosineTable shape
    cosineTable.columns = docIdList  #Takes the list with the citeKeys as columns
    cosineTable.index = docIdList  #Takes the list with the CiteKeys as index
    cosineTableDic = cosineTable.to_dict(
    )  #creates a dictionary with the cosine similarity

    tfidfTableDicFilt = filterTfidfDictionary(
        cosineTableDic, 0.25, "more"
    )  #previously defined function meassuring cosine similarities dictionary including only publications with a cosine similarity value higher than 0.25
    pathToSave = os.path.join(
        pathToMemex, "results_cosineDist_%s.dataJson" %
        PageOrPubl)  #creates the filepath and the filename
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(
            tfidfTableDicFilt,
            f9,
            sort_keys=True,
            indent=4,
            ensure_ascii=False
        )  #creates the json-File saving the filtered cosine similarities dict