def formatPublList(pathToMemex):  # define a function that formats the list of publications
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])  # collect the files with the OCRed pages
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")  # collect the bib files
    contentsList = []  # create an empty list
    for key, value in ocrFiles.items():  # loop through the OCRed publications
        if key in bibFiles:  # process only items that also have a bib record
            bibRecord = functions.loadBib(bibFiles[key])  # load the bibliographical data for this item
            bibRecord = bibRecord[key]  # take the record for this citation key
            relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")  # the relative path to the publication
            authorOrEditor = "[No data]"  # default when there is no author/editor information
            if "editor" in bibRecord:  # check if there is information about the editor
                authorOrEditor = bibRecord["editor"]  # insert it
            if "author" in bibRecord:  # the author, if present, takes precedence
                authorOrEditor = bibRecord["author"]  # insert it
            date = bibRecord["year"][:4]  # the year of the publication (first four characters of the year field)
            title = bibRecord["title"]  # the title
            # formatting template
            citeKey = '<div class="ID">[%s]</div>' % key  # the citation key
            publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)  # format the information about the publication
            search = unicodedata.normalize('NFKD', publication).encode('ascii', 'ignore')  # replace diacritical characters with their ASCII equivalents
            publication += " <div class='hidden'>%s</div>" % search  # repeat the information in a hidden div (used for filtering/searching)
            link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath  # add the link to the details page of the publication
            singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (link, citeKey, publication)  # collect the information in a single table row
            recordToAdd = singleItemTemplate.replace("{", "").replace("}", "")  # remove curly brackets left over from bib values
            contentsList.append(recordToAdd)  # add the single record to the contents list
    contents = "\n".join(sorted(contentsList))  # join the sorted contents list
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)  # replace the wildcard in the template with the actual content
    return final  # return the formatted table
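# The snippets in this collection all rely on a shared `functions` module that
# is not included here. The following is a minimal, hypothetical sketch of two
# of its helpers, inferred purely from how they are called above and below --
# not the actual module. `loadBib(pathToBib)` (not sketched) parses a .bib file
# and returns a dictionary of the form {citeKey: {field: value}}.
import os

def dicOfRelevantFiles(pathToMemex, extension):
    # walk the memex folder and collect paths to all files with the given
    # extension, keyed by file name with the extension stripped
    dic = {}
    for root, dirs, files in os.walk(pathToMemex):
        for file in files:
            if file.endswith(extension):
                # NB: when called with "bib" (no dot), the key keeps a trailing
                # dot ("CiteKey."), which is why several snippets trim it with k[:-1]
                dic[file.replace(extension, "")] = os.path.join(root, file)
    return dic

def generatePublPath(pathToMemex, citeKey):
    # publications are stored in nested folders derived from the citation key,
    # e.g. `SavantMuslims2003` -> `s/sa/SavantMuslims2003`
    temp = citeKey.lower()
    return os.path.join(pathToMemex, temp[0], temp[:2], citeKey)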
def generateContentsList():
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    contentsList = []
    for k, v in relDic.items():
        k = k[:-1]  # drop the trailing dot left over from matching on "bib" without the dot
        bibDic = functions.loadBib(v)
        authorOrEditor = "[No data]"
        if "editor" in bibDic[k]:
            authorOrEditor = bibDic[k]["editor"]
        if "author" in bibDic[k]:
            authorOrEditor = bibDic[k]["author"]
        publication = "{0} ({1}) <i>{2}</i>".format(authorOrEditor, bibDic[k]["date"], bibDic[k]["title"])
        search = unicodedata.normalize('NFKD', publication).encode('ascii', 'ignore')
        publication += " <div class=\"hidden\">{0}</div>".format(search)
        contentsList.append(
            "<tr><td><div class=\"ID\"><a href=\"{0}/pages/DETAILS.html\">[{1}]</a></div> {2}</td></tr>"
            .format(os.path.join(k[0], k[:2], k), k, publication))
    contentsListSorted = sorted(contentsList)
    contentsList = "".join(contentsListSorted)
    mainElement = publicationsTemplate.replace("@TABLECONTENTS@", contentsList)
    return mainElement
def processAll(path_to_memex):
    pathData = functions.dicOfRelevantFiles(path_to_memex, ".bib")  # the original used `memexPath` here, ignoring the function's own argument
    print(pathData)
    #bibData = functions.loadBib(settings["bib_all"])
    for k, v in pathData.items():
        generatePublicationInterface(k, v)
def search():
    ## load OCR results
    ocrFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    #word = input("Please enter a word: ")
    word = "christianity"  # hardcoded test word; uncomment the line above for interactive input
    dicOfMatches = {}  # dictionary with citeKeys as keys and matching page numbers as values
    ## loop through OCR results
    for citeKey, pathToJson in ocrFiles.items():
        val = json.load(open(pathToJson, "r", encoding="utf8"))  # dictionary with key: page number and value: page content
        for pageNumber, pageContent in val.items():
            if word in pageContent:
                dicOfMatches.setdefault(citeKey, []).append(pageNumber)
                print("yes")
            else:
                print("not in the page")
    print(dicOfMatches)
def searchOCRresults(pathToMemex, searchString):
    print("SEARCHING FOR: `%s`" % searchString)  # to keep track of what we are doing
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")  # takes every file with OCR results
    results = {}  # create the results dictionary
    for citationKey, pathToJSON in files.items():  # loop through the dictionary of all the files
        data = json.load(open(pathToJSON))  # load the OCR results
        #print(citationKey)
        count = 0
        for pageNumber, pageText in data.items():  # loop through the pages of this specific file
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE):  # search; the flag tells re to ignore case
                if citationKey not in results:
                    results[citationKey] = {}
                # relative path
                a = citationKey.lower()
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber)  # path to the HTML version of the page; clicking the link opens that specific page
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE))  # count how many matches are on the page, again ignoring case
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE)  # wrap each match in HTML and assign a class that is styled in the CSS file
                results[citationKey][pageNumber] = {}  # create an empty dictionary for each page
                results[citationKey][pageNumber]["pathToPage"] = relPath  # add the path to the page
                results[citationKey][pageNumber]["matches"] = countM  # the number of matches
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>")  # the formatted page with highlights
                count += 1  # count the pages with matches
        if count > 0:  # reformat the results; not strictly necessary, but it helps with organizing the search results
            print("\t", citationKey, " : ", count)  # keep track of what is going on
            newKey = "%09d::::%s" % (count, citationKey)  # create a new key for each publication, combining frequency and citation key
            results[newKey] = results.pop(citationKey)  # pop removes the old item and lets us re-insert it under the new key in one step
    # add time stamp (requires the datetime library)
    currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # format the current time as a string
    results["timestamp"] = currentTime  # add the timestamp as an extra item to the dictionary
    # add search string (as submitted)
    results["searchString"] = searchString  # add the search string to the dictionary
    # save the results: strip all non-word characters from the search string and use the rest as the file name
    saveWith = re.sub(r"\W+", "", searchString)
    saveTo = os.path.join(pathToMemex, "searches", "%s.searchResults" % saveWith)  # save into the `searches` subfolder, with a unique extension so existing files are not clobbered
    with open(saveTo, 'w', encoding='utf8') as f9c:  # save the results
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)  # json.dump because results is a dictionary
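# A quick, self-contained illustration of the regex technique used above (the
# sample text is hypothetical, not part of the original snippets): `\b` anchors
# the match at word boundaries, so searching for "art" does not match
# "cartography", and the capture group `(...)` with `\1` lets re.sub wrap each
# match in a span while preserving its original case.
import re

sampleText = "Art history. The ART of medieval cartography is not about art alone."
print(len(re.findall(r"\bart\b", sampleText, flags=re.IGNORECASE)))
# -> 3 ("Art", "ART", "art"; "cartography" is not counted)
print(re.sub(r"\b(art)\b", r"<span class='searchResult'>\1</span>", sampleText, flags=re.IGNORECASE))
# -> each whole-word match is wrapped, with its original capitalization intact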
def processAllRecords():
    # call dicOfRelevantFiles with memexPath and save the returned dictionary to relDic
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    # loop through all items of the dictionary relDic
    for k, v in relDic.items():
        # call generatePublicationInterface with k (minus the trailing dot) and v
        generatePublicationInterface(k[:-1], v)
def search(searchArgument):
    targetFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    citeKeys = list(targetFiles.keys())
    #searchArgument = input("What are you looking for: ")
    results = {}
    for citeKey in citeKeys:  # loop through all the keys
        docData = json.load(open(targetFiles[citeKey], "r", encoding="utf8"))  # load the json file with the OCR results
        for k, v in docData.items():  # keys = page numbers, values = text
            if searchArgument in v:  # if the search argument is on the page
                matchCounter = len(re.findall(searchArgument, v))  # count how often
                if citeKey not in results:  # create an empty sub-dictionary only if there is not already one
                    results[citeKey] = {}
                results[citeKey][k] = {}  # create a sub-dictionary with the page number as key
                results[citeKey][k]["matches"] = matchCounter  # store the number of matches under the key "matches"
                pagePath = os.path.join(functions.generatePublPath(memexPath, citeKey), "pages", k + ".html")  # path to the HTML file of the page (the original used "pages\\", which only worked on Windows)
                results[citeKey][k]["pathToPage"] = pagePath
                results[citeKey][k]["result"] = v  # add the OCRed text to the dictionary
    with open("search.txt", 'w', encoding='utf8') as f9:  # also save the results into a file
        json.dump(results, f9, sort_keys=True, indent=4, ensure_ascii=False)
    return results
def formatPublList(pathToMemex):
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, settings["ocr_results"])
    bibFiles = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    contentsList = []
    for key, value in ocrFiles.items():
        if key in bibFiles:
            bibRecord = functions.loadBib(bibFiles[key])
            bibRecord = bibRecord[key]
            relativePath = functions.generatePublPath(pathToMemex, key).replace(pathToMemex, "")
            authorOrEditor = "[No data]"
            if "editor" in bibRecord:
                authorOrEditor = bibRecord["editor"]
            if "author" in bibRecord:
                authorOrEditor = bibRecord["author"]
            date = "nodate"
            if "year" in bibRecord:
                date = bibRecord["year"]
            title = bibRecord["title"]
            # formatting template
            citeKey = '<div class="ID">[%s]</div>' % key
            publication = '%s (%s) <i>%s</i>' % (authorOrEditor, date, title)
            search = unicodedata.normalize('NFKD', publication).encode('ascii', 'ignore')
            publication += " <div class='hidden'>%s</div>" % search
            link = '<a href="%s/pages/DETAILS.html"><i>read</i></a>' % relativePath
            singleItemTemplate = '<tr><td>%s</td><td>%s %s</td></tr>' % (link, citeKey, publication)
            recordToAdd = singleItemTemplate.replace("{", "").replace("}", "")
            contentsList.append(recordToAdd)
    contents = "\n".join(sorted(contentsList))
    final = publicationsTemplate.replace("@TABLECONTENTS@", contents)
    return final
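# A minimal sketch of how the table produced by formatPublList might be written
# into the memex start page. The settings key "template_index" and the
# @MAINCONTENT@ wildcard are assumptions modeled on the template conventions
# used elsewhere in these snippets, not confirmed settings.
def generateIndexPage(pathToMemex):
    with open(settings["template_index"], "r", encoding="utf8") as f1:
        indexTmpl = f1.read()
    indexPage = indexTmpl.replace("@MAINCONTENT@", formatPublList(pathToMemex))
    with open(os.path.join(pathToMemex, "index.html"), "w", encoding="utf8") as f2:
        f2.write(indexPage)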
def processAllRecords(pathToMemex):  # define the process-all-records function
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")  # take the bib files
    for citeKey, pathToBibFile in files.items():  # loop through them
        if os.path.exists(pathToBibFile.replace(".bib", ".json")):  # process only items that have both .bib and .json files
            generatePublicationInterface(citeKey, pathToBibFile)  # start the function
def formatSearches(pathToMemex):
    with open(settings["template_search"], "r", encoding="utf8") as f1:
        indexTmpl = f1.read()
    dof = functions.dicOfRelevantFiles(pathToMemex, ".searchResults")  # returns a dictionary of paths to all files with the .searchResults extension
    # format individual search pages
    toc = []
    for file, pathToFile in dof.items():
        searchResults = []
        data = json.load(open(pathToFile, "r", encoding="utf8"))
        # collect toc
        template = "<tr> <td>%s</td> <td>%s</td> <td>%s</td> <td>%s</td></tr>"
        # variables
        linkToSearch = os.path.join("searches", file + ".html")
        pathToPage = '<a href="%s"><i>read</i></a>' % linkToSearch
        searchString = '<div class="searchString">%s</div>' % data.pop("searchString")
        timeStamp = data.pop("timestamp")
        tocItem = template % (pathToPage, searchString, len(data), timeStamp)
        toc.append(tocItem)
        # generate the results page
        keys = sorted(data.keys(), reverse=True)
        for k in keys:
            searchResSingle = []
            results = data[k]
            temp = k.split("::::")
            header = "%s (pages with results: %d)" % (temp[1], int(temp[0]))
            #print(header)
            for page, excerpt in results.items():
                #print(excerpt["result"])
                pdfPage = int(page)
                linkToPage = '<a href="../%s"><i>go to the original page...</i></a>' % excerpt["pathToPage"]
                searchResSingle.append("<li><b><hr>(pdfPage: %d)</b><hr> %s <hr> %s </li>" % (pdfPage, excerpt["result"], linkToPage))
            searchResSingle = "<ul>\n%s\n</ul>" % "\n".join(searchResSingle)
            searchResSingle = generalTemplate.replace("@ELEMENTHEADER@", header).replace("@ELEMENTCONTENT@", searchResSingle)
            searchResults.append(searchResSingle)
        searchResults = "<h2>SEARCH RESULTS FOR: <i>%s</i></h2>\n\n" % searchString + "\n\n".join(searchResults)
        with open(pathToFile.replace(".searchResults", ".html"), "w", encoding="utf8") as f9:
            f9.write(indexTmpl.replace("@MAINCONTENT@", searchResults))
        #os.remove(pathToFile)
    #input("\n".join(toc))
    toc = searchesTemplate.replace("@TABLECONTENTS@", "\n".join(toc))
    return toc
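# The formatting functions in this collection assume a handful of HTML template
# strings with @WILDCARD@ placeholders (publicationsTemplate, searchesTemplate,
# generalTemplate). The originals are not included here; the strings below are
# hypothetical stand-ins that only illustrate the mechanism -- a wildcard that
# gets swapped for generated content. searchesTemplate would follow the same
# pattern as publicationsTemplate, with its own table columns.
publicationsTemplate = """
<button class="collapsible">PUBLICATIONS</button>
<div class="content">
<table id="publicationsTable">
@TABLECONTENTS@
</table>
</div>
"""

generalTemplate = """
<button class="collapsible">@ELEMENTHEADER@</button>
<div class="content">
@ELEMENTCONTENT@
</div>
"""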
def searchOCRresults(pathToMemex, searchString):
    print("SEARCHING FOR: `%s`" % searchString)
    # run the function (from the previous step) that creates a dictionary "files" of the paths to the json files; create an empty results dictionary
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")
    results = {}
    # loop over each item of the dictionary "files", loading the json data behind each path
    for citationKey, pathToJSON in files.items():
        data = json.load(open(pathToJSON))
        #print(citationKey)
        count = 0
        # for each page, i.e. each "page number" key in the json file, search for matches of the search string and save them by citation key and page number
        for pageNumber, pageText in data.items():
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE):
                if citationKey not in results:
                    results[citationKey] = {}
                # relative path
                a = citationKey.lower()
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber)
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE))
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE)
                results[citationKey][pageNumber] = {}
                results[citationKey][pageNumber]["pathToPage"] = relPath
                results[citationKey][pageNumber]["matches"] = countM
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>")
                count += 1
        if count > 0:
            print("\t", citationKey, " : ", count)
            newKey = "%09d::::%s" % (count, citationKey)
            results[newKey] = results.pop(citationKey)
    # add time stamp
    currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    results["timestamp"] = currentTime
    # add search string (as submitted)
    results["searchString"] = searchString
    # save the results under a file name derived from the search string (non-word characters stripped), dumped as json
    saveWith = re.sub(r"\W+", "", searchString)
    saveTo = os.path.join(pathToMemex, "searches", "%s.searchResults" % saveWith)
    with open(saveTo, 'w', encoding='utf8') as f9c:
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)
def searchOCRresults(pathToResults, searchString):
    print("SEARCHING FOR: `%s`" % searchString)
    files = functions.dicOfRelevantFiles(memexPath, ".json")  # the original used `pathToMemex`, which is not defined in this function; this version relies on the module-level memexPath
    results = {}
    for citationKey, pathToJSON in files.items():
        data = json.load(open(pathToJSON, "r", encoding="utf8"))
        #print(citationKey)
        count = 0
        for pageNumber, pageText in data.items():
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE):
                if citationKey not in results:
                    results[citationKey] = {}
                # relative path
                a = citationKey.lower()
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber)
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE))
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE)
                results[citationKey][pageNumber] = {}
                results[citationKey][pageNumber]["pathToPage"] = relPath
                results[citationKey][pageNumber]["matches"] = countM
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>")
                count += 1
        if count > 0:
            print("\t", citationKey, " : ", count)
            newKey = "%09d::::%s" % (count, citationKey)
            results[newKey] = results.pop(citationKey)
    # add time stamp
    currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    results["timestamp"] = currentTime
    # add search string (as submitted)
    results["searchString"] = searchString
    saveWith = re.sub(r"\W+", "", searchString)
    saveTo = os.path.join(pathToResults, "%s.searchResults" % saveWith)
    print(saveTo)
    with open(saveTo, 'w', encoding='utf8') as f9c:
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)
def processAllRecords(pathToMemex):  # the last function: processes all of the records, with pathToMemex as its argument
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")  # get all of the bib files
    for citeKey, pathToBibFile in files.items():  # loop through the dictionary of citation keys and bib file paths (the original spelled the loop variable `citekey`, leaving `citeKey` below undefined)
        #print(citeKey)
        generatePublicationInterface(citeKey, pathToBibFile)
    generateMemexStartingPages(pathToMemex)  # generate the starting pages
def search():
    ## load OCR results
    ocrFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    word = input("Please enter a word: ")
    dicOfMatches = {}  # dictionary with citeKeys as keys and matching page numbers as values
    for citeKey, pathToJson in ocrFiles.items():
        val = json.load(open(pathToJson, "r", encoding="utf8"))  # load each json file: page numbers as keys, page text as values
        # NB: the original draft used list(val), which yields the keys again, not the page texts;
        # iterating over val.items() gives both at once
        matchingPages = [page for page, text in val.items() if word in text]
        if matchingPages:
            dicOfMatches[citeKey] = matchingPages
    print(dicOfMatches)
def formatSearches(pathToMemex):  # define a function to format the searches
    with open(settings["template_search"], "r", encoding="utf8") as f1:  # open the file behind the "template_search" setting
        indexTmpl = f1.read()  # read the search template
    dof = functions.dicOfRelevantFiles(pathToMemex, ".searchResults")  # choose the files with the search results
    # format individual search pages
    toc = []  # create an empty list
    for file, pathToFile in dof.items():  # loop through all the files with searches
        searchResults = []  # create an empty list
        data = json.load(open(pathToFile, "r", encoding="utf8"))  # load the file with the search results
        # collect toc
        template = "<tr> <td>%s</td> <td>%s</td> <td>%s</td> <td>%s</td></tr>"  # the format of the table row
        # variables
        linkToSearch = os.path.join("searches", file + ".html")  # the link to the search page, with the .html extension
        pathToPage = '<a href="%s"><i>read</i></a>' % linkToSearch  # the link in the table to the HTML file with our search results
        searchString = '<div class="searchString">%s</div>' % data.pop("searchString")  # take the search string out of the file with the search results
        timeStamp = data.pop("timestamp")  # take the timestamp out as well
        tocItem = template % (pathToPage, searchString, len(data), timeStamp)  # insert the variables into the template
        toc.append(tocItem)  # add the row to the table of contents
        # generate the results page
        keys = sorted(data.keys(), reverse=True)  # sort the citation keys by the number of pages with results, in reverse order
        for k in keys:  # loop through the citation keys
            searchResSingle = []  # create an empty list
            results = data[k]  # the results for this publication
            temp = k.split("::::")  # split the key into the number of pages with results and the citation key
            header = "%s (pages with results: %d)" % (temp[1], int(temp[0]))  # create a header for each publication with its citation key and the number of pages with results
            #print(header)
            for page, excerpt in results.items():  # loop through the results
                #print(excerpt["result"])
                pdfPage = int(page)  # the page with the search string
                linkToPage = '<a href="../%s"><i>go to the original page...</i></a>' % excerpt["pathToPage"]  # add a link to the original page with the search result
                searchResSingle.append("<li><b><hr>(pdfPage: %d)</b><hr> %s <hr> %s </li>" % (pdfPage, excerpt["result"], linkToPage))  # add the text and the link to the list
            searchResSingle = "<ul>\n%s\n</ul>" % "\n".join(searchResSingle)  # join the single pages together
            searchResSingle = generalTemplate.replace("@ELEMENTHEADER@", header).replace("@ELEMENTCONTENT@", searchResSingle)  # replace the wildcards in the general template
            searchResults.append(searchResSingle)  # append the results of the search
        searchResults = "<h2>SEARCH RESULTS FOR: <i>%s</i></h2>\n\n" % searchString + "\n\n".join(searchResults)  # create a header for the HTML page and join the search results
        with open(pathToFile.replace(".searchResults", ".html"), "w", encoding="utf8") as f9:
            f9.write(indexTmpl.replace("@MAINCONTENT@", searchResults))  # create the HTML page
        #os.remove(pathToFile)
    #input("\n".join(toc))
    toc = searchesTemplate.replace("@TABLECONTENTS@", "\n".join(toc))  # replace the wildcard in the table of contents
    return toc  # return it
def searchOCRresults(pathToMemex, searchString):
    print("SEARCHING FOR: `%s`" % searchString)
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")  # returns a dictionary of all the json files, with citation keys as keys and paths as values
    results = {}
    for citationKey, pathToJSON in files.items():  # loop through all of them
        data = json.load(open(pathToJSON))  # load the current json file -> OCRed text
        #print(citationKey)
        count = 0  # count of pages with matches
        for pageNumber, pageText in data.items():  # page number as key, text as value
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE):  # search each page for the search string
                if citationKey not in results:
                    results[citationKey] = {}  # if there is no entry in the results dictionary yet, create an empty sub-dictionary with the citation key as key
                # relative path
                a = citationKey.lower()  # lowercased citation key
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber)  # create the path to the HTML page for the matching page number
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE))  # count all matches on the page
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE)  # highlight the search string in the results
                results[citationKey][pageNumber] = {}  # create an empty dictionary with the page number as key; all other data goes into it
                results[citationKey][pageNumber]["pathToPage"] = relPath  # add the path
                results[citationKey][pageNumber]["matches"] = countM  # add the count
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>")  # add the text
                count += 1  # count this page as one with results
        if count > 0:  # if there are results
            print("\t", citationKey, " : ", count)  # print how many pages matched in this publication
            newKey = "%09d::::%s" % (count, citationKey)
            results[newKey] = results.pop(citationKey)  # prepend the number of matching pages to the citation key
    # add time stamp
    currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # the current time
    results["timestamp"] = currentTime  # added as a timestamp
    # add search string (as submitted)
    results["searchString"] = searchString  # add the search string
    saveWith = re.sub(r"\W+", "", searchString)
    saveTo = os.path.join(pathToMemex, "searches", "%s.searchResults" % saveWith)
    with open(saveTo, 'w', encoding='utf8') as f9c:  # save the search under a recognizable name
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)
def processAllRecordsSTR(pathToMemex):
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    citeKeys = list(files.keys())
    random.shuffle(citeKeys)
    for citeKey in citeKeys:
        print(citeKey)
        bibData = functions.loadBib(files[citeKey])
        if "pagetotal" in bibData[citeKey]:  # the original checked `bibData` directly, but loadBib returns a dictionary keyed by citation key
            pageTotal = int(bibData[citeKey]["pagetotal"])
            if pageTotal <= int(settings["page_limit"]):
                language = functions.identifyLanguage(bibData[citeKey], "eng")
                ocrPublication(citeKey, language, settings["page_limit"])
            else:  # NB: both branches currently run the same OCR call with the same page limit
                language = functions.identifyLanguage(bibData[citeKey], "eng")
                ocrPublication(citeKey, language, settings["page_limit"])
    functions.memexStatusUpdates(settings["path_to_memex"], ".pdf")
    functions.memexStatusUpdates(settings["path_to_memex"], ".bib")
    functions.memexStatusUpdates(settings["path_to_memex"], ".png")
    functions.memexStatusUpdates(settings["path_to_memex"], ".json")
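# `functions.identifyLanguage` is called above but not included in this
# collection. A plausible minimal sketch, based only on how it is called
# (bib record dictionary in, tesseract-style language code out, with a
# fallback value); the langid-to-tesseract mapping below is illustrative,
# not the actual table.
def identifyLanguage(bibRecDic, fallback):
    langMapping = {"english": "eng", "german": "deu", "french": "fra"}  # hypothetical subset
    if "langid" in bibRecDic and bibRecDic["langid"].lower() in langMapping:
        return langMapping[bibRecDic["langid"].lower()]
    return fallback  # default when the bib record carries no usable language information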
def generateSearchList():
    searchFiles = functions.dicOfRelevantFiles(memexPath, "searchResults")
    queryKeys = sorted(list(searchFiles.keys()))
    searchList = []
    for queryKey in queryKeys:
        docData = json.load(open(searchFiles[queryKey]))
        searchList.append(
            "<tr><td><div class=\"searchString\"><a href=\"search/{0}.html\">{1}</a></div></td><td>{2}</td><td>{3}</td></tr>"
            .format(re.sub(r"\W+", "", docData["searchString"]),
                    docData["searchString"],
                    len(docData) - 2,  # minus 2 because "searchString" and "timestamp" are not result entries
                    docData["timestamp"]))
    searchListSorted = sorted(searchList)
    searchList = "".join(searchListSorted)
    mainElement = searchesTemplate.replace("@TABLECONTENTS@", searchList)
    createSearchResultPages()
    return mainElement
def createSearchResultPages():
    with open(settings["template_search"], "r", encoding="utf8") as ft:
        template = ft.read()
    dof = functions.dicOfRelevantFiles(memexPath, ".searchResults")
    for file, pathToFile in dof.items():
        data = json.load(open(pathToFile))
        contentsList = []
        searchString = data["searchString"]
        data.pop("timestamp")
        data.pop("searchString")
        keys = sorted(data.keys(), reverse=True)
        for citekey in keys:
            recordToAdd = generalTemplate
            temp = citekey.split("::::")
            buttonHeader = '<b>{0}</b> (pages with results: {1})'.format(temp[1], int(temp[0]))
            recordToAdd = recordToAdd.replace("@ELEMENTHEADER@", buttonHeader)
            linkList = []
            pages = data[citekey]
            for page, results in pages.items():
                itemToAdd = '<li><hr><b>(pdfPage: {0})</b><hr>{1}<hr> <a href="../{2}"><i>go to the original page...</i></a></li>'.format(page, results["result"], results["pathToPage"])
                linkList.append(itemToAdd)
            listContent = "\n<ul>\n%s\n</ul>\n" % "\n".join(linkList)
            recordToAdd = recordToAdd.replace("@ELEMENTCONTENT@", listContent)
            contentsList.append(recordToAdd)
        contents = "".join(contentsList)
        mainContent = "<h1>SEARCH RESULTS FOR: <i><div class='searchString'>" + searchString + "</div></i></h1>\n\n" + contents
        saveWith = re.sub(r"\W+", "", searchString)
        directory = os.path.join(memexPath, "search", saveWith + ".html")
        with open(directory, "w", encoding="utf8") as f9:
            f9.write(template.replace("@MAINCONTENT@", mainContent))
def generateContentsPage():
    # load contents template
    with open(settings["template_contents"], "r", encoding="utf8") as ft:
        template = ft.read()
    # call dicOfRelevantFiles with memexPath and save the returned dictionary to relDic
    relDic = functions.dicOfRelevantFiles(memexPath, "bib")
    # create the list linkList
    linkList = []
    # loop through all items of the dictionary relDic
    for k, v in relDic.items():
        # remove the last character (the trailing dot)
        k = k[:-1]
        # call loadBib with v and save the returned record to bibDic
        bibDic = functions.loadBib(v)
        # append a link for this publication to linkList
        linkList.append(
            "<a href=\"{0}/pages/DETAILS.html\">[{1}]</a> {2} ({3}) - <i>{4}</i>"
            .format(os.path.join(k[0], k[:2], k), k, bibDic[k]["author"], bibDic[k]["date"], bibDic[k]["title"]))
    # sort the list linkList
    linkListSorted = sorted(linkList)
    # join the items of linkListSorted with </li><li> and store the result in a string
    linkList = "</li><li>".join(linkListSorted)
    # copy the template to pageTemp
    pageTemp = template
    # replace @MAINCONTENT@ with linkList
    pageTemp = pageTemp.replace("@MAINCONTENT@", linkList)
    # path to contents.html
    directory = os.path.join(memexPath, "contents.html")
    # create the file contents.html
    with open(directory, "w", encoding="utf8") as f2:
        f2.write(pageTemp)
def tfidfPublications(pathToMemex, PageOrPubl):
    print("\tProcessing: %s" % PageOrPubl)
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  #[:500]
    print("\taggregating texts into documents...")
    corpusDic = {}
    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey]))
        for page, text in docData.items():
            # text as a document
            if PageOrPubl == "publications":
                if citeKey not in corpusDic:
                    corpusDic[citeKey] = []
                corpusDic[citeKey].append(text)
            # page cluster as a document
            elif PageOrPubl == "pages":
                pageNum = int(page)
                citeKeyNew = "%s_%05d" % (citeKey, roundUp(pageNum, clusterSize))
                if citeKeyNew not in corpusDic:
                    corpusDic[citeKeyNew] = []
                corpusDic[citeKeyNew].append(text)
                # add the last page of cluster N to cluster N+1
                if pageNum % clusterSize == 0:
                    citeKeyNew = "%s_%05d" % (citeKey, roundUp(pageNum + 1, clusterSize))
                    if citeKeyNew not in corpusDic:
                        corpusDic[citeKeyNew] = []
                    corpusDic[citeKeyNew].append(text)
            else:
                sys.exit("`PageOrPubl` parameter must be `publications` or `pages`")
    print("\t%d documents (%s) generated..." % (len(corpusDic), PageOrPubl))

    # PART 2: preprocessing the corpus
    print("\tpreprocessing the corpus...")
    docList = []
    docIdList = []
    for docId, docText in corpusDic.items():
        if len(docText) > 2:  # a cluster of only two pages would mean dropping the one remaining page
            doc = " ".join(docText)
            # clean doc
            doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
            doc = re.sub(r'\W+', ' ', doc)
            doc = re.sub(r'_+', ' ', doc)
            doc = re.sub(r'\d+', ' ', doc)
            doc = re.sub(r' +', ' ', doc)
            # we can also drop documents with a small number of words
            # (for example, when there are many illustrations);
            # drop clusters that have less than 1,000 words (6 pages average ±2,500-3,000 words)
            if len(doc.split(" ")) > 1000:
                # update lists
                docList.append(doc)
                docIdList.append(docId)

    # PART 3: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    stopWords = functions.loadMultiLingualStopWords(["eng", "deu", "fre", "spa"])
    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=5, max_df=0.5, stop_words=stopWords)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(countVectorized)  # generates a sparse matrix
    cosineMatrix = cosine_similarity(vectorized)

    # PART 4: saving TFIDF --- only for publications!
    if PageOrPubl == "publications":
        print("\tsaving tfidf data...")
        tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())
        tfidfTable = tfidfTable.transpose()
        print("\ttfidfTable Shape: ", tfidfTable.shape)
        tfidfTableDic = tfidfTable.to_dict()
        tfidfTableDicFilt = filterTfidfDictionary(tfidfTableDic, 0.05, "more")
        pathToSave = os.path.join(pathToMemex, "results_tfidf_%s.dataJson" % PageOrPubl)
        with open(pathToSave, 'w', encoding='utf8') as f9:
            json.dump(tfidfTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)

    # PART 5: saving cosine distances --- for both publications and page clusters
    print("\tsaving cosine distances data...")
    cosineTable = pd.DataFrame(cosineMatrix)
    print("\tcosineTable Shape: ", cosineTable.shape)
    cosineTable.columns = docIdList
    cosineTable.index = docIdList
    cosineTableDic = cosineTable.to_dict()
    cosineTableDicFilt = filterTfidfDictionary(cosineTableDic, 0.25, "more")
    pathToSave = os.path.join(pathToMemex, "results_cosineDist_%s.dataJson" % PageOrPubl)
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(cosineTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)
def tfidfPublications(pathToMemex):
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  #[:500]
    print("\taggregating texts into documents...")
    docList = []
    docIdList = []
    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey]))
        # IF YOU ARE ON WINDOWS, THE LINE SHOULD BE:
        # docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        docId = citeKey
        doc = " ".join(docData.values())
        # clean doc
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
        doc = re.sub(r'\W+', ' ', doc)
        doc = re.sub(r'_+', ' ', doc)
        doc = re.sub(r'\d+', ' ', doc)
        doc = re.sub(r' +', ' ', doc)
        # update lists
        docList.append(doc)
        docIdList.append(docId)
    print("\t%d documents generated..." % len(docList))

    # PART 2: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=5, max_df=0.5)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(countVectorized)  # generates a sparse matrix
    cosineMatrix = cosine_similarity(vectorized)

    # PART 3: saving TFIDF
    print("\tsaving tfidf data...")
    tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())
    tfidfTable = tfidfTable.transpose()
    print("\ttfidfTable Shape: ", tfidfTable.shape)
    tfidfTableDic = tfidfTable.to_dict()
    tfidfTableDicFilt = filterTfidfDictionary(tfidfTableDic, 0.05, "more")
    pathToSave = os.path.join(pathToMemex, "results_tfidf.dataJson")
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(tfidfTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)

    # PART 4: saving cosine distances
    print("\tsaving cosine distances data...")
    cosineTable = pd.DataFrame(cosineMatrix)
    print("\tcosineTable Shape: ", cosineTable.shape)
    cosineTable.columns = docIdList
    cosineTable.index = docIdList
    cosineTableDic = cosineTable.to_dict()
    cosineTableDicFilt = filterTfidfDictionary(cosineTableDic, 0.25, "more")
    pathToSave = os.path.join(pathToMemex, "results_cosineDist.dataJson")
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(cosineTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)
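# `filterTfidfDictionary` is called in several snippets above but not defined
# in this collection. A minimal sketch consistent with how it is called: a
# nested {docId: {term_or_docId: value}} dictionary goes in, along with a
# threshold and a "more"/"less" direction switch; self-matches (a document
# compared with itself in the cosine table) are skipped.
def filterTfidfDictionary(dictionary, threshold, moreOrLess):
    dictionaryFilt = {}
    for item1, values in dictionary.items():
        dictionaryFilt[item1] = {}
        for item2, value in values.items():
            if moreOrLess == "more":
                if value >= threshold and item1 != item2:
                    dictionaryFilt[item1][item2] = value
            elif moreOrLess == "less":
                if value <= threshold and item1 != item2:
                    dictionaryFilt[item1][item2] = value
        if dictionaryFilt[item1] == {}:
            del dictionaryFilt[item1]  # drop documents with no values left after filtering
    return dictionaryFilt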
def processAllRecords(pathToMemex):
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")  # take the bib files
    for citeKey, pathToBibFile in files.items():  # loop through them
        if os.path.exists(pathToBibFile.replace(".bib", ".json")):  # process only files that also have a .json version
            generatePublicationInterface(citeKey, pathToBibFile)  # execute the previous function
def generateTfIdfWordClouds(pathToMemex):
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")
    citeKeys = list(ocrFiles.keys())  #[:500]
    print("\taggregating texts into documents...")
    docList = []
    docIdList = []
    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="UTF8"))
        docId = citeKey
        doc = " ".join(docData.values())
        # clean doc
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
        doc = re.sub(r'\W+', ' ', doc)
        doc = re.sub(r'_+', ' ', doc)
        doc = re.sub(r'\d+', ' ', doc)
        doc = re.sub(r' +', ' ', doc)
        # update lists
        docList.append(doc)
        docIdList.append(docId)
    print("\t%d documents generated..." % len(docList))

    # PART 2: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=2, max_df=0.5)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(countVectorized)  # generates a sparse matrix
    print("\tconverting and filtering tfidf data...")
    tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())
    tfidfTable = tfidfTable.transpose()
    tfidfTableDic = tfidfTable.to_dict()
    tfidfTableDic = filterTfidfDictionary(tfidfTableDic, 0.02, "more")
    #tfidfTableDic = json.load(open("/Users/romanovienna/Dropbox/6.Teaching_New/BUILDING_MEMEX_COURSE/_memex_sandbox/_data/results_tfidf_publications.dataJson"))

    # PART 3: generating wordclouds
    print("\tgenerating wordclouds...")
    wc = WordCloud(width=1000, height=600, background_color="white", random_state=2,
                   relative_scaling=0.5,
                   #color_func=lambda *args, **kwargs: (179, 0, 0))  # single color
                   #colormap="copper")  # Oranges, Reds, YlOrBr, YlOrRd, OrRd; copper
                   colormap="gray")  # binary, gray
    # https://matplotlib.org/3.1.1/gallery/color/colormap_reference.html
    counter = len(tfidfTableDic)
    citeKeys = list(tfidfTableDic.keys())
    random.shuffle(citeKeys)
    for citeKey in citeKeys:
        savePath = functions.generatePublPath(pathToMemex, citeKey)
        savePath = os.path.join(savePath, "%s_wCloud.jpg" % citeKey)
        if not os.path.isfile(savePath):
            wc.generate_from_frequencies(tfidfTableDic[citeKey])
            # plotting
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            #plt.show()  # this line shows the plot
            plt.savefig(savePath, dpi=200, bbox_inches='tight')
            print("\t%s (%d left...)" % (citeKey, counter))
            counter -= 1
        else:
            print("\t%s --- already done" % citeKey)
            counter -= 1
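# A minimal usage sketch: the settings key "path_to_memex" appears in other
# snippets in this collection; wrapping the call in a __main__ guard is an
# assumption about how the script would be run, not part of the original.
if __name__ == "__main__":
    generateTfIdfWordClouds(settings["path_to_memex"])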
def searchOCRresults(pathToMemex, searchString):  # function to search the memex for specific keyword(s)
    print("SEARCHING FOR: `%s`" % searchString)  # print statement for convenience
    files = functions.dicOfRelevantFiles(pathToMemex, ".json")  # use the pre-defined function to build a dictionary of all the OCRed files
    results = {}  # empty dictionary
    for citationKey, pathToJSON in files.items():  # loop through the OCRed files individually, by citation key
        data = json.load(open(pathToJSON))  # load the OCRed file
        #print(citationKey)
        count = 0  # count variable
        for pageNumber, pageText in data.items():  # loop through the loaded file by page and text
            if re.search(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE):  # search for the search string in the text of the page; \b matches the empty string at the beginning or end of a word
                if citationKey not in results:  # create a new entry in the results dictionary if not already present
                    results[citationKey] = {}
                # relative path
                a = citationKey.lower()  # make the citation key lowercase
                relPath = os.path.join(a[:1], a[:2], citationKey, "pages", "%s.html" % pageNumber)  # create the path to the publication's HTML files
                countM = len(re.findall(r"\b%s\b" % searchString, pageText, flags=re.IGNORECASE))  # count of the matches on the page
                pageWithHighlights = re.sub(r"\b(%s)\b" % searchString, r"<span class='searchResult'>\1</span>", pageText, flags=re.IGNORECASE)  # change the HTML to highlight the search word(s)
                results[citationKey][pageNumber] = {}
                results[citationKey][pageNumber]["pathToPage"] = relPath  # add the path to the HTML page with matches to the results dictionary
                results[citationKey][pageNumber]["matches"] = countM  # save the count of matches on the page
                results[citationKey][pageNumber]["result"] = pageWithHighlights.replace("\n", "<br>")  # convert line breaks for HTML
                count += 1  # add 1 to the page count
        if count > 0:  # if at least one occurrence of the search word(s) is found
            print("\t", citationKey, " : ", count)  # print the citation key and the number of pages with matches in this publication
            newKey = "%09d::::%s" % (count, citationKey)  # new key combining count and citation key
            results[newKey] = results.pop(citationKey)  # replace the citation key with the new key
    # add time stamp
    currentTime = datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # get the current timestamp
    results["timestamp"] = currentTime  # add the time to the results dictionary
    results["searchString"] = searchString  # add the search string (as submitted)
    saveWith = re.sub(r"\W+", "", searchString)  # replace one or more non-word characters in the search string with an empty string
    saveTo = os.path.join(pathToMemex, "searches", "%s.searchResults" % saveWith)  # create the path to the folder in which the searches are saved
    with open(saveTo, 'w', encoding='utf8') as f9c:
        json.dump(results, f9c, sort_keys=True, indent=4, ensure_ascii=False)  # save the sorted search results into a new file
def generatetfidfValues():
    # dictionary with citation keys as keys and paths to the json files as values
    ocrFiles = functions.dicOfRelevantFiles(memexPath, ".json")
    # list with citation keys (in fixed order)
    citeKeys = list(ocrFiles.keys())
    docList = []
    docIdList = []
    #print(ocrFiles)
    #print(citeKeys)
    # loop through the list, not the dictionary, to keep the two lists aligned
    for citeKey in citeKeys:
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))
        #print(docData)
        docId = citeKey
        doc = " ".join(docData.values())
        doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
        doc = re.sub(r'\W+', ' ', doc)
        doc = re.sub(r'\d+', ' ', doc)
        doc = re.sub(r' +', ' ', doc)
        docList.append(doc)
        docIdList.append(docId)
    #print(docList)
    #print(docIdList)
    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=5, max_df=0.5, stop_words=stopwordsList)
    countVectorized = vectorizer.fit_transform(docList)
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    vectorized = tfidfTransformer.fit_transform(countVectorized)  # https://en.wikipedia.org/wiki/Sparse_matrix
    cosineMatrix = cosine_similarity(vectorized)
    tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())
    print("tfidfTable Shape: ", tfidfTable.shape)  # optional
    tfidfTable = tfidfTable.transpose()
    tfidfTableDic = tfidfTable.to_dict()
    cosineTable = pd.DataFrame(cosineMatrix)
    print("cosineTable Shape: ", cosineTable.shape)  # optional
    cosineTable.columns = docIdList
    cosineTable.index = docIdList
    cosineTableDic = cosineTable.to_dict()
    # filter both dictionaries by threshold and save them
    filteredDic = functions.filterDic(tfidfTableDic, 0.05)
    with open("tfidfTableDic_filtered.txt", 'w', encoding='utf8') as f9:
        json.dump(filteredDic, f9, sort_keys=True, indent=4, ensure_ascii=False)
    filteredDic = functions.filterDic(cosineTableDic, 0.25)
    with open("cosineTableDic_filtered.txt", 'w', encoding='utf8') as f9:
        json.dump(filteredDic, f9, sort_keys=True, indent=4, ensure_ascii=False)
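# `stopwordsList` is used above (and in a later snippet) but never defined in
# this collection. A minimal sketch of how such a list might be assembled,
# assuming one plain-text stopword file per language with one word per line;
# the folder name and file naming scheme are hypothetical.
import os

def loadMultiLingualStopWords(langCodes):
    stopwords = []
    for lang in langCodes:
        pathToFile = os.path.join("stopwords", "%s.txt" % lang)
        with open(pathToFile, "r", encoding="utf8") as f1:
            stopwords.extend(f1.read().strip().split("\n"))
    return list(set(stopwords))  # deduplicate across languages

stopwordsList = loadMultiLingualStopWords(["eng", "deu", "fre", "spa"])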
def processAll(pathToMemex):
    pathData = functions.dicOfRelevantFiles(pathToMemex, ".bib")  # the original used `memexPath`, bypassing the function's own argument
    print(pathData)
    for k, v in pathData.items():
        generatePublicationInterface(k, v)
def processAllRecords(pathToMemex):
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    for citeKey, pathToBibFile in files.items():
        if os.path.exists(pathToBibFile.replace(".bib", ".json")):
            generatePublicationInterface(citeKey, pathToBibFile)
def processAllRecords(pathToMemex):
    files = functions.dicOfRelevantFiles(pathToMemex, ".bib")
    for citeKey, pathToBibFile in files.items():
        #print(citeKey)
        generatePublicationInterface(citeKey, pathToBibFile)
    generateMemexStartingPages(pathToMemex)
def tfidfPublications(pathToMemex, PageOrPubl):  # create the tfidf dictionary
    print("\tProcessing: %s" % PageOrPubl)  # report whether publications or pages are being processed
    # PART 1: loading OCR files into a corpus
    ocrFiles = functions.dicOfRelevantFiles(pathToMemex, ".json")  # generates a dictionary with citation keys as keys and paths to json files as values
    citeKeys = list(ocrFiles.keys())  #[:500]  # a list with the citation keys
    print("\taggregating texts into documents...")  # report progress
    corpusDic = {}  # create an empty dictionary
    for citeKey in citeKeys:  # loop through the citation keys
        docData = json.load(open(ocrFiles[citeKey], "r", encoding="utf8"))  # load the OCRed document
        for page, text in docData.items():  # loop through the pages of the OCRed document
            # text as a document
            if PageOrPubl == "publications":  # whole publications as documents
                if citeKey not in corpusDic:  # if the citation key is not yet in corpusDic
                    corpusDic[citeKey] = []  # create an empty list for it
                corpusDic[citeKey].append(text)  # append the page text
            # page cluster as a document
            elif PageOrPubl == "pages":  # page clusters as documents
                pageNum = int(page)  # the page number as an integer
                citeKeyNew = "%s_%05d" % (citeKey, roundUp(pageNum, clusterSize))  # a new key combining the citation key and the cluster number
                if citeKeyNew not in corpusDic:  # if it is not yet in the dictionary
                    corpusDic[citeKeyNew] = []  # create an empty list for it
                corpusDic[citeKeyNew].append(text)  # append the page text
                # add the last page of cluster N to cluster N+1
                if pageNum % clusterSize == 0:  # if the page number is a multiple of the cluster size
                    citeKeyNew = "%s_%05d" % (citeKey, roundUp(pageNum + 1, clusterSize))  # the key of the next cluster
                    if citeKeyNew not in corpusDic:
                        corpusDic[citeKeyNew] = []
                    corpusDic[citeKeyNew].append(text)  # append the page text to the next cluster as well
            else:
                sys.exit("`PageOrPubl` parameter must be `publications` or `pages`")  # otherwise exit the program
    print("\t%d documents (%s) generated..." % (len(corpusDic), PageOrPubl))  # report how many documents were generated

    # PART 2: preprocessing the corpus
    print("\tpreprocessing the corpus...")
    docList = []  # a list for the document texts
    docIdList = []  # a list for the document identifiers (citation keys)
    for docId, docText in corpusDic.items():
        if len(docText) > 2:  # a cluster of only two pages would mean dropping the one remaining page
            doc = " ".join(docText)  # the text of each document
            # clean doc: fix hyphenation across line breaks, then remove non-word characters,
            # underscores, digits, and extra blanks with regular expressions
            doc = re.sub(r'(\w)-\n(\w)', r'\1\2', doc)
            doc = re.sub(r'\W+', ' ', doc)
            doc = re.sub(r'_+', ' ', doc)
            doc = re.sub(r'\d+', ' ', doc)
            doc = re.sub(r' +', ' ', doc)
            # we can also drop documents with a small number of words
            # (for example, when there are many illustrations);
            # drop clusters that have less than 1,000 words (6 pages average ±2,500-3,000 words)
            if len(doc.split(" ")) > 1000:
                # update lists
                docList.append(doc)  # the content of each document
                docIdList.append(docId)  # the citation key of each document

    # PART 3: calculate tfidf for all loaded publications and distances
    print("\tgenerating tfidf matrix & distances...")
    #stopWords = functions.loadMultiLingualStopWords(["eng", "deu", "fre", "spa"])
    vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=5, max_df=0.5, stop_words=stopwordsList)  # use only unigrams that appear in at least five documents, but in less than half of all documents
    countVectorized = vectorizer.fit_transform(docList)  # create the count vectors
    tfidfTransformer = TfidfTransformer(smooth_idf=True, use_idf=True)  # configure the transformer
    vectorized = tfidfTransformer.fit_transform(countVectorized)  # generates a sparse matrix
    cosineMatrix = cosine_similarity(vectorized)  # generate a matrix with cosine similarity values

    # PART 4: saving TFIDF --- only for publications!
    if PageOrPubl == "publications":
        print("\tsaving tfidf data...")
        tfidfTable = pd.DataFrame(vectorized.toarray(), index=docIdList, columns=vectorizer.get_feature_names())  # transform the matrix into a dataframe
        tfidfTable = tfidfTable.transpose()  # swap rows and columns so that documents become columns
        print("\ttfidfTable Shape: ", tfidfTable.shape)  # print the dataframe shape
        tfidfTableDic = tfidfTable.to_dict()  # create a dictionary with the tfidf values
        tfidfTableDicFilt = filterTfidfDictionary(tfidfTableDic, 0.05, "more")  # keep only tfidf values higher than 0.05 (function defined earlier)
        pathToSave = os.path.join(pathToMemex, "results_tfidf_%s.dataJson" % PageOrPubl)  # create the file path and name
        with open(pathToSave, 'w', encoding='utf8') as f9:
            json.dump(tfidfTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)  # save the filtered tfidf dictionary as json

    # PART 5: saving cosine distances --- for both publications and page clusters
    print("\tsaving cosine distances data...")
    cosineTable = pd.DataFrame(cosineMatrix)  # transform the matrix into a dataframe
    print("\tcosineTable Shape: ", cosineTable.shape)  # print the dataframe shape
    cosineTable.columns = docIdList  # use the list with the citation keys as columns
    cosineTable.index = docIdList  # and as the index
    cosineTableDic = cosineTable.to_dict()  # create a dictionary with the cosine similarities
    cosineTableDicFilt = filterTfidfDictionary(cosineTableDic, 0.25, "more")  # keep only pairs with a cosine similarity higher than 0.25
    pathToSave = os.path.join(pathToMemex, "results_cosineDist_%s.dataJson" % PageOrPubl)  # create the file path and name
    with open(pathToSave, 'w', encoding='utf8') as f9:
        json.dump(cosineTableDicFilt, f9, sort_keys=True, indent=4, ensure_ascii=False)  # save the filtered cosine similarity dictionary as json