def processS1File(filename):
    with open(filename, 'r') as content_file:
        content = content_file.read()  # read the original content of the file

    # process the textual content
    # returns a 4-tuple (changed text, HTML text with highlighted changes, no-changes report, HTML no-changes report)
    result = _processContent(content)

    if result is None:
        _Print("no change")
        _saveFile(filename + ".s", content)       # store the unchanged result in a file with '.s' extension
        _saveFile(filename + ".s.html", content)  # store the unchanged result in an HTML report file
    else:
        _Print("some changes")
        _saveFile(filename + ".s", result[0])       # store the changed result in a file with '.s' extension
        _saveFile(filename + ".s.html", result[1])  # store the changed result in an HTML report file
        # store the reports of studied changes that were finally not applied
        if result[2] != "":
            _saveFile(filename + ".s.nr", result[2])
            _saveFile(filename + ".s.nr.html", result[3])
    return
def sharedSubjectsJaccardSimilarity(self, fileNameCandidateSubjects):
    try:
        # try to read the candidate text subjects from the local DB
        with _Open(fileNameCandidateSubjects) as fp:
            candidate_text_subjects = fp.read().splitlines()
    except Exception as e:
        _Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
        _appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Candidate subjects file not found: " + fileNameCandidateSubjects + " " + str(e))
        return -1

    if len(candidate_text_subjects) == 0:
        return 0

    # the subjects lists for both texts are now available
    subjects_jaccard_similarity = 0

    try:
        # replace every candidate subject by the pair (subject, list of subject components)
        pairs_candidate_text_subjects = list(map(lambda x: (x, _getSubjectComponents(x)), candidate_text_subjects))

        numContributions = 0  # number of matches (contributions with some similarity)
        sum_sims = 0          # to aggregate the similarity contributions

        for (sbo, sbocl) in self.pairs_original_text_subjects:
            for (sbc, sbccl) in pairs_candidate_text_subjects:
                min_long = min(len(sbocl), len(sbccl))  # length of the shorter subject
                if min_long < 3:  # both subjects must have at least 3 components
                    continue
                intersection_cardinality = len(set.intersection(set(sbocl), set(sbccl)))
                # for the shorter subject, at most 1 component may be missing from the longer subject
                if intersection_cardinality < (min_long - 1):
                    continue
                # this pair fulfills the requirements: it is a contribution
                numContributions += 1
                union_cardinality = len(set.union(set(sbocl), set(sbccl)))
                component_jaccard_similarity = intersection_cardinality / float(union_cardinality)
                sum_sims += component_jaccard_similarity
                _Print(numContributions, "->", sbo, ",", sbc, component_jaccard_similarity)

        if numContributions == 0:  # no intersection at all
            return 0

        subjects_jaccard_similarity = sum_sims / numContributions
    except Exception as e:
        _Print("ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: " + str(e))
        _appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: " + str(e))
        return -1

    if subjects_jaccard_similarity > 1:
        _Print("Candidate with subjects similarity > 1:", fileNameCandidateSubjects, sum_sims, numContributions, subjects_jaccard_similarity)
        _appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): similarity > 1")
        return -1

    return subjects_jaccard_similarity
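# --- Minimal sketch (assumption): the component-level Jaccard measure used above, for a
# single pair of subjects. Here a subject's components are simply its underscore-separated
# words; the real _getSubjectComponents() may tokenize differently.
def _demo_component_jaccard(subject_a, subject_b):
    comps_a = set(subject_a.split("_"))
    comps_b = set(subject_b.split("_"))
    min_long = min(len(comps_a), len(comps_b))
    if min_long < 3:
        return None  # both subjects must have at least 3 components
    intersection = comps_a & comps_b
    if len(intersection) < (min_long - 1):
        return None  # at most one component of the shorter subject may be missing
    return len(intersection) / float(len(comps_a | comps_b))

# e.g. _demo_component_jaccard("Battles_of_the_Greco-Persian_Wars", "Naval_battles_of_the_Greco-Persian_Wars")
# returns 4/7 (about 0.57) under this tokenization, since the components are compared case-sensitively.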
def fullSubjectsJaccardSimilarity(self, fileNameCandidateSubjects):
    try:
        # try to read the candidate text subjects from the local DB
        with _Open(fileNameCandidateSubjects) as fp:
            candidate_text_subjects = fp.read().splitlines()
    except Exception as e:
        _Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
        _appendFile(self.logFilename, "ERROR fullSubjectsJaccardSimilarity(): Candidate subjects file not found: " + fileNameCandidateSubjects + " " + str(e))
        return -1

    if len(self.original_text_subjects) == 0 or len(candidate_text_subjects) == 0:
        return 0

    subjects_jaccard_similarity = self.oMeasures.oJaccardSimilarity(self.original_text_subjects, candidate_text_subjects)
    return subjects_jaccard_similarity
def processS4List(fileList, foldername):
    print("\nS4: Processing list of .w files")

    if not foldername.endswith("/"):
        foldername = foldername + "/"

    t_folder = foldername + _T_FOLDER
    if not os.path.exists(t_folder):
        os.makedirs(t_folder)

    numFiles = 0
    numProcessed = 0
    for wFullFilename in fileList:
        if not wFullFilename.endswith(".w"):
            continue
        numFiles += 1
        _Print(numFiles, "**************** Processing file ", wFullFilename)

        final_name = wFullFilename[(1 + wFullFilename.rfind("/")):]
        tFullFilename = t_folder + final_name + ".t"
        if os.path.exists(tFullFilename):
            _Print("T file already available in local DB: " + tFullFilename)
            continue

        _Print("Creating .t file: " + tFullFilename)
        result = processOneFile(wFullFilename)
        pickle.dump(result, open(tFullFilename, "wb"))
        numProcessed += 1

    return numProcessed
def processS3List(fileList):
    print("\nS3: Processing list of .s files")

    numFiles = 0
    numProcessed = 0
    for sFullFilename in fileList:
        if not sFullFilename.endswith(".s"):
            continue
        numFiles += 1
        _Print(numFiles, "**************** Processing file ", sFullFilename)

        if os.path.exists(sFullFilename + ".w"):
            _Print("W file already available in local DB: " + sFullFilename + ".w")
            continue

        _Print("Creating .w file: " + sFullFilename + ".w")
        pfullfilename = sFullFilename + ".p"
        result = getContentAfterChanges(sFullFilename, pfullfilename)

        # save the result in files with the same name and extension '.w'
        _saveFile(sFullFilename + ".w", result[0])
        _saveFile(sFullFilename + ".w.html", result[1])

        highlightedContent = _getContentMarked(sFullFilename + ".w", "w")
        _saveFile(sFullFilename + ".w.p.html", highlightedContent)
        numProcessed += 1

    return numProcessed
def generateAgregate(foldername):
    print("\nCreating aggregation " + foldername + ".s...")

    numFiles = 0
    global_content = ""

    spw_folder = foldername + _SPW_FOLDER
    if not os.path.exists(spw_folder):
        print(spw_folder, "not found!")
        return -1

    for filename in sorted(os.listdir(spw_folder)):  # files are processed in alphabetical order
        if not filename.endswith(".s"):  # only .s files are joined
            continue
        numFiles += 1
        _Print(numFiles, "====================", filename)
        with open(spw_folder + "/" + filename, 'r') as content_file:
            content = content_file.read()
        global_content += content

    _saveFile(foldername + "/" + foldername + ".s", global_content)
    return
def processS2File(source, confidence=0.5, support=1):
    if not source.endswith(".s"):
        message = source + " has no '.s' extension"
        print(message)
        raise Exception(message)

    if not os.path.exists(source):
        message = source + " not found!"
        print(message)
        raise Exception(message)

    _Print("Processing file " + source + "...\n")

    try:
        entities = findEntities(source, confidence, support)
        pickle.dump(entities, open(source + ".p", "wb"))
        highlightedContent = _getContentMarked(source, 's')
        _saveFile(source + ".p.html", highlightedContent)
    except Exception as e:
        message = "Problem detecting entities: " + str(e)
        print(message)
        raise Exception(message)

    return 0
def findEntities(filename, confPar, supPar):
    content_file = open(filename, 'r')
    content = content_file.read()

    # DB-SL is queried for the preferred entity for each candidate detected in the file
    # see section 6.4 of the document describing the architecture for the request and answer formats
    dbsl_response = requests.post(_URL_DB_SL_annotate,
                                  data={"text": content, "confidence": confPar, "support": supPar},
                                  headers={"accept": "application/json",
                                           "content-type": "application/x-www-form-urlencoded"})

    if dbsl_response.status_code != 200:
        raise Exception("DBpedia SpotLight connection error: " + _URL_DB_SL_annotate)

    # the previous call is synchronous and only returns after receiving the answer, which is now parsed as JSON
    try:
        dbsl_json = dbsl_response.json()
        dbsl_json["Resources"]  # if no entity is detected an exception is raised
    except:
        _Print("No entity detected in the file")
        return {'byUri': {}, 'byType': {}, 'byOffset': {}}

    _Print("Detected", len(dbsl_json["Resources"]), "entities")

    # create a _DBManager object to parse the results
    dbpediaManager = _DBManager()
    dbpediaManager.scanEntities(dbsl_json)

    allDicts = dbpediaManager.getDictionaries()
    byUri = allDicts["byUri"]
    byType = allDicts["byType"]
    byOffset = allDicts["byOffset"]

    byuriplana = [item for sublist in byUri.values() for item in sublist]  # flattened list of byUri values
    _Print(len(byUri.keys()), len(byuriplana), len(byType.keys()), len(byOffset.keys()))

    return allDicts
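# --- Usage sketch (assumption): querying the same DBpedia Spotlight "annotate" service
# directly with requests, mirroring the call made in findEntities() above. The default
# URL below (the public DBpedia Spotlight demo endpoint) is an assumption; the pipeline
# itself uses _URL_DB_SL_annotate.
def _demo_spotlight_annotate(text, url="https://api.dbpedia-spotlight.org/en/annotate"):
    import requests
    response = requests.post(url,
                             data={"text": text, "confidence": 0.5, "support": 1},
                             headers={"accept": "application/json",
                                      "content-type": "application/x-www-form-urlencoded"})
    if response.status_code != 200:
        raise Exception("DBpedia SpotLight connection error: " + url)
    # "Resources" is absent when no entity is detected, as handled in findEntities()
    return response.json().get("Resources", [])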
def processS2List(fileList, confidence=0.5, support=1):
    print("\nS2: Processing list of .s files")

    numFiles = 0
    numProcessed = 0
    for sFullFilename in fileList:
        if not sFullFilename.endswith(".s"):
            continue
        numFiles += 1
        _Print(numFiles, " S2: Processing file ", sFullFilename)

        if os.path.exists(sFullFilename + ".p"):
            _Print("P file already available in local DB: " + sFullFilename + ".p")
            continue

        _Print("Creating .p file: " + sFullFilename + ".p")
        try:
            entities = findEntities(sFullFilename, confidence, support)
        except Exception as ex:
            print("processS2List: findEntities raised exception (" + str(ex) + ")! Could not process ", sFullFilename)
            input("Continue?")
            continue

        time.sleep(2)
        pickle.dump(entities, open(sFullFilename + ".p", "wb"))
        highlightedContent = _getContentMarked(sFullFilename, "s")
        _saveFile(sFullFilename + ".p.html", highlightedContent)
        numProcessed += 1

    return numProcessed
def processS1List(foldername, fileList):
    print("\nS1: Processing list of .txt files to folder " + foldername)

    if not foldername.endswith("/"):
        foldername = foldername + "/"

    if not os.path.exists(foldername):  # create the CORPUS folder for output files if it does not exist
        os.makedirs(foldername)

    spw_folder = foldername + _SPW_FOLDER
    if not os.path.exists(spw_folder):  # create the folder to store output files if it does not exist
        os.makedirs(spw_folder)

    numFiles = 0
    numProcessed = 0
    for filename in fileList:
        numFiles += 1
        _Print(numFiles, " S1: Processing file ", filename)

        final_name = filename[(1 + filename.rfind("/")):]
        basename = spw_folder + final_name
        if os.path.exists(basename + ".s"):  # this check is case insensitive
            _Print("S file already available in local DB: " + basename + ".s")
            continue

        _Print("Creating .s file: " + basename + ".s")
        with open(filename, 'r') as content_file:
            content = content_file.read()  # read the original content of the file

        # process the textual content
        # returns a 4-tuple (changed text, HTML text with highlighted changes, no-changes report, HTML no-changes report)
        result = _processContent(content)

        if result is None:
            _Print("no change, saving .s", basename + ".s")
            _saveFile(basename + ".s", content)       # store the unchanged result in a file with '.s' extension
            _saveFile(basename + ".s.html", content)  # store the unchanged result in an HTML report file
        else:
            _Print("changes, saving .s", basename + ".s")
            _saveFile(basename + ".s", result[0])       # store the changed result in a file with '.s' extension
            _saveFile(basename + ".s.html", result[1])  # store the changed result in an HTML report file
            # store the reports of studied changes that were finally not applied
            if result[2] != "":
                _saveFile(basename + ".s.nr", result[2])
                _saveFile(basename + ".s.nr.html", result[3])

        numProcessed += 1

    return numProcessed
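# --- Usage sketch (assumption): a hypothetical driver that builds the .txt file list with
# glob and runs the S1 stage into a corpus folder. The folder name and glob pattern are
# illustrative only; the real pipeline may invoke processS1List differently.
def _demo_runS1(corpus_folder="CORPUS", pattern="texts/*.txt"):
    import glob
    txt_files = sorted(glob.glob(pattern))  # hypothetical location of the input .txt files
    processed = processS1List(corpus_folder, txt_files)
    print(processed, "of", len(txt_files), "files processed by S1")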
def processContent(content):
    # offset dict: keys are offsets, values are the detected matches (an object with name and EVENT) at that offset
    wordsWithNumber = {}
    # dict of names: keys are the names of the matches, values are sets with all the different EVENTS involving that name
    allSubstitutions = {}

    # first pass over the content, to detect and store EVENTS
    # 'match' is every occurrence of an EVENT
    for match in reg_WordWithRomanNumber.finditer(content):
        offset = match.start()  # the position of the EVENT
        # match.groups() provides the different parts of the match
        tuplaGroups1 = match.groups()[:-1]  # remove the non-alphanumeric char ending the sequence
        tuplaGroups2 = (tuplaGroups1[0], " ") + tuplaGroups1[1:]  # add a ' ' after the name
        newFullWord = "".join(tuplaGroups2)  # join all the parts; the result is "name romannumber"
        _Print(match.groups(), " --> ", tuplaGroups2, " --> ", newFullWord)

        word = match.group(1)  # the name

        # add the match to the dict of offsets: key is the offset, value an object with the name (word) and the match (fullWord)
        wordsWithNumber[offset] = {
            "fullWord": newFullWord.strip(),  # strip removes spaces before and after the string
            "word": word
        }

        # add the EVENT to the dict of names, a set with all the EVENTS of that name
        if word not in allSubstitutions:
            allSubstitutions[word] = {newFullWord}
        else:
            allSubstitutions[word].add(newFullWord)

    _Print('\n-------- Results of the first pass --------')
    _Print("There are", len(wordsWithNumber), "EVENTS, proper names followed by a space and a roman number")

    # EVENTS have been detected. Now the changes are applied.
    # second pass over the content, to make modifications (changes after an EVENT, or before it for unique NAMEs)
    # the content is studied from offset to offset, making modifications depending on the current substitutions

    offsets = list(wordsWithNumber.keys())  # list of offsets of every EVENT
    offsets.sort()  # ascending order

    negReportTxt = ""   # text negative report (transformations not done)
    negReportHtml = ""  # HTML negative report

    if len(offsets) == 0:
        return None

    # dict to store the current substitutions: key is the name, value an object with the EVENT and its offset (not necessary)
    currentSubstitutions = {}

    # start with the initial text, before the first EVENT
    initial = content[0:offsets[0]]

    # convert the string to a list of words separated by non-alphanumeric chars
    # with the parentheses, the detected groups marking the word separation are also returned, that is,
    # 'words' is a list containing everything in the string: the words and, between them, the non-alphanumeric groups separating them
    words = re.split(r'(\W+)', initial)
    wordsHTML = re.split(r'(\W+)', initial)  # the same for the HTML result

    for j in range(0, len(words)):
        if words[j] in allSubstitutions:
            if len(allSubstitutions[words[j]]) == 1:
                sustituto = list(allSubstitutions[words[j]])[0]  # get the only set member
                if changeAccordingContext(words, j):
                    # the change is done
                    words[j] = sustituto
                    wordsHTML[j] = "<span style='color: green'><b>" + sustituto + "</b></span>"
                else:
                    # the change is not done, and it is annotated in the negative report
                    negReportTxt += "Before: (" + sustituto + ") --> **"
                    negReportTxt += buildSecureReject(words, j)[0] + "**\n"
                    negReportHtml += "Before: (" + sustituto + ") --> **"
                    negReportHtml += buildSecureReject(words, j)[1] + "**<p>"

    # 'finalContent' contains the text after processing and changes
    finalContent = "".join(words)
    finalContentHTML = "".join(wordsHTML).replace("\n", "<p>")

    # now process the rest of the text after the first offset
    for i in range(0, len(offsets)):  # range(0, n) goes from 0 to n-1
        o = wordsWithNumber[offsets[i]]  # the object with the i-th event; the text is studied from here to the following one

        # add the event to the final content and to the dict of current substitutions
        finalContent += o["fullWord"]
        finalContentHTML += o["fullWord"]
        currentSubstitutions[o["word"]] = {
            "sub": o["fullWord"],
            "offset": offsets[i]
        }  # if that word already existed, it is replaced by the new one

        # search the limit of the text fragment, from the current event to the following one (or the end of text)
        if i + 1 == len(offsets):
            limit = len(content)  # this was the last event, the limit is the length of the original text
        else:
            limit = offsets[i + 1]  # the limit is the offset of the following event

        # take the string from the current offset + len(added) to the limit
        currentSubstring = content[offsets[i] + len(o["fullWord"]):limit]

        # convert the string to a list of words separated by non-alphanumeric chars (the separators are kept, as above)
        words = re.split(r'(\W+)', currentSubstring)
        wordsHTML = re.split(r'(\W+)', currentSubstring)

        # study each one of the words in the list, that is, of the string after the current event up to the limit
        for j in range(0, len(words)):
            # if the current word is in the list of substitutions, change it
            if words[j] in currentSubstitutions:
                sustituto = currentSubstitutions[words[j]]["sub"]
                if changeAccordingContext(words, j):
                    # the change is done
                    words[j] = sustituto
                    wordsHTML[j] = "<span style='color: green'><b>" + sustituto + "</b></span>"
                else:
                    # the change is not done, and it is annotated in the negative report
                    negReportTxt += "After: (" + sustituto + ") --> **"
                    negReportTxt += buildSecureReject(words, j)[0] + "**\n"
                    negReportHtml += "After: (" + sustituto + ") --> **"
                    negReportHtml += buildSecureReject(words, j)[1] + "**<p>"
            else:
                if words[j] in allSubstitutions:
                    if len(allSubstitutions[words[j]]) == 1:
                        sustituto = list(allSubstitutions[words[j]])[0]
                        if changeAccordingContext(words, j):
                            # the change is done
                            words[j] = sustituto
                            wordsHTML[j] = "<span style='color: green'><b>" + sustituto + "</b></span>"
                        else:
                            # the change is not done, and it is annotated in the negative report
                            negReportTxt += "Before: (" + sustituto + ") --> **"
                            negReportTxt += buildSecureReject(words, j)[0] + "**\n"
                            negReportHtml += "Before: (" + sustituto + ") --> **"
                            negReportHtml += buildSecureReject(words, j)[1] + "**<p>"

        # rebuild the studied fragment from the list of words, and add it to the final content
        finalContent += "".join(words)
        finalContentHTML += "".join(wordsHTML).replace("\n", "<p>")

    return (finalContent, finalContentHTML, negReportTxt, negReportHtml)
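# --- Minimal sketch (assumption): how re.split(r'(\W+)', text) keeps the separators so a
# fragment can be rebuilt exactly after substituting individual words, which is the
# mechanism processContent() relies on. The replacement dict and sample text are hypothetical.
def _demo_split_and_rebuild(text, replacements):
    import re
    words = re.split(r'(\W+)', text)                 # words and separators, interleaved
    words = [replacements.get(w, w) for w in words]  # substitute whole words only
    return "".join(words)                            # the kept separators glue the text back together

# e.g. _demo_split_and_rebuild("Felipe went to Rome.", {"Felipe": "Felipe II"})
# returns "Felipe II went to Rome."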
def getCategoriesInText(texto):
    import requests
    from px_aux import URL_DB_SL_annotate as _URL_DB_SL_annotate

    result = {}

    # query DB-SL with texto
    try:
        objParams = {"text": texto, "confidence": 0.5, "support": 1}
        # annotateTextRequest = requests.get(_URL_DB_SL_annotate, params=objParams, headers={"Accept": "application/json"})
        annotateTextRequest = requests.post(_URL_DB_SL_annotate, data=objParams, headers={"Accept": "application/json"})
    except Exception as e:
        print("ERROR getCategoriesInText(): Problem querying DB-SL", str(e))
        result["error"] = "Problem querying DB-SL --> " + str(e)
        return result

    try:
        dbpediaText = annotateTextRequest.json()
    except Exception as e:
        print("ERROR getCategoriesInText(): Problem parsing DB-SL response as JSON:", annotateTextRequest.content)
        result["error"] = "Problem with JSON DB-SL response: the query does not return the expected JSON --> " + str(e)
        return result

    dbpediaManager = DBManager()
    try:
        dbpediaManager.scanEntities(dbpediaText)
    except Exception as e:
        print("ERROR getCategoriesInText(): Problem scanning DB-SL response: error in scanEntities --> " + str(e))
        result["error"] = "Problem with DB-SL: error in scanEntities --> " + str(e)
        return result

    entities = dbpediaManager.getEntitiesAfterOffset(0)
    if len(entities) == 0:
        print("Warning getCategoriesInText(): No entities in text")

    print("\nInitially, there are entities:", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # filter duplicated entities (the same entity identified in different parts of the text)
    uniqueEntities = []
    for entity in entities:
        if entity["@URI"] in list(map(lambda x: x["@URI"], uniqueEntities)):
            continue
        uniqueEntities.append(entity)
    entities = uniqueEntities

    print("\nBut unique entities: ", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # filter entities probably identified erroneously
    # a right entity is required to share wikicats with some other entity in the set
    _Print("\nFiltering by wikicats sharing")
    rightEntities = []
    for entity in entities:
        wki = entity["wikicats"]  # wikicats of this entity
        wkic = []                 # wikicats of all the entities in the set except this one
        for ej in entities:
            if entity["@URI"] == ej["@URI"]:
                continue
            wkic.extend(ej["wikicats"])
        intersec = set(wkic).intersection(wki)  # is there an intersection?
        if len(intersec) > 0:
            rightEntities.append(entity)
        else:
            _Print("Discarded entity: ", entity["@URI"])
    entities = rightEntities

    print("\nAfter the filtering by wikicat sharing there are:", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # a right entity is required to share subjects with some other entity in the set
    _Print("\nFiltering by subject sharing")
    rightEntities = []
    for entity in entities:
        sbi = entity["subjects"]  # subjects of this entity
        sbic = []                 # subjects of all entities except this one
        for ej in entities:
            if entity["@URI"] == ej["@URI"]:
                continue
            sbic.extend(ej["subjects"])
        intersec = set(sbic).intersection(sbi)  # is there an intersection?
        if len(intersec) > 0:
            rightEntities.append(entity)
        else:
            _Print("Discarded entity: ", entity["@URI"])
    entities = rightEntities

    print("\nAfter the filtering by subject sharing there are", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # return the list of the wikicats of the entities identified in the text
    wikicats = []
    for entity in entities:
        wikicats.extend(entity["wikicats"])
    setWikicats = list(set(wikicats))  # removes duplicates
    result["wikicats"] = setWikicats

    # from collections import Counter
    # counts = Counter(wikicats)
    # repetidas = [value for value, count in counts.items() if count > 1]
    # print("\nrepeated wikicats = ", repetidas)
    # print("\nunique wikicats = ", set(wikicats) - set(repetidas))

    # return the list of the subjects of the entities identified in the text
    subjects = []
    for entity in entities:
        subjects.extend(entity["subjects"])
    subjects = list(set(subjects))  # removes duplicates
    # original format = http://dbpedia.org/resource/Category:Ionian_Revolt, change to Ionian_Revolt
    result["subjects"] = [s.split(':')[-1] for s in subjects]

    # return the list of URIs of the entities identified in the text
    uris = []
    for entity in entities:
        listTypes = entity["combinedTypes"]
        if "Person" in listTypes:
            uris.append(entity["@URI"])
            continue
        if ("Location" in listTypes) or ("Place" in listTypes) or ("City" in listTypes) or ("Country" in listTypes):
            uris.append(entity["@URI"])
            continue
        if "Event" in listTypes:
            uris.append(entity["@URI"])
            continue
    uris = list(set(uris))  # removes duplicates
    result["URIs_persons_places_events"] = uris

    return result
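# --- Usage sketch (assumption): calling getCategoriesInText() on a short text and inspecting
# the returned dictionary. It needs a reachable DB-SL service; the sample sentence is
# hypothetical, and the keys printed are the ones set above.
def _demo_getCategoriesInText():
    result = getCategoriesInText("Leonidas I was a king of Sparta.")
    if "error" in result:
        print("Query failed:", result["error"])
        return
    print("wikicats:", len(result["wikicats"]))
    print("subjects:", len(result["subjects"]))
    print("URIs of persons/places/events:", result["URIs_persons_places_events"])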
def getContentAfterChanges(sfilename, pfilename):
    finalContent = ""
    finalHTMLContent = ""

    sfile = open(sfilename, 'r')

    if not os.path.isfile(pfilename):
        print("getContentAfterChanges: " + pfilename + " not found!!")
        input("Continue?")
        return (finalContent, finalHTMLContent)

    pfile = open(pfilename, 'rb')

    content = sfile.read()
    dicsEntities = pickle.load(pfile)

    currentPosition = 0  # marks the position in the original file

    offsets = list(dicsEntities["byOffset"].keys())
    if offsets == []:
        return (content, content)

    # a new offset is computed for every entity identified in the .w file; it must be corrected wrt the .s one, as the text of the file changes
    nuevoOffset = 0  # marks the position in the result file

    # new byOffset dict with the updated offsets. NECESSARY?? maybe it is possible to update the old one directly
    newByOffset = {}

    # the iteration follows the insertion order of the dict, which is the offset order from low to high
    for i in range(len(offsets)):
        o = offsets[i]
        entity = dicsEntities["byOffset"][o]
        if o != entity["@offset"]:
            _Print(o, "the offset index is different from the one included in the entity")

        sf = entity["@surfaceForm"]
        nameEntity = entity["entityName"]

        text = content[currentPosition:int(o)]
        currentPosition += len(text)
        finalContent += text
        nuevoOffset += len(text)

        entity["@offset"] = nuevoOffset  # update the offset
        # entity["@surfaceForm"] = nameEntity  # the surfaceForm is not updated, to preserve it; from now on, the anchor in the text must be entity["entityName"]
        newByOffset[nuevoOffset] = entity  # and save it in the new dict

        finalHTMLContent += text.replace("\n", "\n<br>")

        finalContent += nameEntity  # the entity name is copied in the output file
        nuevoOffset += len(nameEntity)

        # in the HTML file, write in blue if not modified, and in struck-through blue followed by green if modified
        if sf == nameEntity:
            finalHTMLContent += "<span style='color: blue'><b>" + nameEntity + "</b></span>"
        else:
            finalHTMLContent += "<span style='color: blue; text-decoration:line-through'>" + sf + "</span> <span style='color: green'><b>" + nameEntity + "</b></span>"

        # now see how much to advance in the original file
        nameEntitySpaced = nameEntity.replace("_", " ")  # split the entity name in words

        if sf == nameEntitySpaced:
            # if equal, advance the length
            currentPosition += len(sf)
        else:
            if not nameEntitySpaced.startswith(sf):
                # if the sf is not a prefix of the entity name, continue processing the .s file from the end of the sf
                currentPosition += len(sf)
            else:
                # if the sf is a prefix of the entity name, check whether the following chars are in the entity name
                # nameEntitySpacedRemaining = nameEntitySpaced[len(sf):]  # the rest of the entity name after the surface form
                # nextContent = content[currentPosition+len(sf):currentPosition+len(sf)+80]  # what comes after the surface form in the original file
                # wordsSF = sf.split()
                # if len(wordsSF) > 1:
                #     leadingSF = " ".join(wordsSF[0:-1]) + " "
                #     finalContent += leadingSF
                #     currentPosition += len(leadingSF)
                nextContent = content[currentPosition:currentPosition + 80]
                if nextContent.startswith(nameEntitySpaced):
                    # if the following chars include the name of the entity, we jump over it
                    advanceTo = currentPosition + len(nameEntity)
                    if i + 1 < len(offsets):
                        if advanceTo > int(offsets[i + 1]):
                            currentPosition += len(sf)
                        else:
                            currentPosition += len(nameEntity)
                    else:
                        currentPosition += len(sf)
                else:
                    currentPosition += len(sf)

    dicsEntities["byOffset"] = newByOffset  # substitute the new byOffset

    # update byUri and byType from the byOffset
    (nu, nt) = rebuild(newByOffset)
    dicsEntities["byUri"] = nu
    dicsEntities["byType"] = nt

    pickle.dump(dicsEntities, open(sfilename + ".w.p", "wb"))

    return (finalContent, finalHTMLContent)