def getEntityFile():
    """Return the original text and its pickled entity data for the requested file.

    Reads the 'file' request parameter, loads '<file>' (text) and '<file>.p'
    (pickled entity data) from the original texts folder, and returns both
    as JSON: {'text': ..., 'data': ...}.
    """
    from px_aux import ORIGINAL_TEXTS_FOLDER as _ORIGINAL_TEXTS_FOLDER
    fich = request.values.get('file')
    sfile = _ORIGINAL_TEXTS_FOLDER + "/" + fich
    pfile = sfile + ".p"  # companion pickle file with the entity data
    with _Open(sfile, "rt", encoding='utf8') as content_file:
        content = content_file.read()
    # context manager fixes the descriptor leak: the original passed an open
    # file straight to pickle.load() and never closed it
    with _Open(pfile, "rb") as pickle_fd:
        fileDataMod = pickle.load(pickle_fd)
    result = {'text': content, 'data': fileDataMod}
    return jsonify(result)
def getFile():
    """Serve a file from the original texts folder as JSON.

    The response contains 'head' (the filename plus a Spanish description of
    its purpose, chosen by extension) and 'text' (the file content, with
    newlines turned into <p> for the plain-text variants).
    """
    from px_aux import ORIGINAL_TEXTS_FOLDER as _ORIGINAL_TEXTS_FOLDER
    fich = request.values.get('file')
    with _Open(_ORIGINAL_TEXTS_FOLDER + "/" + fich, "rt", encoding='utf8') as content_file:
        content = content_file.read()
    # ordered suffix -> description table; order matters (".s.html" before ".s")
    descriptions = (
        (".txt", ": fichero original"),
        (".s.html", ": fichero donde se han añadido sufijos romanos a partir del contexto (<span style='color: green'>marcados en verde</span>)"),
        (".s.nr.html", ": informe sobre las surface forms (<span style='color: red'>marcadas en rojo</span>) a las que se estudió añadir un sufijo romano y finalmente se descartó"),
        (".s", ": fichero de partida, donde ya se han añadido sufijos romanos a partir del contexto"),
        (".p.html", ": fichero de partida, donde se han marcado las entidades detectadas"),
        (".w.html", ": fichero donde se han realizado las transformaciones de la surface form (<span style='color: blue'>en azul tachado</span>) a la última parte del URL de la entidad que le corresponde (<span style='color: green'>en verde</span>)"),
    )
    for suffix, description in descriptions:
        if fich.endswith(suffix):
            head = fich + description
            break
    else:
        head = fich + ": fichero con propósito desconocido"
    # plain-text variants get their newlines rendered as paragraph breaks
    if fich.endswith((".txt", ".s")):
        content = content.replace("\n", "<p>")
    return jsonify({'head': head, 'text': content})
def euclideanTextSimilarity (self, candidate_text=None, candidate_file=None):
    """Euclidean similarity between the original text and a candidate text.

    The candidate can be given directly (candidate_text) or as a file path
    (candidate_file, read when candidate_text is falsy).
    Returns a value in (0, 1], 1 meaning identical count vectors.
    Raises: re-raises any exception after printing it.
    """
    try:
        if not candidate_text:
            # context manager fixes the descriptor leak of the original code
            with _Open(candidate_file, "r") as candidate_fileFD:
                candidate_text = candidate_fileFD.read()
        list_of_text = [self.original_text, candidate_text]  # Create a list of documents of the original text and the new candidate text
        vectorizer = CountVectorizer()  # Create a CountVectorizer Object
        # Transform arbitrary data into numerical features
        # Description: remove stopwords, tokenize the text, create a vocabulary from distinct words, map each document to vocabulary (tokens)
        features = vectorizer.fit_transform(list_of_text).todense()
        # Measure the euclidean distance, returns an array with the euclidean distance
        euclideanDistances = euclidean_distances(features[0], features[1])
        euclidean_distance = euclideanDistances[0][0]  # between 0 and N, 0 is the best
        euclidean_similarity = 1 / (1 + euclidean_distance)  # between 0 and 1, 1 is the best
    except Exception as e:
        print("** ERROR euclideanTextSimilarity:", str(e))
        raise e
    return euclidean_similarity
def __iter__(self):
    """Yield TaggedDocument objects for the generic corpus files followed by
    the ad hoc corpus files (stopwords removed, gensim simple_preprocess)."""
    print("MyCorpusBoth iteration", numIter)
    startTime = datetime.now()
    print("Generic files")
    for i, file in enumerate(listGeneric):
        if (i % 100) == 0:
            print(i, end=' ', flush=True)  # progress marker every 100 files
        with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as fdata:
            text = fdata.read()
        # remove stopwords
        cText = Gensim_remove_stopwords(text)
        # preprocess the text (tokenize, lower, remove punctuation, remove <2 and >50 length words)
        wordsText = simple_preprocess(cText, max_len=50)
        # feed the caller
        yield TaggedDocument(words=wordsText, tags=dictTags[file])
    print("\nAd hoc files")
    for file in listCorpusAH:
        # context manager fixes the original's leak of one descriptor per file
        with _Open(file, "r") as fd:
            text = fd.read()
        # remove stopwords
        cText = Gensim_remove_stopwords(text)
        # preprocess the text (tokenize, lower, remove punctuation, remove <2 and >50 length words)
        wordsText = simple_preprocess(cText, max_len=50)
        # feed the caller
        yield TaggedDocument(words=wordsText, tags=dictTags[file])
    endTime = datetime.now()
    elapsedTime = endTime - startTime
    print(" Duración:", elapsedTime.seconds)
def setStoredQA():
    """Parse the 'value' POST parameter as 'question|answer' lines, persist the
    resulting tuple list to storedQA.p, and return it as JSON."""
    if request.method == "POST":
        print(request.values)
        QAsString = request.values.get("value")
        # raw string: '\|' is not a valid escape in a plain string literal
        listQATuplas = re.findall(r"(.*)\|(.*)", QAsString)
        # context manager fixes the descriptor leak of the original code
        with _Open("storedQA.p", "wb") as store_fd:
            pickle.dump(listQATuplas, store_fd)
        return jsonify(listQATuplas)
def sharedSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):
    """Average Jaccard similarity over pairs of sufficiently-overlapping subjects.

    Candidate subjects are read (one per line) from fileNameCandidateSubjects.
    A pair contributes only if both subjects have >=3 components and at most 1
    component of the shorter one is missing from the larger one.
    Returns: similarity in [0,1]; 0 if no qualifying pair; -1 on error.
    """
    try:
        # try to read candidate text subjects from local DB
        with _Open(fileNameCandidateSubjects) as fp:
            candidate_text_subjects = fp.read().splitlines()
    except Exception as e:
        _Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
        _appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
        return -1
    if len(candidate_text_subjects) == 0:
        return 0
    # the subjects lists for both texts are now available
    subjects_jaccard_similarity = 0
    try:
        # change every candidate subject by the pair (subject, list of subject components)
        pairs_candidate_text_subjects = list(map(lambda x: (x, _getSubjectComponents(x)), candidate_text_subjects))
        numContributions = 0  # number of matches - contributions with some similarity
        sum_sims = 0  # to aggregate similarities contributions
        for (sbo, sbocl) in self.pairs_original_text_subjects:
            for (sbc, sbccl) in pairs_candidate_text_subjects:
                min_long = min(len(sbocl), len(sbccl))  # length of the shorter subject
                if (min_long < 3):  # both subjects must have at least 3 components
                    continue
                intersection_cardinality = len(set.intersection(set(sbocl), set(sbccl)))
                # for the shorter subject, we require at most 1 component not to be included in the larger subject
                if (intersection_cardinality < (min_long - 1)):
                    continue
                # this fulfills the requirements: it is a contribution
                numContributions += 1
                union_cardinality = len(set.union(set(sbocl), set(sbccl)))
                component_jaccard_similarity = intersection_cardinality/float(union_cardinality)
                sum_sims += component_jaccard_similarity
                _Print(numContributions, "->", sbo, ",", sbc, component_jaccard_similarity)
        if numContributions == 0:  # no intersection at all
            return 0
        subjects_jaccard_similarity = sum_sims / numContributions
    except Exception as e:
        _Print("ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
        _appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
        return -1
    if subjects_jaccard_similarity > 1:
        # BUG FIX: the original referenced an undefined name 'denominator' here
        # (NameError); numContributions is the actual divisor, so log that
        _Print("Candidate with subjects similarity > 1:", fileNameCandidateSubjects, sum_sims, numContributions, subjects_jaccard_similarity)
        _appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): similarity > 1")
        return -1
    return subjects_jaccard_similarity
def spacyTextSimilarity_calc (self, candidate_text=None, candidate_file=None):
    """spaCy similarity between the original text and a candidate (text or
    file path), computed without stopwords/punctuation."""
    if not candidate_text:
        # context manager fixes the descriptor leak of the original code
        with _Open(candidate_file, "r") as candidate_fileFD:
            candidate_text = candidate_fileFD.read()
    candidate_text_doc_tokens = self.nlp(candidate_text)
    return self.compute_similarity_without_stopwords_punct(self.original_text_doc_tokens, candidate_text_doc_tokens)
def htmlFolderToText(self, folderPath):
    """Run htmlToText() over every *.html file directly inside folderPath.

    NOTE(review): pageName/cleanedText are currently discarded — presumably
    the useful work happens inside htmlToText; confirm with its definition.
    """
    list_of_files = glob.glob(folderPath + "*.html")
    for html_file in list_of_files:
        # context manager fixes the original's leak of one descriptor per file
        with _Open(html_file, "r") as html_fd:
            html = html_fd.read()
        pageName, cleanedText = htmlToText(html)
def sharedSubjectsSimilarity(self, original_text_subjects, fileNameCandidateSubjects, logFilename):
    """Aggregate Jaccard similarity between two subject lists.

    Sums the component-level Jaccard similarity of every (original, candidate)
    subject pair and normalizes by the cardinality of the union of both lists.
    Returns: the normalized similarity, or -1 on any error (missing/empty
    subjects file, empty union, or exception during computation).
    """
    try:
        # try to read candidate text subjects from local store
        with _Open(fileNameCandidateSubjects) as fp:
            candidate_text_subjects = fp.read().splitlines()
        print("File already available in local DB:", fileNameCandidateSubjects)
    except:  # fetch candidate text subjects if not in local store
        _appendFile(logFilename, "ERROR sharedSubjectsSimilarity(): Subjects file not available: " + fileNameCandidateSubjects)
        return -1
    if len(candidate_text_subjects) == 0:
        _appendFile(logFilename, "ERROR sharedSubjectsSimilarity(): Subjects file empty: " + fileNameCandidateSubjects)
        return -1
    # the subjects lists for both texts are now available
    try:
        # change every original subject by the pair (subject, list of subject components) NONSENSE to compute this every time
        pairs_original_text_subjects = list(map(lambda x: (x, _getSubjectComponents(x)), original_text_subjects))
        # change every candidate subject by the pair (subject, list of subject components)
        pairs_candidate_text_subjects = list(map(lambda x: (x, _getSubjectComponents(x)), candidate_text_subjects))
        sum_sims = 0
        for (wko, wkocl) in pairs_original_text_subjects:
            for (wkc, wkccl) in pairs_candidate_text_subjects:
                wkc_jaccard_similarity = self.measures.oJaccardSimilarity(wkocl, wkccl)
                sum_sims += wkc_jaccard_similarity
        union_cardinality = len(set.union(set(original_text_subjects), set(candidate_text_subjects)))
        if union_cardinality == 0:  # not possible, it should not be here if len(original_text_subjects) == 0
            return -1
        else:
            subjects_jaccard_similarity = sum_sims / union_cardinality
    except Exception as e:
        # BUG FIX: the original concatenated the exception object itself ("..." + e),
        # which raises TypeError inside the error handler; use str(e)
        _appendFile(logFilename, "ERROR sharedSubjectsSimilarity(): Exception while computing Jaccard subjects similarity: " + str(e))
        return -1
    return subjects_jaccard_similarity
def spacyTextSimilarity (self, candidate_text=None, candidate_file=None):
    """spaCy document similarity (stopwords removed) between the original text
    and a candidate given as text or as a file path."""
    if not candidate_text:
        # context manager fixes the descriptor leak of the original code
        with _Open(candidate_file, "r") as candidate_fileFD:
            candidate_text = candidate_fileFD.read()
    # Tokenize candidate text based on the spacy package
    candidate_text_doc_tokens_without_stopwords = self.nlp(self.remove_spacy_stopwords(candidate_text))
    # Measure both texts similarity with spaCy method and return it
    return self.original_text_doc_tokens_without_stopwords.similarity(candidate_text_doc_tokens_without_stopwords)
def getContentMarked(filename, type):
    """Return the file content as HTML where detected entities become links.

    type: "s" uses the entity surface form as link text, anything else uses
    the entity name. Falls back to the raw content when no companion
    '<filename>.p' pickle with the entity dictionaries exists.
    """
    # context managers fix the two descriptor leaks of the original code
    with _Open(filename, 'r') as content_fd:
        content = content_fd.read()
    pfilename = filename + ".p"
    if not os.path.isfile(pfilename):
        print("Does not exist " + pfilename)
        return content
    with _Open(pfilename, 'rb') as pickle_fd:
        dics = pickle.load(pickle_fd)
    dicOffsets = dics["byOffset"]
    finalHTMLContent = ""
    currentPosition = 0
    # iteration follows the input order in the dictionary, that is supposed to be the offset order, increasing
    for k in dicOffsets:
        entity = dicOffsets[k]
        text = content[currentPosition:int(k)]
        currentPosition += len(text)
        finalHTMLContent += text.replace("\n", "\n<br>")
        urlEntity = entity["@URI"]
        if type == "s":
            name = entity["@surfaceForm"]
        else:
            name = entity["entityName"]
        finalHTMLContent += "<a href='" + urlEntity + "?lang=en'>" + name + "</a>"
        currentPosition += len(name)
    return finalHTMLContent
def __iter__(self):
    """Yield one TaggedDocument per ad hoc corpus file (stopwords removed,
    gensim simple_preprocess tokenization); increments the global numIter."""
    global numIter
    print("MyCorpusAH iteration", numIter)
    numIter = numIter + 1
    for file in listCorpusAH:
        # context manager fixes the original's leak of one descriptor per file
        with _Open(file, "r") as fd:
            text = fd.read()
        # remove stopwords
        cText = Gensim_remove_stopwords(text)
        # preprocess the text (tokenize, lower, remove punctuation, remove <2 and >50 length words)
        wordsText = simple_preprocess(cText, max_len=50)
        # feed the caller
        yield TaggedDocument(words=wordsText, tags=dictTags[file])
def getTrainingTexts():
    """Return the default training text plus the historical DBpedia data, as JSON
    keyed by filename."""
    from pp_app import historicalDBpediaDataMod as _historicalDBpediaDataMod
    from px_aux import TEXTS_FOLDER as _TEXTS_FOLDER, DEFAULT_TRAINING_TEXTS as _DEFAULT_TRAINING_TEXTS
    result = {}
    for f in os.listdir(_TEXTS_FOLDER):
        #if f.endswith("w"):
        if f == _DEFAULT_TRAINING_TEXTS:
            # redundant content_file.close() removed: the with-block already closes it
            with _Open(_TEXTS_FOLDER + "/" + f, "rt", encoding='utf8') as content_file:
                content = content_file.read()
            result[f] = {'text': content, 'data': _historicalDBpediaDataMod}
    return jsonify(result)
def fullSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):
    """Jaccard similarity between the full subject lists of the original text
    and a candidate text.

    Returns -1 when the candidate subjects file cannot be read, and 0 when
    either subject list is empty.
    """
    try:
        # candidate subjects are stored one per line in the local DB file
        with _Open(fileNameCandidateSubjects) as subjects_fd:
            candidate_subjects = subjects_fd.read().splitlines()
    except Exception as e:
        _Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
        _appendFile(self.logFilename, "ERROR fullSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
        return -1
    # nothing to compare when either side has no subjects
    if not self.original_text_subjects or not candidate_subjects:
        return 0
    return self.oMeasures.oJaccardSimilarity(self.original_text_subjects, candidate_subjects)
def doc2VecTextSimilarity (self, candidate_text=None, candidate_file=None):
    """Doc2Vec cosine similarity between the original text and a candidate.

    The candidate can be passed directly (candidate_text) or as a file path
    (candidate_file). Stopwords are removed when self.remove_stopwords is set.
    Returns: cosine similarity between the inferred vectors of both texts.
    """
    if not candidate_text:
        # context manager fixes the descriptor leak of the original code
        with _Open(candidate_file, "r") as candidate_fileFD:
            candidate_text = candidate_fileFD.read()
    if (self.remove_stopwords == True):
        candidate_text = remove_stopwords(candidate_text)
    # Use gensim.utils.simple_preprocess for processing:
    # tokenize text to individual words, remove punctuations, set to lowercase, and remove words less than 2 chars or more than 50 chars
    candidate_text_tokens = simple_preprocess(candidate_text, max_len=50)
    # infer_vector(): Generates a vector from a document.
    # The document should be tokenized in the same way the model's training documents were tokenized.
    # Optional parameters: alpha / min_alpha (learning rate schedule) and
    # epochs (number of inference passes; steps is its deprecated alias).
    # Generate a vector from the tokenized candidate text
    candidate_text_inferred_vector = self.model.infer_vector(candidate_text_tokens, epochs=50)
    # We keep only one implementation (ourSimilarityListsFunctions); the sklearn
    # equivalents are left here for reference:
    # cos_similarity = cosine_similarity([original_text_inferred_vector], [text_inferred_vector])
    # euc_distance = euclidean_distances([original_text_inferred_vector], [text_inferred_vector])
    # Measure vectors similarity using cosine similarity
    cos_similarity = self.ourMeasures.oCosineSimilarity(self.original_text_inferred_vector, candidate_text_inferred_vector)
    # Alternative metrics, unused:
    # euc_distance = self.ourMeasures.oEuclideanDistance(self.original_text_inferred_vector, candidate_text_inferred_vector)
    # man_distance = self.ourMeasures.oManhattanDistance(self.original_text_inferred_vector, candidate_text_inferred_vector)
    return cos_similarity
ipctg = int(pctg)  # percentage of ad hoc candidates to keep, as an int
MODEL_NAME = "/Users/agil/CloudStation/KORPUS/hibrido_"+TRCORPUS+"_"+pctg+".model"
AH_CORPUS_FOLDER = "/Users/agil/CloudStation/KORPUS/SCRAPPED_PAGES/"
GEN_FOLDER_BASE = "/Users/agil/Downloads/_corpus/"
GEN_FOLDER = GEN_FOLDER_BASE+TRCORPUS+"/"
numIter = 0  # global iteration counter used by the corpus iterators
# read the first pctg% from 1926.ph5-3.simsBest.csv: the list of ad hoc candidates
listBestAPFilename = "/Users/agil/CloudStation/KORPUS/1926/1926.ph5-3.simsBest.csv"
listAP = []
try:
    with _Open(listBestAPFilename, 'r') as csvFile:
        reader = csv.reader(csvFile, delimiter=' ')
        next(reader)  # to skip header
        for row in reader:
            listAP.append(AH_CORPUS_FOLDER+row[0])
    # redundant csvFile.close() removed: the with-block already closes the file
except Exception as e:
    print("Exception reading csvFile:", listBestAPFilename, str(e))
# NOTE(review): int(len/100)*ipctg truncates to whole-percent granularity before
# scaling; kept as-is to preserve the original corpus sizing
sizeAHCorpus = int(len(listAP) / 100) * ipctg  # pctg% of the total candidates
listCorpusAH = listAP[:sizeAHCorpus]
print("Size corpus AH =", sizeAHCorpus)
def getWikicatsFromText():
    """Extract wikicats and subjects for the POSTed text.

    Saves the text as <len>.txt in the corpus folder, reads cached wikicats
    (<len>.wk) when available or fetches+caches them (and subjects, <len>.sb)
    otherwise, adds per-wikicat component lists, and returns everything as
    JSON together with any previously selected wikicats (<len>.selected.wk).
    """
    if request.method == "POST":
        originalText = request.values.get("text")
        len_text = len(originalText)  # length of the received text; used as the cache key for all files
        if not os.path.exists(_CORPUS_FOLDER):  # create KORPUS folder if not exists
            os.makedirs(_CORPUS_FOLDER)
        filename = _CORPUS_FOLDER + "/" + str(len_text) + ".txt"  # save the received text with length.txt filename
        _saveFile(filename, originalText)
        filename_wk = _CORPUS_FOLDER + "/" + str(len_text) + ".wk"  # filename for wikicats (length.wk)
        filename_sb = _CORPUS_FOLDER + "/" + str(len_text) + ".sb"  # filename for subjects (length.sb)
        result = {}
        try:  # open wikicats file if exists
            with _Open(filename_wk) as fp:
                listWikicats = fp.read().splitlines()
            result["wikicats"] = listWikicats
        except:  # fetch wikicats if file does not exist yet
            result = _getCategoriesInText(originalText)  # function getCategoriesInText from px_DB_Manager.py
            if ("error" in result):  # return error if could not fetch wikicats
                return jsonify(result)
            listWikicats = list(filter(_filterSimpleWikicats, result["wikicats"]))  # remove simple wikicats with function from aux.py
            result["wikicats"] = listWikicats  # update result wikicats to return
            _saveFile(filename_wk, '\n'.join(listWikicats))  # save file (length.wk) with wikicats, one per line
            listSubjects = list(filter(_filterSimpleWikicats, result["subjects"]))  # remove simple subjects with function from aux.py
            result["subjects"] = listSubjects  # update result subjects to return
            _saveFile(filename_sb, '\n'.join(listSubjects))  # save file (length.sb) with subjects, one per line
        for w in listWikicats:  # compute components for every wikicat and add all of them to result
            wlc = _getWikicatComponents(w)  # function getWikicatComponets from aux.py
            result[w] = wlc  # one entry per wikicat
        filename_selected = _CORPUS_FOLDER + "/" + str(len_text) + ".selected.wk"  # previously selected wikicats file for this text
        try:  # try to open previously selected wikicats file if exists
            with _Open(filename_selected) as fp:
                wkSelectedList = fp.read().splitlines()
        except:
            wkSelectedList = []  # no previously selected wikicats
        result["formerSelectedWikicats"] = wkSelectedList
        return jsonify(result)
def saveFile(f, content):
    """Write content to file f, overwriting any previous content.

    Uses a context manager so the descriptor is closed even if write() raises
    (the original leaked it on error).
    """
    with _Open(f, 'w') as out:
        out.write(content)
def getUrlsLinked2Wikicats(selectedWikicats, logFilename):
    """For every selected wikicat, collect the DBpedia and Wikidata URLs linked
    to it: read them from the local URL cache when available, otherwise launch
    asynchronous SPARQL queries (phase 1), then gather all answers (phase 2)
    and save them locally for next time.

    Returns: dict wikicat -> {"db": [urls...], "wk": [urls...]}.
    """
    requestObjects = {}  # dictionary to store request objects
    _session = FuturesSession()  # to manage asynchronous requests
    # first phase, reading files or start requests for DBpedia and Wikidata foreach wikicat
    for wikicat in selectedWikicats:
        # first, read or fetch Wikicat results for DBpedia
        filename_db = _URLs_FOLDER + "/_Wikicat_" + wikicat + "_DB_Urls.txt"
        requestDone = 0  # to control if some request has been done, and if so, set a delay to not overload servers
        try:  # try to read wikicats of original text from local store
            with _Open(filename_db) as fp:
                urls_from_DB = fp.read().splitlines()
            print("File already available:", filename_db)
            requestObjects[wikicat] = {"dburls": urls_from_DB}  # store the local available DB URLs for this wikicat
        except:  # fetch data from DB
            fullWikicat = "Wikicat" + wikicat
            # asynchronous query to dbpedia
            # request only URLs being primaruy topic of some dbpedia entity
            queryDB = """
PREFIX yago: <http://dbpedia.org/class/yago/>
SELECT ?url ?der ?pt WHERE {
?url rdf:type yago:""" + fullWikicat + """ .
OPTIONAL {?url prov:wasDerivedFrom ?der}
OPTIONAL {?url foaf:isPrimaryTopicOf ?pt}
}
"""
            # start the DB query
            try:
                print("Starting DB query for: ", wikicat)
                requestDB = _session.post(_URL_DB, data={"query": queryDB}, headers={"accept": "application/json"})
            except Exception as exc:
                print("*** ERROR getUrlsLinked2Wikicats(): Error starting DB query for", wikicat, ":", exc)
                _appendFile(logFilename, "ERROR getUrlsLinked2Wikicats(): Error starting DB query for " + wikicat + ": " + repr(exc))
                requestDB = None
            requestObjects[wikicat] = {"db": requestDB}  # store the request DB object for this wikicat
            requestDone = 1
        # now, read or fetch Wikicat results for Wikidata
        filename_wk = _URLs_FOLDER + "/_Wikicat_" + wikicat + "_WK_Urls.txt"
        # it uses update with the objects dictionary, as the wikicat key has been already created for DBpedia
        wcs = _getWikicatComponents(wikicat)
        wcs_string = " ".join(wcs)
        try:  # try to read wikicats and subjects of original text from local store
            with _Open(filename_wk) as fp:
                urls_from_WK = fp.read().splitlines()
            print("File already available:", filename_wk)
            requestObjects[wikicat].update({"wkurls": urls_from_WK})  # store the local available WK URLs for this wikicat
        except:  # fetch data from WK
            # asynchronous query to Wikidata
            queryWK = """
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX mwapi: <https://www.mediawiki.org/ontology#API/>
SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api 'Search' .
bd:serviceParam wikibase:endpoint 'en.wikipedia.org' .
bd:serviceParam mwapi:language "en" .
bd:serviceParam mwapi:srsearch '""" + wcs_string + """' .
?title wikibase:apiOutput mwapi:title .
}
}
"""
            # start the WK query
            try:
                print("Starting WK query for: ", wcs_string)
                requestWK = _session.post(_URL_WK, data={"query": queryWK}, headers={"accept": "application/json"})
            except Exception as exc:
                print("\n*** ERROR getUrlsLinked2Wikicats(): Error starting WK query for", wcs_string, ":", exc)
                _appendFile(logFilename, "ERROR getUrlsLinked2Wikicats(): Error starting WK query for " + wcs_string + ": " + repr(exc))
                requestWK = None
            requestObjects[wikicat].update({"wk": requestWK})  # store the request WK object for this wikicat
            requestDone = 1
        if requestDone == 1:
            time.sleep(3)  # delay to avoid server rejects for too many queries
    print("\n** ALL PENDING QUERIES LAUNCHED\n")
    # End of the first phase. All queries launched. Now, for every wikicat, we have:
    # requestObjects[wikicat] = {"dburls": URLs} or {"db": requestDB}
    # and {"wkurls": URLS} or {"wk": requestWK}
    # let's build an object {"db": urlsDB, "wk": urlsWK} for each wikicat (each field is a URL list)
    urlsObjects = {}
    # Second phase. Now, read the results received from all queries
    for wikicat in selectedWikicats:
        # first, study results for DB
        try:
            urlsDB = requestObjects[wikicat]["dburls"]  # try to recover local DB results
        except:
            requestDB = requestObjects[wikicat]["db"]  # no local DB results, get the request DB object for this wikicat
            if requestDB == None:  # error starting DB query, return []
                urlsDB = []
            else:
                try:
                    try:
                        print("Waiting DB query result for:", wikicat)
                        responseDB = requestDB.result()  # waiting for DB query completion
                    except:
                        raise Exception("timeout")
                    if responseDB.status_code != 200:  # check if DB query ended correctly
                        raise Exception("answer is not 200, is " + str(responseDB.status_code))
                    try:
                        responseDBJson = responseDB.json()
                    except:
                        raise Exception("error decoding JSON")
                    try:
                        bindingsDB = responseDBJson["results"]["bindings"]
                    except:
                        raise Exception("no [results][bindings] in the answer")
                    # remove bindings with no pt field (isPrimaryTopicOf), because they don't correspond to DBpedia entities ???
                    bindingsDBwithPT = list(filter(_hasFieldPT, bindingsDB))
                    urlsDB = list(map(lambda x: x["pt"]["value"], bindingsDBwithPT))  # keep only the URL in x["pt"]["value"]
                    if len(urlsDB) > 0:
                        _saveFile(_URLs_FOLDER + "/_Wikicat_" + wikicat + "_DB_Urls.txt", '\n'.join(urlsDB))  # save all results from DB for this wikicat
                    else:
                        print("*** getUrlsLinked2Wikicats(): ", wikicat, " provided 0 DB URLs from " + str(len(bindingsDB)) + " results")
                        _appendFile(logFilename, "getUrlsLinked2Wikicats(): " + wikicat + " provided 0 DB URLs from " + str(len(bindingsDB)) + " results")
                except Exception as exc:
                    print("*** ERROR getUrlsLinked2Wikicats(): Error querying DB for", wikicat, ":", exc)
                    _appendFile(logFilename, "ERROR getUrlsLinked2Wikicats(): Error querying DB for " + wikicat + ": " + repr(exc))
                    urlsDB = []
        # end for DB, we already have urlsDB
        # second, study results for WK
        wcs = _getWikicatComponents(wikicat)
        wcs_string = " ".join(wcs)
        try:
            urlsWK = requestObjects[wikicat]["wkurls"]  # try to recover local WK results
        except:
            requestWK = requestObjects[wikicat]["wk"]  # no local WK results, get the request WK object for this wikicat
            # WK results come without prefix "https://en.wikipedia.org/wiki/", this function adds it
            def addWKPrefix(x):
                return "https://en.wikipedia.org/wiki/" + x["title"]["value"].replace(" ", "_")
            if requestWK == None:  # error starting WK query, return []
                urlsWK = []
            else:
                try:
                    try:
                        print("Waiting WK query result for:", wcs_string)
                        responseWK = requestWK.result()  # waiting for WK query completion
                    except:
                        raise Exception("timeout")
                    if responseWK.status_code != 200:  # check if WK query ended correctly
                        raise Exception("answer is not 200, is " + str(responseWK.status_code))
                    try:
                        responseWKJson = responseWK.json()
                    except:
                        raise Exception("error decoding JSON")
                    try:
                        bindingsWK = responseWKJson["results"]["bindings"]
                    except:
                        raise Exception("no [results][bindings] in the answer")
                    urlsWK = list(map(addWKPrefix, bindingsWK))  # add WK prefix to x["title"]["value"], changing space by '_'
                    if len(urlsWK) > 0:
                        _saveFile(_URLs_FOLDER + "/_Wikicat_" + wikicat + "_WK_Urls.txt", '\n'.join(urlsWK))  # save all results from WK for this wikicat
                    else:
                        print("*** getUrlsLinked2Wikicats(): ", wikicat, " provided 0 WK URLs")
                        _appendFile(logFilename, "getUrlsLinked2Wikicats(): " + wikicat + " provided 0 WK URLs")
                except Exception as exc:
                    print("*** ERROR getUrlsLinked2Wikicats(): Error querying WK for", wcs_string, ":", exc)
                    _appendFile(logFilename, "ERROR getUrlsLinked2Wikicats(): Error querying WK for " + wcs_string + ": " + repr(exc))
                    urlsWK = []
        # end for WK, we already have urlsWK
        # store results for this wikicat
        urlsObjects[wikicat] = {"db": urlsDB, "wk": urlsWK}
    print("\n** RECEIVED ALL RESULTS FOR PENDING QUERIES\n")
    return urlsObjects  # return results to buildCorpus function
def _candidatePageBase(page):
    # Map a page URL to (domain folder, base path of its local files).
    # "https://dom/dir1/dir2/page" -> ("dom", SCRAPPED_FOLDER + "/dom/dir1..dir2..page")
    pageWithoutHTTP = page[2 + page.find("//"):]
    domainFolder = pageWithoutHTTP[:pageWithoutHTTP.find("/")]
    onlyPage = pageWithoutHTTP[1 + pageWithoutHTTP.find("/"):]
    base = _SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder + "/" + onlyPage.replace("/", "..")
    return domainFolder, base


def _addToDistribution(distribution, sim):
    # Count 'sim' (expected in [0, 1]) into its tenth-wide bucket "0".."9".
    # (i + 1) / 10 yields exactly the same float as the literals 0.1 .. 0.9,
    # so these comparisons match the original explicit if/elif chain.
    for i in range(9):
        if sim < (i + 1) / 10:
            distribution[str(i)] += 1
            return
    distribution["9"] += 1


def _printDistribution(title, distribution, total):
    # Print, per bucket: count, percent and cumulative percent.
    # Guards total == 0 (the original code raised ZeroDivisionError there).
    print(title, total)
    denominator = total if total > 0 else 1
    accumulated = 0
    for i in range(10):
        count = distribution[str(i)]
        accumulated += count
        print("%d: %6d - %8.2f - %8.2f" %
              (i, count, 100 * count / denominator, 100 * accumulated / denominator))


def buildCorpus2():
    """Build the corpus for the wikicats selected in the interface.

    Reads 'text', 'wikicats' and 'overwriteCorpus' from the Flask request,
    gathers the URLs linked to the selected wikicats (DB + WK), downloads and
    cleans the candidate pages, computes their wikicats/subjects, measures
    Jaccard similarities against the original text, and returns a JSON object
    with per-wikicat URL counts, totals and similarity distributions.
    """
    logFilename = "corpus.log"
    # truncate the log file and stamp it with the start time
    logFile = _Open(logFilename, "w")
    logFile.write(str(datetime.now()) + "\n")
    logFile.close()

    originalText = request.values.get("text")  # get parameter with original text
    lenOriginalText = len(originalText)
    selectedWikicats = json.loads(request.values.get("wikicats"))  # get parameter with selected wikicats
    print("Number of selected wikicats:", len(selectedWikicats))
    numUrlsDB = 0
    numUrlsWK = 0

    # store the selected wikicats in the file $CORPUS_FOLDER/length.selected.wk
    _saveFile(_CORPUS_FOLDER + "/" + str(lenOriginalText) + ".selected.wk",
              '\n'.join(selectedWikicats))

    # read the original text subjects from local store (length.sb)
    filename_sb = _CORPUS_FOLDER + "/" + str(lenOriginalText) + ".sb"
    try:
        with _Open(filename_sb) as fp:
            sbOriginalText = fp.read().splitlines()
    except Exception:
        sbOriginalText = []  # no subjects for original text
        _appendFile(logFilename, "Subjects file not available: " + filename_sb)

    # if requested, remove the current corpus (URLs lists and scrapped pages)
    overwriteCorpus = json.loads(request.values.get("overwriteCorpus"))
    if overwriteCorpus:
        print("Deleting current URLs lists...")
        shutil.rmtree(_URLs_FOLDER)
        print("Deleting current scrapped texts...")
        shutil.rmtree(_SCRAPPED_TEXT_PAGES_FOLDER)

    # the URLs folder must exist BEFORE calling getUrlsLinked2Wikicats,
    # which stores fetched URL lists there
    if not os.path.exists(_URLs_FOLDER):
        os.makedirs(_URLs_FOLDER)
    if not os.path.exists(_SCRAPPED_TEXT_PAGES_FOLDER):
        os.makedirs(_SCRAPPED_TEXT_PAGES_FOLDER)

    # get the URLs associated to the selected wikicats (local files first,
    # Internet otherwise); urlsObjects[wikicat] = {"db": [...], "wk": [...]}
    urlsObjects = getUrlsLinked2Wikicats(selectedWikicats, logFilename)

    result = {}    # object to store the results to be returned to the request
    fullList = []  # aggregated list of URLs for all wikicats
    print("Number of URLs for every wikicat: ", end='')
    for wikicat in selectedWikicats:
        dbUrls = urlsObjects[wikicat]["db"]  # DB URLs for this wikicat
        numUrlsDB += len(dbUrls)
        fullList.extend(dbUrls)
        wkUrls = urlsObjects[wikicat]["wk"]  # WK URLs for this wikicat
        numUrlsWK += len(wkUrls)
        fullList.extend(wkUrls)
        longs1 = "(DB=" + str(len(dbUrls)) + ", WK=" + str(len(wkUrls)) + ")"
        print(wikicat, longs1, end=', ')
        result[wikicat] = {"db": len(dbUrls), "wk": len(wkUrls)}

    listWithoutDuplicates = list(set(fullList))  # remove duplicated URLs
    lenOfListWithoutDuplicates = len(listWithoutDuplicates)
    print("\n\nSummary of URLs numbers: DB=", numUrlsDB, ", WK= ", numUrlsWK,
          ", total without duplicates=", lenOfListWithoutDuplicates)
    _appendFile(logFilename,
                "Number of discovered URLs: " + str(lenOfListWithoutDuplicates))

    result["totalDB"] = numUrlsDB
    result["totalWK"] = numUrlsWK
    result["totalUrls"] = lenOfListWithoutDuplicates
    # return jsonify(result)  # uncomment to return to the interface without processing files

    if aux.PSTOP:
        input("Type ENTER to continue...")

    # --- Phase: download and clean every candidate page not in the local store ---
    print("\n Downloading and cleaning candidate texts...")
    scrap = _scrapFunctions()     # scrapFunctions object to clean pages
    unretrieved_pages_list = []   # pages whose retrieval failed
    nowDownloaded = 0             # files downloaded from Internet in this iteration
    listEnoughContent = []        # pages with >= _CORPUS_MIN_TXT_SIZE bytes
    listNotEnoughContent = []     # pages below that threshold
    for idx, page in enumerate(listWithoutDuplicates, start=1):
        print("(", idx, "of", lenOfListWithoutDuplicates, ") -- ", page)
        # pages are stored classified by domain, one folder per domain
        domainFolder, fileNameCandidateBase = _candidatePageBase(page)
        if not os.path.exists(_SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder):
            os.makedirs(_SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder)
        fileNameCandidate = fileNameCandidateBase + ".txt"
        if os.path.exists(fileNameCandidate):
            print("File already available in local DB:", fileNameCandidate)
            fsize = os.path.getsize(fileNameCandidate)
            if fsize < _CORPUS_MIN_TXT_SIZE:
                listNotEnoughContent.append(page)
            else:
                listEnoughContent.append(page)
        else:  # fetch file if not exists
            try:
                pageName, pageContent = scrap.scrapPage(page)  # pageName is not used
                nowDownloaded += 1
                _saveFile(fileNameCandidate, pageContent)
                print("File", str(nowDownloaded), "downloaded and saved it:",
                      fileNameCandidate)
                if len(pageContent) < _CORPUS_MIN_TXT_SIZE:
                    listNotEnoughContent.append(page)
                else:
                    listEnoughContent.append(page)
            except Exception as exc:
                _appendFile(logFilename,
                            "Page " + page + " could not be retrieved: " + repr(exc))
                unretrieved_pages_list.append(page)

    print("")
    print(str(len(unretrieved_pages_list)) + " unretrieved pages")
    _saveFile(_UNRETRIEVED_PAGES_FILENAME, '\n'.join(unretrieved_pages_list))
    lenListEnoughContent = len(listEnoughContent)
    _appendFile(logFilename,
                "Number of available pages with enough content: " + str(lenListEnoughContent))
    print("ALL PAGES AVAILABLE AND CLEANED.")
    print("New pages downloaded in this iteration:", str(nowDownloaded))
    print("Number of pages with enough content:", str(lenListEnoughContent))
    print("Number of pages without enough content:", str(len(listNotEnoughContent)))
    if aux.PSTOP:
        input("Type ENTER to continue...")

    # NOTE: CSV export of per-page similarities is temporarily disabled until
    # it is clear what should be stored in it.

    # --- Phase: identify wikicats and subjects for every candidate text ---
    print("")
    print("Identifying wikicats and subjects for candidate texts with DBpedia SpotLight...")
    currentDownloaded = 0
    listWithWikicats = []     # pages with available wikicats
    listWithoutWikicats = []  # pages with no wikicats
    for idx, page in enumerate(listEnoughContent, start=1):
        print("\n(", idx, "of", lenListEnoughContent, ") -- ", page)
        domainFolder, fileNameCandidateBase = _candidatePageBase(page)
        fileNameCandidate = fileNameCandidateBase + ".txt"
        fileNameCandidateWikicats = fileNameCandidateBase + ".wk"  # wikicats file
        fileNameCandidateSubjects = fileNameCandidateBase + ".sb"  # subjects file
        # if both files (wikicats and subjects) exist, use the local copies
        if os.path.exists(fileNameCandidateWikicats) and os.path.exists(fileNameCandidateSubjects):
            print("Files WK and SB already available in local DB for", fileNameCandidate)
            fwsize = os.path.getsize(fileNameCandidateWikicats)
            fssize = os.path.getsize(fileNameCandidateSubjects)
            # an empty file means no wikicats or no subjects: page is unusable
            if (fwsize == 0) or (fssize == 0):
                listWithoutWikicats.append(page)
            else:
                listWithWikicats.append(page)
            continue
        # otherwise, fetch wikicats and subjects for this candidate text
        try:
            with _Open(fileNameCandidate, "r") as candidateTextFile:
                candidate_text = candidateTextFile.read()
            print("Reading candidate text file:", fileNameCandidate)
        except Exception:
            # file that inexplicably could not be read from local store
            _appendFile(logFilename,
                        "ERROR buildCorpus2(): Unavailable candidate file, not in the store, but it should be: " + fileNameCandidate)
            listWithoutWikicats.append(page)
            continue
        print("Computing wikicats and subjects for:", page)
        candidate_text_categories = _getCategoriesInText(candidate_text)
        if "error" in candidate_text_categories:  # error while fetching info
            _appendFile(logFilename,
                        "ERROR buildCorpus2(): Problem in _getCategoriesInText(candidate_text): " + candidate_text_categories["error"])
            listWithoutWikicats.append(page)
            continue
        print("Wikicats and subjects downloaded for", fileNameCandidate)
        # remove simple wikicats/subjects with the filters from aux.py
        candidate_text_wikicats = list(
            filter(_filterSimpleWikicats, candidate_text_categories["wikicats"]))
        candidate_text_subjects = list(
            filter(_filterSimpleSubjects, candidate_text_categories["subjects"]))
        _saveFile(fileNameCandidateWikicats, '\n'.join(candidate_text_wikicats))
        _saveFile(fileNameCandidateSubjects, '\n'.join(candidate_text_subjects))
        currentDownloaded += 1
        # if no wikicats or no subjects, the page will not be used
        if (len(candidate_text_wikicats) == 0) or (len(candidate_text_subjects) == 0):
            listWithoutWikicats.append(page)
        else:
            listWithWikicats.append(page)

    lenListWithWikicats = len(listWithWikicats)
    _appendFile(logFilename,
                "Number of available pages with wikicats and subjects: " + str(lenListWithWikicats))
    print("")
    print("ALL WIKICATs AND SUBJECTs COMPUTED.")
    print("New items computed in this iteration:", str(currentDownloaded))
    print("Number of pages with wikicats:", str(len(listWithWikicats)))
    print("Number of pages without wikicats:", str(len(listWithoutWikicats)))
    if aux.PSTOP:
        input("Type ENTER to continue...")

    # --- Phase: measure similarity between the original text and each candidate ---
    print("\n Computing similarities...")
    discarded_pages_list = []  # currently never filled; kept for the output file
    similarity = _textSimilarityFunctions()
    # triplets (filenameCandidate, similarityByWikicats, similarityBySubjects)
    sims_wk_sb = []
    distribution_wk = {str(i): 0 for i in range(10)}
    distribution_sb = {str(i): 0 for i in range(10)}
    for idx, page in enumerate(listWithWikicats, start=1):
        print("(", idx, "of", lenListWithWikicats, ") -- ", page)
        domainFolder, fileNameCandidateBase = _candidatePageBase(page)
        fileNameCandidate = fileNameCandidateBase + ".txt"
        fileNameCandidateWikicats = fileNameCandidateBase + ".wk"
        fileNameCandidateSubjects = fileNameCandidateBase + ".sb"
        # Measure wikicats/subjects similarity (shared matching)
        shared_wikicats_jaccard_similarity = similarity.sharedWikicatsSimilarity(
            selectedWikicats, fileNameCandidateWikicats, logFilename)
        print("Wikicats shared jaccard similarity = " + str(shared_wikicats_jaccard_similarity))
        shared_subjects_jaccard_similarity = similarity.sharedSubjectsSimilarity(
            sbOriginalText, fileNameCandidateSubjects, logFilename)
        print("Subjects shared jaccard similarity = " + str(shared_subjects_jaccard_similarity))
        sims_wk_sb.append((fileNameCandidate,
                           shared_wikicats_jaccard_similarity,
                           shared_subjects_jaccard_similarity))
        # -1 flags a computation error; otherwise count into the distribution
        if shared_wikicats_jaccard_similarity == -1:
            _appendFile(logFilename,
                        "ERROR computing sharedWikicatsJaccard: " + fileNameCandidateWikicats)
        else:
            _addToDistribution(distribution_wk, shared_wikicats_jaccard_similarity)
        if shared_subjects_jaccard_similarity == -1:
            _appendFile(logFilename,
                        "ERROR computing sharedSubjectsJaccard: " + fileNameCandidateSubjects)
        else:
            _addToDistribution(distribution_sb, shared_subjects_jaccard_similarity)

    # Minimum similarity for a page to be accepted.
    # THE MOST RELEVANT CRITERIUM TO DECIDE ON IT MUST STILL BE STUDIED.
    min_similarity = 0.3  # review this threshold
    both_above_min = [t for t in sims_wk_sb
                      if t[1] > min_similarity and t[2] > min_similarity]
    _appendFile(logFilename,
                "Number of pages with both similarities above " + str(min_similarity) +
                " = " + str(len(both_above_min)))
    print("Number of pages with both similarities above", min_similarity, "=",
          len(both_above_min))
    sims_wk_sb_str = [t[0] + " " + str(t[1]) + " " + str(t[2]) for t in sims_wk_sb]
    _saveFile(_CORPUS_FOLDER + "/" + str(lenOriginalText) + ".sims",
              '\n'.join(sims_wk_sb_str))
    result["distribution_wk"] = distribution_wk
    result["distribution_sb"] = distribution_sb
    # Save the discarded_pages_list to a file (currently always empty)
    _saveFile(_DISCARDED_PAGES_FILENAME, '\n'.join(discarded_pages_list))

    # print both distributions: count, percent, cumulative percent per bucket
    _printDistribution("TOTAL WIKICATS = ", distribution_wk, lenListWithWikicats)
    _printDistribution("TOTAL SUBJECTS = ", distribution_sb, lenListWithWikicats)
    return jsonify(result)
def appendFile(f, line):
    """Append 'line' to file 'f', prefixed with the current timestamp.

    NOTE(review): this definition is overridden by a later re-definition of
    appendFile in this module that writes the line WITHOUT a timestamp; only
    the last definition is effective at runtime.
    """
    # 'with' guarantees the handle is closed even if write() raises
    # (_Open is used as a context manager elsewhere in this file)
    with _Open(f, "a") as fd:
        fd.write(str(datetime.now()) + ": " + line + "\n")
def appendFile(f, line):
    """Append 'line' (plus a newline) to file 'f'.

    NOTE(review): this re-definition shadows the timestamped appendFile
    defined just above it, so the timestamped variant is dead code; this
    un-timestamped version is the one in effect.
    """
    # 'with' guarantees the handle is closed even if write() raises
    # (_Open is used as a context manager elsewhere in this file)
    with _Open(f, "a") as fd:
        fd.write(line + "\n")
# this program has been launched in the Plethora/buildCorpus folder # this is to search px_DB_Manager and px_aux in the Plethora folder # such modules are not needed here, but in routesCorpus and routesCorpus2 modules loaded next sys.path.append('../') # functions to be executed when Flask requests are received from routesCorpus import doPh1getWikicatsFromText as _doPh1getWikicatsFromText, doPh2getUrlsCandidateFiles as _doPh2getUrlsCandidateFiles from routesCorpus import getWikicatUrls as _getWikicatUrls from routesCorpus import doPh3downloadCandidateTexts as _doPh3downloadCandidateTexts, doPh4identifyWikicats as _doPh4identifyWikicats from routesCorpus import doPh5computeSimilarities as _doPh5computeSimilarities, doPh6trainD2V as _doPh6trainD2V, doPh7reviewCorpus as _doPh7reviewCorpus from aux_build import INITIAL_TEXT as _INITIAL_TEXT import aux_build import px_aux # load the initial text shown at the beginning of the interface initialTextFile = _Open(_INITIAL_TEXT, "r") initialText = initialTextFile.read() FLAB = False # to control if buttons must show additional label details (change to True if argument -l) # the following is only executed if this is the main program, that is, if we launch the corpus tool directly from the 'buildCorpus' folder # not executed if we launch the corpus tool from the main tool, as the 'app' object is already available from the main tool if __name__ == '__main__': import os # Flask is a module to launch a web server. It permits to map a function for each request template from flask import Flask, render_template, request, flash, json, jsonify, redirect, url_for, send_from_directory # templates dir is shared with the main tool because it is possible for this tool to be called from the main one template_dir = os.path.abspath('../templates') # Create the Flask app to manage the HTTP request