Example #1
	def fullSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):

		try:  # try to read candidate text subjects from local DB
			with _Open(fileNameCandidateSubjects) as fp:
				candidate_text_subjects = fp.read().splitlines()
		except Exception as e:
			_Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
			_appendFile(self.logFilename, "ERROR fullSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
			return -1

		if len(self.original_text_subjects) == 0 or len(candidate_text_subjects) == 0:
			return 0

		subjects_jaccard_similarity = self.oMeasures.oJaccardSimilarity(self.original_text_subjects, candidate_text_subjects)

		return subjects_jaccard_similarity
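
The helper oJaccardSimilarity is not shown in this example; a minimal sketch of the plain set-based Jaccard index it presumably computes (the standalone function below is an assumption, not the project's actual implementation):

def oJaccardSimilarity(list_a, list_b):
    # assumed behaviour: Jaccard index of two lists of labels, |A intersection B| / |A union B|
    set_a, set_b = set(list_a), set(list_b)
    union = set_a | set_b
    if not union:  # both lists empty: define the similarity as 0
        return 0
    return len(set_a & set_b) / len(union)

# usage sketch with illustrative data
print(oJaccardSimilarity(["a", "b", "c"], ["b", "c", "d"]))  # 2/4 = 0.5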
Example #2
	def sharedSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):

		try:  # try to read candidate text subjects from local DB
			with _Open(fileNameCandidateSubjects) as fp:
				candidate_text_subjects = fp.read().splitlines()
		except Exception as e:
			_Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
			return -1

		if len(candidate_text_subjects) == 0:
			return 0

		# the subjects lists for both texts are now available
		subjects_jaccard_similarity = 0

		try:
			# replace every candidate subject with the pair (subject, list of subject components)
			pairs_candidate_text_subjects = list(map(lambda x: (x, _getSubjectComponents(x)), candidate_text_subjects))

			numContributions = 0  # number of subject pairs that contribute some similarity
			sum_sims = 0  # accumulated similarity contributions

			for (sbo,sbocl) in self.pairs_original_text_subjects:
				for (sbc,sbccl) in pairs_candidate_text_subjects:
					min_long = min(len(sbocl), len(sbccl)) # length of the shorter subject

					if (min_long < 3):  # both subjects must have at least 3 components
						continue

					intersection_cardinality = len(set.intersection(set(sbocl), set(sbccl)))

					# for the shorter subject, we require at most 1 component not to be included in the larger subject
					if (intersection_cardinality < (min_long - 1)):
						continue

					# this fulfills the requirements: it is a contribution

					numContributions += 1
					union_cardinality = len(set.union(set(sbocl), set(sbccl)))
					component_jaccard_similarity = intersection_cardinality/float(union_cardinality)
					sum_sims += component_jaccard_similarity
					_Print(numContributions, "->", sbo, ",", sbc, component_jaccard_similarity)

			if numContributions == 0:  # no pair of subjects contributed any similarity
				return 0

			subjects_jaccard_similarity = sum_sims / numContributions
		except Exception as e:
			_Print("ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
			return -1

		if subjects_jaccard_similarity > 1:
			_Print("Candidate with subjects similarity > 1:", fileNameCandidateSubjects, sum_sims, denominator, subjects_jaccard_similarity)
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): similarity > 1")
			return -1

		return subjects_jaccard_similarity
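
The matching above depends on _getSubjectComponents, defined elsewhere in the project; a minimal sketch, assuming subjects are underscore- or space-separated labels such as "Machine_learning_algorithms" (the splitting rule is a guess):

import re

def _getSubjectComponents(subject):
    # hypothetical sketch: split a subject label into lowercase word components,
    # e.g. "Machine_learning_algorithms" -> ["machine", "learning", "algorithms"]
    return [w.lower() for w in re.split(r"[_\s]+", subject) if w]

With this splitting, the loop above only considers subjects with at least 3 components and accepts a pair when the shorter subject has at most one component missing from the other.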
Example #3
    def sharedSubjectsSimilarity(self, original_text_subjects,
                                 fileNameCandidateSubjects, logFilename):

        try:  # try to read candidate text subjects from local store
            with _Open(fileNameCandidateSubjects) as fp:
                candidate_text_subjects = fp.read().splitlines()
                print("File already available in local DB:",
                      fileNameCandidateSubjects)
        except Exception as e:  # candidate subjects file not available locally; log the error and give up
            _appendFile(
                logFilename,
                "ERROR sharedSubjectsSimilarity(): Subjects file not available: "
                + fileNameCandidateSubjects + " " + str(e))
            return -1

        if len(candidate_text_subjects) == 0:
            _appendFile(
                logFilename,
                "ERROR sharedSubjectsSimilarity(): Subjects file empty: " +
                fileNameCandidateSubjects)
            return -1

        # the subjects lists for both texts are now available

        try:
            # replace every original subject with the pair (subject, list of subject components); note this is recomputed on every call and could be cached by the caller
            pairs_original_text_subjects = list(
                map(lambda x: (x, _getSubjectComponents(x)),
                    original_text_subjects))

            # replace every candidate subject with the pair (subject, list of subject components)
            pairs_candidate_text_subjects = list(
                map(lambda x: (x, _getSubjectComponents(x)),
                    candidate_text_subjects))

            sum_sims = 0
            for (wko, wkocl) in pairs_original_text_subjects:
                for (wkc, wkccl) in pairs_candidate_text_subjects:
                    wkc_jaccard_similarity = self.measures.oJaccardSimilarity(
                        wkocl, wkccl)
                    sum_sims += wkc_jaccard_similarity

            union_cardinality = len(
                set.union(set(original_text_subjects),
                          set(candidate_text_subjects)))

            if union_cardinality == 0:  # cannot happen here, since candidate_text_subjects is not empty; kept as a safeguard
                return -1
            else:
                subjects_jaccard_similarity = sum_sims / union_cardinality
        except Exception as e:
            _appendFile(
                logFilename,
                "ERROR sharedSubjectsSimilarity(): Exception while computing Jaccard subjects similarity: "
                + str(e))
            return -1

        return subjects_jaccard_similarity
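
For illustration, a hedged worked example of the normalisation used above (all data and the stand-in jaccard and components helpers are assumptions; the real code calls self.measures.oJaccardSimilarity on the component lists):

def jaccard(a, b):
    # stand-in for measures.oJaccardSimilarity on component lists
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb) if (sa | sb) else 0

def components(subject):
    # same assumption as the _getSubjectComponents sketch above
    return subject.lower().split("_")

original_subjects = ["Machine_learning", "Artificial_intelligence"]
candidate_subjects = ["Machine_learning_algorithms", "Artificial_neural_networks"]

sum_sims = sum(jaccard(components(o), components(c))
               for o in original_subjects for c in candidate_subjects)
union_cardinality = len(set(original_subjects) | set(candidate_subjects))
print(sum_sims / union_cardinality)  # the value sharedSubjectsSimilarity would return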
Example #4
def getUrlsLinked2Wikicats(selectedWikicats, logFilename):
    requestObjects = {}  # dictionary to store request objects

    _session = FuturesSession()  # to manage asynchronous requests

    # first phase: read local files or launch requests to DBpedia and Wikidata for each wikicat

    for wikicat in selectedWikicats:

        # first, read or fetch Wikicat results for DBpedia

        filename_db = _URLs_FOLDER + "/_Wikicat_" + wikicat + "_DB_Urls.txt"
        requestDone = 0  # set to 1 when a remote request is launched, so a delay can be added to avoid overloading the servers

        try:  # try to read the DBpedia URLs for this wikicat from the local store
            with _Open(filename_db) as fp:
                urls_from_DB = fp.read().splitlines()
                print("File already available:", filename_db)
                requestObjects[wikicat] = {
                    "dburls": urls_from_DB
                }  # store the local available DB URLs for this wikicat
        except:  # fetch data from DB
            fullWikicat = "Wikicat" + wikicat

            # asynchronous query to DBpedia
            # request only URLs that are the primary topic of some DBpedia entity
            queryDB = """
			PREFIX yago: <http://dbpedia.org/class/yago/>
			SELECT ?url ?der ?pt WHERE {
				?url  rdf:type yago:""" + fullWikicat + """ .
				OPTIONAL {?url  prov:wasDerivedFrom ?der}
				OPTIONAL {?url  foaf:isPrimaryTopicOf ?pt}
			}
			"""

            # start the DB query
            try:
                print("Starting DB query for: ", wikicat)
                requestDB = _session.post(
                    _URL_DB,
                    data={"query": queryDB},
                    headers={"accept": "application/json"})
            except Exception as exc:
                print(
                    "*** ERROR getUrlsLinked2Wikicats(): Error starting DB query for",
                    wikicat, ":", exc)
                _appendFile(
                    logFilename,
                    "ERROR getUrlsLinked2Wikicats(): Error starting DB query for "
                    + wikicat + ": " + repr(exc))
                requestDB = None

            requestObjects[wikicat] = {
                "db": requestDB
            }  # store the request DB object for this wikicat
            requestDone = 1

        # now, read or fetch Wikicat results for Wikidata

        filename_wk = _URLs_FOLDER + "/_Wikicat_" + wikicat + "_WK_Urls.txt"

        # use update() on the requestObjects dictionary, since the wikicat key was already created for DBpedia

        wcs = _getWikicatComponents(wikicat)
        wcs_string = " ".join(wcs)

        try:  # try to read the Wikidata URLs for this wikicat from the local store
            with _Open(filename_wk) as fp:
                urls_from_WK = fp.read().splitlines()
                print("File already available:", filename_wk)
                requestObjects[wikicat].update({
                    "wkurls": urls_from_WK
                })  # store the local available WK URLs for this wikicat
        except:  # fetch data from WK

            # asynchronous query to Wikidata
            queryWK = """
			PREFIX wikibase: <http://wikiba.se/ontology#>
			PREFIX bd: <http://www.bigdata.com/rdf#>
			PREFIX mwapi: <https://www.mediawiki.org/ontology#API/>
			SELECT * WHERE {
				SERVICE wikibase:mwapi {
					bd:serviceParam wikibase:api 'Search' .
					bd:serviceParam wikibase:endpoint 'en.wikipedia.org' .
					bd:serviceParam mwapi:language "en" .
					bd:serviceParam mwapi:srsearch '""" + wcs_string + """' .
					?title wikibase:apiOutput mwapi:title .
				}
			} 		
			"""
            # start the WK query
            try:
                print("Starting WK query for: ", wcs_string)
                requestWK = _session.post(
                    _URL_WK,
                    data={"query": queryWK},
                    headers={"accept": "application/json"})
            except Exception as exc:
                print(
                    "\n*** ERROR getUrlsLinked2Wikicats(): Error starting WK query for",
                    wcs_string, ":", exc)
                _appendFile(
                    logFilename,
                    "ERROR getUrlsLinked2Wikicats(): Error starting WK query for "
                    + wcs_string + ": " + repr(exc))
                requestWK = None

            requestObjects[wikicat].update(
                {"wk":
                 requestWK})  # store the request WK object for this wikicat
            requestDone = 1

        if requestDone == 1:
            time.sleep(3)  # delay to avoid server rejects for too many queries

    print("\n** ALL PENDING QUERIES LAUNCHED\n")

    # End of the first phase. All queries launched. Now, for every wikicat, we have:
    # requestObjects[wikicat] = {"dburls": URLs} or  {"db": requestDB}
    #                       and {"wkurls": URLS} or  {"wk": requestWK}

    # let's build an object {"db": urlsDB, "wk": urlsWK} for each wikicat (each field is a URL list)
    urlsObjects = {}

    # Second phase. Now, read the results received from all queries

    for wikicat in selectedWikicats:

        # first, study results for DB

        try:
            urlsDB = requestObjects[wikicat][
                "dburls"]  # try to recover local DB results
        except:
            requestDB = requestObjects[wikicat][
                "db"]  # no local DB results, get the request DB object for this wikicat

            if requestDB is None:  # error starting DB query, return []
                urlsDB = []
            else:
                try:
                    try:
                        print("Waiting DB query result for:", wikicat)
                        responseDB = requestDB.result(
                        )  # waiting for DB query completion
                    except:
                        raise Exception("timeout")

                    if responseDB.status_code != 200:  # check if DB query ended correctly
                        raise Exception("answer is not 200, is " +
                                        str(responseDB.status_code))

                    try:
                        responseDBJson = responseDB.json()
                    except:
                        raise Exception("error decoding JSON")

                    try:
                        bindingsDB = responseDBJson["results"]["bindings"]
                    except:
                        raise Exception("no [results][bindings] in the answer")

                    # remove bindings with no 'pt' field (isPrimaryTopicOf), since they presumably do not correspond to DBpedia entities
                    bindingsDBwithPT = list(filter(_hasFieldPT, bindingsDB))
                    urlsDB = list(
                        map(lambda x: x["pt"]["value"], bindingsDBwithPT)
                    )  # keep only the URL in x["pt"]["value"]

                    if len(urlsDB) > 0:
                        _saveFile(
                            _URLs_FOLDER + "/_Wikicat_" + wikicat +
                            "_DB_Urls.txt", '\n'.join(urlsDB)
                        )  # save all results from DB for this wikicat
                    else:
                        print(
                            "*** getUrlsLinked2Wikicats(): ", wikicat,
                            " provided 0 DB URLs from " +
                            str(len(bindingsDB)) + " results")
                        _appendFile(
                            logFilename, "getUrlsLinked2Wikicats(): " +
                            wikicat + " provided 0 DB URLs from " +
                            str(len(bindingsDB)) + " results")

                except Exception as exc:
                    print(
                        "*** ERROR getUrlsLinked2Wikicats(): Error querying DB for",
                        wikicat, ":", exc)
                    _appendFile(
                        logFilename,
                        "ERROR getUrlsLinked2Wikicats(): Error querying DB for "
                        + wikicat + ": " + repr(exc))
                    urlsDB = []

        # end for DB, we already have urlsDB

        # second, study results for WK

        wcs = _getWikicatComponents(wikicat)
        wcs_string = " ".join(wcs)

        try:
            urlsWK = requestObjects[wikicat][
                "wkurls"]  # try to recover local WK results
        except:
            requestWK = requestObjects[wikicat][
                "wk"]  # no local WK results, get the request WK object for this wikicat

            # WK results come without prefix "https://en.wikipedia.org/wiki/", this function adds it
            def addWKPrefix(x):
                return "https://en.wikipedia.org/wiki/" + x["title"][
                    "value"].replace(" ", "_")

            if requestWK is None:  # error starting WK query, return []
                urlsWK = []
            else:
                try:
                    try:
                        print("Waiting WK query result for:", wikicat)
                        responseWK = requestWK.result(
                        )  # waiting for WK query completion
                    except:
                        raise Exception("timeout")

                    if responseWK.status_code != 200:  # check if WK query ended correctly
                        raise Exception("answer is not 200, is " +
                                        str(responseWK.status_code))

                    try:
                        responseWKJson = responseWK.json()
                    except:
                        raise Exception("error decoding JSON")

                    try:
                        bindingsWK = responseWKJson["results"]["bindings"]
                    except:
                        raise Exception("no [results][bindings] in the answer")

                    urlsWK = list(
                        map(addWKPrefix, bindingsWK)
                    )  # add WK prefix to x["title"]["value"], changing space by '_'

                    if len(urlsWK) > 0:
                        _saveFile(
                            _URLs_FOLDER + "/_Wikicat_" + wikicat +
                            "_WK_Urls.txt", '\n'.join(urlsWK)
                        )  # save all results from WK for this wikicat
                    else:
                        print("*** getUrlsLinked2Wikicats(): ", wikicat,
                              " provided 0 WK URLs")
                        _appendFile(
                            logFilename, "getUrlsLinked2Wikicats(): " +
                            wikicat + " provided 0 WK URLs")

                except Exception as exc:
                    print(
                        "*** ERROR getUrlsLinked2Wikicats(): Error querying WK for",
                        wcs_string, ":", exc)
                    _appendFile(
                        logFilename,
                        "ERROR getUrlsLinked2Wikicats(): Error querying WK for "
                        + wcs_string + ": " + repr(exc))
                    urlsWK = []

        # end for WK, we already have urlsWK

        # store results for this wikicat
        urlsObjects[wikicat] = {"db": urlsDB, "wk": urlsWK}

    print("\n** RECEIVED ALL RESULTS FOR PENDING QUERIES\n")

    return urlsObjects  # return results to buildCorpus function
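
The filter _hasFieldPT used when processing the DBpedia bindings is defined elsewhere; judging only from its usage here, a minimal sketch might be:

def _hasFieldPT(binding):
    # keep only SPARQL result bindings that include the optional 'pt'
    # (foaf:isPrimaryTopicOf) field; sketch inferred from the usage above
    return "pt" in binding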
Example #5
def buildCorpus2():

    logFilename = "corpus.log"
    logFile = _Open(logFilename, "w")
    logFile.write(str(datetime.now()) + "\n")
    logFile.close()

    originalText = request.values.get(
        "text")  # get parameter with original text
    lenOriginalText = len(originalText)

    selectedWikicats = json.loads(
        request.values.get("wikicats"))  # get parameter with selected wikicats
    print("Number of selected wikicats:", len(selectedWikicats))
    numUrlsDB = 0
    numUrlsWK = 0

    # store the selected wikicats in the file $CORPUS_FOLDER/length.selected.wk
    _saveFile(_CORPUS_FOLDER + "/" + str(lenOriginalText) + ".selected.wk",
              '\n'.join(selectedWikicats))

    # read the original text subjects from local store
    filename_sb = _CORPUS_FOLDER + "/" + str(
        lenOriginalText) + ".sb"  # filename for subjects (length.sb)
    try:
        with _Open(filename_sb) as fp:
            sbOriginalText = fp.read().splitlines()
    except:
        sbOriginalText = []  # no subjects for original text
        _appendFile(logFilename, "Subjects file not available: " + filename_sb)

    # Now, we have wikicats in 'selectedWikicats' and subjects in 'sbOriginalText'

    overwriteCorpus = json.loads(
        request.values.get("overwriteCorpus")
    )  # read the flag parameter overwriteCorpus from request

    if overwriteCorpus:  # if overwriteCorpus, remove current corpus  (URLs, scrapped pages and wikicats files)
        print("Deleting current URLs lists...")
        shutil.rmtree(_URLs_FOLDER)
        print("Deleting current scrapped texts...")
        shutil.rmtree(_SCRAPPED_TEXT_PAGES_FOLDER)

    # create the folder to store two files per wikicat, with the URLs linked to that wikicat coming from DB and WK
    # it must exist before calling getUrlsLinked2Wikicats, which stores files there when it has to fetch them

    if not os.path.exists(_URLs_FOLDER):
        os.makedirs(_URLs_FOLDER)

    if not os.path.exists(
            _SCRAPPED_TEXT_PAGES_FOLDER
    ):  # create the folder to store scrapped pages and wikicat files
        os.makedirs(_SCRAPPED_TEXT_PAGES_FOLDER)

    # now get the URLs associated with any of those wikicats (see getUrlsLinked2Wikicats)
    # it reads from local files if they exist; otherwise it connects to the Internet to fetch them and stores them locally

    urlsObjects = getUrlsLinked2Wikicats(selectedWikicats, logFilename)

    # a dictionary entry has been received for each wikicat   urlsObjects[wikicat] = {"db": urlsDB, "wk": urlsWK}
    # urlsDB and urlsWK are lists of URLs

    result = {}  # object to store the results to be returned to the request
    fullList = []  # to aggregate the full list of URLs for all wikicats

    # process all results to return

    print("Number of URLs for every wikicat: ", end='')

    for wikicat in selectedWikicats:

        # first, the results from DB

        dbUrls = urlsObjects[wikicat]["db"]  # get the set of DB URLs
        numUrlsDB += len(dbUrls)

        fullList.extend(
            dbUrls)  # add the DB URLs of current wikicat to the whole list

        # now, the results from WK

        wkUrls = urlsObjects[wikicat]["wk"]
        numUrlsWK += len(wkUrls)

        fullList.extend(wkUrls)

        longs1 = "(DB=" + str(len(dbUrls)) + ", WK=" + str(len(wkUrls)) + ")"
        print(wikicat, longs1, end=', ')
        result[wikicat] = {
            "db": len(dbUrls),
            "wk": len(wkUrls)
        }  # add results for this wikicat

    listWithoutDuplicates = list(set(fullList))  # remove duplicated URLs
    lenOfListWithoutDuplicates = len(
        listWithoutDuplicates)  # length of full list to process
    print("\n\nSummary of URLs numbers: DB=", numUrlsDB, ", WK= ", numUrlsWK,
          ", total without duplicates=", lenOfListWithoutDuplicates)

    _appendFile(
        logFilename,
        "Number of discovered URLs: " + str(lenOfListWithoutDuplicates))

    # return the result; its items so far are only the counts of discovered URLs
    result["totalDB"] = numUrlsDB
    result["totalWK"] = numUrlsWK
    result["totalUrls"] = len(listWithoutDuplicates)
    # return jsonify(result);  # uncomment to return to the interface without processing files

    if aux.PSTOP == True:
        input("Type ENTER to continue...")

    ###  We've got the first set of relevant URLs, available in listWithoutDuplicates, and stored in the URLs folder
    ###  Let's start the analysis of their contents

    print("\n Downloading and cleaning candidate texts...")

    scrap = _scrapFunctions()  # Create a scrapFunctions object to clean pages
    unretrieved_pages_list = []  # a list for unsuccessful pages retrieval

    nowDownloaded = 0  # number of files downloaded from Internet in this iteration

    listEnoughContent = [
    ]  # list of pages with sufficient content to proceed  ( > _CORPUS_MIN_TXT_SIZE bytes, a constant from aux.py)
    listNotEnoughContent = [
    ]  # list of pages with insufficient content to proceed

    # download not locally stored pages, scrap them, and save them
    for idx, page in enumerate(listWithoutDuplicates, start=1):

        print("(", idx, "of", lenOfListWithoutDuplicates, ") -- ", page)

        # scrapped pages will be stored classified by domain, in specific folders
        # currently, only "en.wikipedia.org" domain is used

        pageWithoutHTTP = page[2 +
                               page.find("//"):]  # strip the protocol prefix from the page URL
        domainFolder = pageWithoutHTTP[:pageWithoutHTTP.find("/")]

        if (not os.path.exists(_SCRAPPED_TEXT_PAGES_FOLDER + "/" +
                               domainFolder)
            ):  # create this domain folder if not exists
            os.makedirs(_SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder)

        # the pagename will be the name of the file, with the following change
        # dir1/dir2/page --> dir1..dir2..page.txt

        onlyPage = pageWithoutHTTP[1 + pageWithoutHTTP.find("/"):]
        onlyPageChanged = onlyPage.replace("/", "..")

        # add the file extension '.txt' to the page name for saving it
        # pageFinalName = page[1+page.rindex("/"):]
        fileNameCandidate = _SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder + "/" + onlyPageChanged + ".txt"

        if (os.path.exists(fileNameCandidate)):
            print("File already available in local DB:", fileNameCandidate)
            fsize = os.path.getsize(fileNameCandidate)
            if fsize < _CORPUS_MIN_TXT_SIZE:
                listNotEnoughContent.append(page)
            else:
                listEnoughContent.append(page)
        else:  # fetch the file if it does not exist locally
            try:  # retrieve the URL and get the page title and the scraped page content
                pageName, pageContent = scrap.scrapPage(
                    page)  # pageName result is not used
                nowDownloaded += 1
                _saveFile(fileNameCandidate, pageContent)  # Save to text file
                print("File", str(nowDownloaded), "downloaded and saved it:",
                      fileNameCandidate)

                if (len(pageContent) < _CORPUS_MIN_TXT_SIZE):
                    listNotEnoughContent.append(page)
                else:
                    listEnoughContent.append(page)
            except Exception as exc:
                _appendFile(
                    logFilename,
                    "Page " + page + " could not be retrieved: " + repr(exc))
                unretrieved_pages_list.append(page)

    # Save the unretrieved_pages_list to a file
    print("")
    print(str(len(unretrieved_pages_list)) + " unretrieved pages")
    _saveFile(_UNRETRIEVED_PAGES_FILENAME, '\n'.join(unretrieved_pages_list))

    lenListEnoughContent = len(listEnoughContent)

    _appendFile(
        logFilename, "Number of available pages with enough content: " +
        str(lenListEnoughContent))

    print("ALL PAGES AVAILABLE AND CLEANED.")
    print("New pages downloaded in this iteration:", str(nowDownloaded))
    print("Number of pages with enough content:", str(lenListEnoughContent))
    print("Number of pages without enough content:",
          str(len(listNotEnoughContent)))

    if aux.PSTOP == True:
        input("Type ENTER to continue...")

    # all the pages not already available have been now fetched and cleaned

    # # Create a new csv file if it does not exist. What does 'w+' mean? Temporarily disabled until it is clear what should be saved
    # with _Open(_SIMILARITIES_CSV_FILENAME, 'w+') as writeFile:
    # 	# Name columns
    # 	fieldnames = ['URL', 'Euclidean Distance', 'Spacy', 'Doc2Vec Euclidean Distance',
    # 	'Doc2Vec Cosine Similarity', 'Trained Doc2Vec Euclidean Distance', 'Trained Doc2Vec Cosine Similarity',
    # 	'Wikicats Jaccard Similarity']
    #
    # 	# Create csv headers
    # 	writer = csv.DictWriter(writeFile, fieldnames=fieldnames, delimiter=";")
    #
    # 	# Write the column headers
    # 	writer.writeheader()

    print("")
    print(
        "Identifying wikicats and subjects for candidate texts with DBpedia SpotLight..."
    )
    currentDownloaded = 0

    listWithWikicats = []  # list of pages with available wikicats
    listWithoutWikicats = []  # list of pages with no wikicats

    for idx, page in enumerate(listEnoughContent, start=1):
        print("\n(", idx, "of", lenListEnoughContent, ") -- ", page)

        # Build filenames for this page
        pageWithoutHTTP = page[2 + page.find("//"):]
        domainFolder = pageWithoutHTTP[:pageWithoutHTTP.find("/")]
        onlyPage = pageWithoutHTTP[1 + pageWithoutHTTP.find("/"):]
        onlyPageChanged = onlyPage.replace("/", "..")
        fileNameCandidateBase = _SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder + "/" + onlyPageChanged
        fileNameCandidate = fileNameCandidateBase + ".txt"
        fileNameCandidateWikicats = fileNameCandidateBase + ".wk"  # wikicats file for this page
        fileNameCandidateSubjects = fileNameCandidateBase + ".sb"  # subjects file for this page

        # if both files (wikicats and subjects) exist, use them from the local store
        if os.path.exists(fileNameCandidateWikicats) and os.path.exists(
                fileNameCandidateSubjects):
            print("Files WK and SB already available in local DB for",
                  fileNameCandidate)
            fwsize = os.path.getsize(fileNameCandidateWikicats)
            fssize = os.path.getsize(fileNameCandidateSubjects)
            # if one of these two files is empty (no wikicats or no subjects), this page will not be used
            if (fwsize == 0) or (fssize == 0):
                listWithoutWikicats.append(page)
            else:
                listWithWikicats.append(page)
        else:  # if either file does not exist, fetch the candidate text's wikicats and subjects from the Internet
            try:  # open and read text of candidate file
                candidateTextFile = _Open(fileNameCandidate, "r")
                candidate_text = candidateTextFile.read()
                print("Reading candidate text file:", fileNameCandidate)
            except:  # the file could not be read from the local store although it should be there; the page will not be used
                _appendFile(
                    logFilename,
                    "ERROR buildCorpus2(): Unavailable candidate file, not in the store, but it should be: "
                    + fileNameCandidate)
                listWithoutWikicats.append(page)
                continue

            print("Computing wikicats and subjects for:", page)
            candidate_text_categories = _getCategoriesInText(
                candidate_text
            )  # function _getCategoriesInText from px_DB_Manager

            if ("error" in candidate_text_categories
                ):  # error while fetching info, the page will not be used
                _appendFile(
                    logFilename,
                    "ERROR buildCorpus2(): Problem in _getCategoriesInText(candidate_text): "
                    + candidate_text_categories["error"])
                listWithoutWikicats.append(page)
                continue

            print("Wikicats and subjects downloaded for", fileNameCandidate)
            candidate_text_wikicats = list(
                filter(_filterSimpleWikicats,
                       candidate_text_categories["wikicats"])
            )  # remove simple wikicats with function from aux.py
            candidate_text_subjects = list(
                filter(_filterSimpleSubjects,
                       candidate_text_categories["subjects"])
            )  # remove simple subjects with function from aux.py

            _saveFile(fileNameCandidateWikicats,
                      '\n'.join(candidate_text_wikicats)
                      )  # save file with original text wikicats, one per line
            _saveFile(fileNameCandidateSubjects,
                      '\n'.join(candidate_text_subjects)
                      )  # save file with original text subjects, one per line
            currentDownloaded += 1

            # if no wikicats or no subjects, the page will not be used
            if (len(candidate_text_wikicats)
                    == 0) or (len(candidate_text_subjects) == 0):
                listWithoutWikicats.append(page)
            else:
                listWithWikicats.append(page)

    lenListWithWikicats = len(listWithWikicats)

    _appendFile(
        logFilename, "Number of available pages with wikicats and subjects: " +
        str(lenListWithWikicats))

    print("")
    print("ALL WIKICATs AND SUBJECTs COMPUTED.")
    print("New items computed in this iteration:", str(currentDownloaded))
    print("Number of pages with wikicats:", str(len(listWithWikicats)))
    print("Number of pages without wikicats:", str(len(listWithoutWikicats)))

    if aux.PSTOP == True:
        input("Type ENTER to continue...")

    print("\n Computing similarities...")

    discarded_pages_list = []  # a list to save discarded pages' URLs
    similarity = _textSimilarityFunctions(
    )  # Create a textSimilarityFunctions object to measure text similarities

    # variables to store results
    sims_wk_sb = [
    ]  # list of triplets (filenameCandidate, similarityByWikicats, similarityBySubjects)
    # similarity histograms: buckets "0".."9" are the deciles [0.0, 0.1), [0.1, 0.2), ..., [0.9, 1.0]
    distribution_wk = {str(i): 0 for i in range(10)}
    distribution_sb = {str(i): 0 for i in range(10)}

    # Measure text similarity, and discard pages (discarded_pages_list) without a minimum similarity
    for idx, page in enumerate(listWithWikicats, start=1):

        print("(", idx, "of", lenListWithWikicats, ") -- ", page)

        # Build filename for this page
        pageWithoutHTTP = page[2 + page.find("//"):]
        domainFolder = pageWithoutHTTP[:pageWithoutHTTP.find("/")]
        onlyPage = pageWithoutHTTP[1 + pageWithoutHTTP.find("/"):]
        onlyPageChanged = onlyPage.replace("/", "..")
        fileNameCandidateBase = _SCRAPPED_TEXT_PAGES_FOLDER + "/" + domainFolder + "/" + onlyPageChanged
        fileNameCandidate = fileNameCandidateBase + ".txt"
        fileNameCandidateWikicats = fileNameCandidateBase + ".wk"
        fileNameCandidateSubjects = fileNameCandidateBase + ".sb"

        # try:  # open and read local file if already exists
        # 	candidateTextFile = _Open(fileNameCandidate, "r")
        # 	pageContent = candidateTextFile.read()
        # 	print("Reading file:", fileNameCandidate)
        # except:  # file that could not be downloaded
        # 	print("ERROR buildCorpus2(): Unavailable file, not in the store, but it should be:", fileNameCandidate)
        # 	input("ENTER to continue...")
        # 	continue

        # Compare original text with the text of this candidate (in pageContent)
        # several criteria are now computed. THEIR RELEVANCE SHOULD BE STUDIED AS SOON AS POSSIBLE

        # Measure text similarity based on the Lee doc2vec model

        # doc2vec_cosineSimilarity, doc2vec_euclideanDistance = similarity.doc2VecTextSimilarity(originalText, pageContent, _LEE_D2V_MODEL)
        # print("Lee Doc2Vec CS = "+str(doc2vec_cosineSimilarity))
        # print("Lee Doc2Vec ED = "+str(doc2vec_euclideanDistance))
        #
        # # Measure text similarity based on the trained doc2vec model with our training corpus
        # doc2vec_trained_cosineSimilarity, doc2vec_trained_euclideanDistance = similarity.doc2VecTextSimilarity(originalText, pageContent, _OWN_D2V_MODEL)
        # print("Trained Doc2Vec CS = "+str(doc2vec_trained_cosineSimilarity))
        # print("Trained Doc2Vec ED = "+str(doc2vec_trained_euclideanDistance))
        #
        # # Measure the euclidean distance using SKLEARN
        # euclidean_distance = similarity.euclideanTextSimilarity(originalText, pageContent)
        # print("Euclidean distance = "+str(euclidean_distance))
        #
        # # Measure the spaCy distance
        # spacy_similarity = similarity.spacyTextSimilarity(originalText, pageContent)
        # print("Spacy similarity = "+str(spacy_similarity))

        # Measure wikicats similarity (requires complete matching)
        # wikicats_jaccard_similarity, subjects_jaccard_similarity = similarity.fullWikicatsAndSubjectsSimilarity(originalText, pageContent)
        # print("Wikicats full jaccard similarity = "+str(wikicats_jaccard_similarity))
        # print("Subjects full jaccard similarity = "+str(subjects_jaccard_similarity))

        # Measure wikicats similarity (requires shared matching)
        shared_wikicats_jaccard_similarity = similarity.sharedWikicatsSimilarity(
            selectedWikicats, fileNameCandidateWikicats, logFilename)
        print("Wikicats shared jaccard similarity = " +
              str(shared_wikicats_jaccard_similarity))

        shared_subjects_jaccard_similarity = similarity.sharedSubjectsSimilarity(
            sbOriginalText, fileNameCandidateSubjects, logFilename)
        print("Subjects shared jaccard similarity = " +
              str(shared_subjects_jaccard_similarity))

        sims_wk_sb.append(
            (fileNameCandidate, shared_wikicats_jaccard_similarity,
             shared_subjects_jaccard_similarity))

        # to compute distributions
        if shared_wikicats_jaccard_similarity == -1:
            _appendFile(
                logFilename, "ERROR computing sharedWikicatsJaccard: " +
                fileNameCandidateWikicats)
        else:
            if shared_wikicats_jaccard_similarity < 0.1:
                distribution_wk["0"] = distribution_wk["0"] + 1
            elif shared_wikicats_jaccard_similarity < 0.2:
                distribution_wk["1"] = distribution_wk["1"] + 1
            elif shared_wikicats_jaccard_similarity < 0.3:
                distribution_wk["2"] = distribution_wk["2"] + 1
            elif shared_wikicats_jaccard_similarity < 0.4:
                distribution_wk["3"] = distribution_wk["3"] + 1
            elif shared_wikicats_jaccard_similarity < 0.5:
                distribution_wk["4"] = distribution_wk["4"] + 1
            elif shared_wikicats_jaccard_similarity < 0.6:
                distribution_wk["5"] = distribution_wk["5"] + 1
            elif shared_wikicats_jaccard_similarity < 0.7:
                distribution_wk["6"] = distribution_wk["6"] + 1
            elif shared_wikicats_jaccard_similarity < 0.8:
                distribution_wk["7"] = distribution_wk["7"] + 1
            elif shared_wikicats_jaccard_similarity < 0.9:
                distribution_wk["8"] = distribution_wk["8"] + 1
            else:
                distribution_wk["9"] = distribution_wk["9"] + 1

        if shared_subjects_jaccard_similarity == -1:
            _appendFile(
                logFilename, "ERROR computing sharedSubjectsJaccard: " +
                fileNameCandidateSubjects)
        else:
            if shared_subjects_jaccard_similarity < 0.1:
                distribution_sb["0"] = distribution_sb["0"] + 1
            elif shared_subjects_jaccard_similarity < 0.2:
                distribution_sb["1"] = distribution_sb["1"] + 1
            elif shared_subjects_jaccard_similarity < 0.3:
                distribution_sb["2"] = distribution_sb["2"] + 1
            elif shared_subjects_jaccard_similarity < 0.4:
                distribution_sb["3"] = distribution_sb["3"] + 1
            elif shared_subjects_jaccard_similarity < 0.5:
                distribution_sb["4"] = distribution_sb["4"] + 1
            elif shared_subjects_jaccard_similarity < 0.6:
                distribution_sb["5"] = distribution_sb["5"] + 1
            elif shared_subjects_jaccard_similarity < 0.7:
                distribution_sb["6"] = distribution_sb["6"] + 1
            elif shared_subjects_jaccard_similarity < 0.8:
                distribution_sb["7"] = distribution_sb["7"] + 1
            elif shared_subjects_jaccard_similarity < 0.9:
                distribution_sb["8"] = distribution_sb["8"] + 1
            else:
                distribution_sb["9"] = distribution_sb["9"] + 1

        # # Save similarity to a CSV file
        # with _Open(_SIMILARITIES_CSV_FILENAME, 'a') as writeFile:
        # 	writer = csv.writer(writeFile, delimiter=';')
        # 	writer.writerow([page, euclidean_distance, spacy_similarity, doc2vec_euclideanDistance,
        # 	doc2vec_cosineSimilarity, doc2vec_trained_euclideanDistance, doc2vec_trained_cosineSimilarity, shared_wikicats_jaccard_similarity])

        # Minimum similarity for a page to be accepted.
        # WE MUST DECIDE THE MOST RELEVANT CRITERION TO USE HERE
        # currently, we use shared_wikicats_jaccard_similarity

    min_similarity = 0.3  # review this threshold

    both_above_min = list(
        filter(
            lambda triple: (
                (triple[1] > min_similarity) and (triple[2] > min_similarity)),
            sims_wk_sb))

    _appendFile(
        logFilename, "Number of pages with both similarities above " +
        str(min_similarity) + " = " + str(len(both_above_min)))
    print("Number of pages with both similarities above", min_similarity, "=",
          len(both_above_min))

    sims_wk_sb_str = list(
        map(
            lambda triple:
            (triple[0] + " " + str(triple[1]) + " " + str(triple[2])),
            sims_wk_sb))
    _saveFile(_CORPUS_FOLDER + "/" + str(lenOriginalText) + ".sims",
              '\n'.join(sims_wk_sb_str))

    result["distribution_wk"] = distribution_wk
    result["distribution_sb"] = distribution_sb

    # Save the discarded_pages_list to a file
    _saveFile(_DISCARDED_PAGES_FILENAME, '\n'.join(discarded_pages_list))
    # print(str(len(discarded_pages_list)) + " discarded pages")

    # print distributions: per-bucket count, bucket percentage and cumulative percentage
    def printDistribution(title, distribution):
        print("TOTAL " + title + " = ", lenListWithWikicats)
        cumulative = 0
        for bucket in map(str, range(10)):
            count = distribution[bucket]
            cumulative += count
            print("%s: %6d - %8.2f - %8.2f" %
                  (bucket, count, 100 * count / lenListWithWikicats,
                   100 * cumulative / lenListWithWikicats))

    printDistribution("WIKICATS", distribution_wk)
    printDistribution("SUBJECTS", distribution_sb)

    return jsonify(result)
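
buildCorpus2 reads its parameters from a Flask request and returns jsonify(result), so it is presumably registered as a Flask view; a hedged sketch of such a registration (the route path and app object are assumptions):

from flask import Flask

app = Flask(__name__)

# hypothetical wiring; the real project may use a different route, blueprint or method list
app.add_url_rule("/buildCorpus2", view_func=buildCorpus2, methods=["POST"])

if __name__ == "__main__":
    app.run()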