Code Example #1
def processS1File(filename):
    with open(filename, 'r') as content_file:
        content = content_file.read()  # the original content of the file is read

        # process the textual content
        # returns a 4-tuple (changed text, HTML text with highlighted changes, no-changes report, HTML no-changes report)
        result = _processContent(content)

        if result is None:
            _Print("no change")
            _saveFile(filename + ".s", content)  # store the unchanged result in a file with '.s' extension
            _saveFile(filename + ".s.html", content)  # store the unchanged result in an HTML report file
        else:
            _Print("some changes")
            _saveFile(filename + ".s", result[0])  # store the changed result in a file with '.s' extension
            _saveFile(filename + ".s.html", result[1])  # store the changed result in an HTML report file

            # store reports about studied changes that were finally not applied
            if result[2] != "":
                _saveFile(filename + ".s.nr", result[2])
                _saveFile(filename + ".s.nr.html", result[3])
    return
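_saveFile and _processContent are project helpers imported elsewhere in Plethora and are not shown here. As a rough idea of the writing side, a minimal sketch of what a helper like _saveFile presumably does (an assumption, not the project's actual code):

def _saveFile(filename, content):
    # hypothetical sketch: simply write the given string to the target file,
    # overwriting it if it already exists (the real Plethora helper may differ)
    with open(filename, 'w') as f:
        f.write(content)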
Code Example #2
File: textSimilarities.py Project: agilll/Plethora
	def sharedSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):

		try:  # try to read candidate text subjects from local DB
			with _Open(fileNameCandidateSubjects) as fp:
				candidate_text_subjects = fp.read().splitlines()
		except Exception as e:
			_Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
			return -1

		if len(candidate_text_subjects) == 0:
			return 0

		# the subjects lists for both texts are now available
		subjects_jaccard_similarity = 0

		try:
			# change every candidate subject by the pair (subject, list of subject components)
			pairs_candidate_text_subjects = list(map(lambda x: (x, _getSubjectComponents(x)), candidate_text_subjects))

			numContributions=0  # number of matches - contributions with some similarity
			sum_sims = 0  # to aggregate similarities contributions

			for (sbo,sbocl) in self.pairs_original_text_subjects:
				for (sbc,sbccl) in pairs_candidate_text_subjects:
					min_long = min(len(sbocl), len(sbccl)) # length of the shorter subject

					if (min_long < 3):  # both subjects must have at least 3 components
						continue

					intersection_cardinality = len(set.intersection(set(sbocl), set(sbccl)))

					# for the shorter subject, we require at most 1 component not to be included in the larger subject
					if (intersection_cardinality < (min_long - 1)):
						continue

					# this fulfills the requirements: it is a contribution

					numContributions += 1
					union_cardinality = len(set.union(set(sbocl), set(sbccl)))
					component_jaccard_similarity = intersection_cardinality/float(union_cardinality)
					sum_sims += component_jaccard_similarity
					_Print(numContributions, "->", sbo, ",", sbc, component_jaccard_similarity)

			if numContributions == 0:  # no pair of subjects was similar enough (no contribution at all)
				return 0

			subjects_jaccard_similarity = sum_sims / numContributions
		except Exception as e:
			_Print("ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): Exception while computing Jaccard subjects similarity: "+str(e))
			return -1

		if subjects_jaccard_similarity > 1:
			_Print("Candidate with subjects similarity > 1:", fileNameCandidateSubjects, sum_sims, denominator, subjects_jaccard_similarity)
			_appendFile(self.logFilename, "ERROR sharedSubjectsJaccardSimilarity(): similarity > 1")
			return -1

		return subjects_jaccard_similarity
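The contribution rule above only compares subject pairs in which both subjects have at least 3 components and the shorter one misses at most 1 component of the other. A standalone sketch of that per-pair test (function and variable names are illustrative, not from the project):

def componentJaccardIfShared(components_a, components_b):
    # illustrative re-statement of the per-pair test used in sharedSubjectsJaccardSimilarity
    min_len = min(len(components_a), len(components_b))  # length of the shorter subject
    if min_len < 3:                        # both subjects must have at least 3 components
        return None
    sa, sb = set(components_a), set(components_b)
    inter = len(sa & sb)
    if inter < min_len - 1:                # the shorter subject may miss at most 1 component
        return None
    return inter / float(len(sa | sb))     # Jaccard index of the two component sets

# Example: ['Battles', 'of', 'World', 'War', 'I'] vs ['Battles', 'of', 'World', 'War', 'II']
# share 4 of 6 distinct components, so the pair contributes 4/6 to sum_sims.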
Code Example #3
File: textSimilarities.py Project: agilll/Plethora
	def fullSubjectsJaccardSimilarity (self, fileNameCandidateSubjects):

		try:  # try to read candidate text subjects from local DB
			with _Open(fileNameCandidateSubjects) as fp:
				candidate_text_subjects = fp.read().splitlines()
		except Exception as e:
			_Print("Candidate subjects file not found in local DB:", fileNameCandidateSubjects)
			_appendFile(self.logFilename, "ERROR fullSubjectsJaccardSimilarity(): Candidate subjects file not found: "+fileNameCandidateSubjects+" "+str(e))
			return -1

		if len(self.original_text_subjects) == 0 or len(candidate_text_subjects) == 0:
			return 0

		subjects_jaccard_similarity = self.oMeasures.oJaccardSimilarity(self.original_text_subjects, candidate_text_subjects)

		return subjects_jaccard_similarity
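The actual measure is delegated to self.oMeasures.oJaccardSimilarity, whose implementation is not shown here. Assuming it computes a plain set-based Jaccard index over the two subject lists, a minimal sketch would be:

def oJaccardSimilarity(subjects_a, subjects_b):
    # assumed behaviour: plain Jaccard index of the two subject sets
    # (the real oMeasures implementation may differ)
    sa, sb = set(subjects_a), set(subjects_b)
    if not sa or not sb:
        return 0
    return len(sa & sb) / float(len(sa | sb))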
Code Example #4
File: S4_tokenize.py Project: agilll/Plethora
def processS4List(fileList, foldername):

	print("\nS4: Processing list of .w files ")

	if not foldername.endswith("/"):
		foldername = foldername+"/"

	t_folder = foldername + _T_FOLDER
	if not os.path.exists(t_folder):
		os.makedirs(t_folder)

	numFiles = 0
	numProcessed = 0

	for wFullFilename in fileList:
		if not wFullFilename.endswith(".w"):
			continue

		numFiles += 1
		_Print(numFiles, "**************** Processing file ", wFullFilename)

		final_name = wFullFilename[(1+wFullFilename.rfind("/")):]
		tFullFilename = t_folder+final_name+".t"

		if os.path.exists(tFullFilename):
			_Print("T file already available in local DB: "+tFullFilename)
			continue

		_Print("Creating .t file: "+tFullFilename)
		result = processOneFile(wFullFilename)
		with open(tFullFilename, "wb") as tfp:
			pickle.dump(result, tfp)

		numProcessed += 1

	return numProcessed
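The .t files written above are plain pickles, so reading one back is symmetric. A minimal sketch (the path is illustrative):

import pickle

# illustrative: load a previously created .t file back into memory
with open("CORPUS/t_folder/example.w.t", "rb") as tfp:   # hypothetical path
    tokens = pickle.load(tfp)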
Code Example #5
def processS3List(fileList):

    print("\nS3: Processing list of .s files")

    numFiles = 0
    numProcessed = 0

    for sFullFilename in fileList:
        if not sFullFilename.endswith(".s"):
            continue

        numFiles += 1
        _Print(numFiles, "**************** Processing file ", sFullFilename)

        if os.path.exists(sFullFilename + ".w"):
            _Print("W file already available in local DB: " + sFullFilename +
                   ".w")
            continue

        _Print("Creating .w file: " + sFullFilename + ".w")

        pfullfilename = sFullFilename + ".p"
        result = getContentAfterChanges(sFullFilename, pfullfilename)

        # save result in files with the same name and extension '.w'
        _saveFile(sFullFilename + ".w", result[0])
        _saveFile(sFullFilename + ".w.html", result[1])

        highlightedContent = _getContentMarked(sFullFilename + ".w", "w")
        _saveFile(sFullFilename + ".w.p.html", highlightedContent)

        numProcessed += 1

    return numProcessed
Code Example #6
def generateAgregate(foldername):
    print("\nCreating aggregation " + foldername + ".s...")
    numFiles = 0
    global_content = ""

    spw_folder = foldername + _SPW_FOLDER

    if not os.path.exists(spw_folder):
        print(spw_folder, "not found!")
        return -1

    for filename in sorted(os.listdir(spw_folder)):  # files are ordered alphabetically by file name
        if not filename.endswith(".s"):  # only .s files are joined
            continue
        else:
            numFiles += 1
            _Print(numFiles, "====================", filename)
            with open(spw_folder + "/" + filename, 'r') as content_file:
                content = content_file.read()
                global_content += content

    _saveFile(foldername + "/" + foldername + ".s", global_content)
    return
Code Example #7
def processS2File(source, confidence=0.5, support=1):
    if not source.endswith(".s"):
        message = source + " has not '.s' extension"
        print(message)
        raise Exception(message)

    if not os.path.exists(source):
        message = source + " not found!"
        print(message)
        raise Exception(message)

    _Print("Processing file " + source + "...\n")
    try:
        entities = findEntities(source, confidence, support)
        with open(source + ".p", "wb") as pfp:
            pickle.dump(entities, pfp)

        highlightedContent = _getContentMarked(source, 's')
        _saveFile(source + ".p.html", highlightedContent)
    except Exception as e:
        message = "Problem detecting entities: " + str(e)
        print(message)
        raise Exception(message)

    return 0
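processS2File raises an exception on any failure, so callers are expected to wrap it. A minimal usage sketch (the file name is illustrative):

# illustrative call; the .s path is hypothetical
try:
    processS2File("CORPUS/example.txt.s", confidence=0.5, support=1)
except Exception as e:
    print("S2 failed:", e)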
Code Example #8
def findEntities(filename, confPar, supPar):

    with open(filename, 'r') as content_file:
        content = content_file.read()

    # DB-SL is queried for the preferred entity for each candidate detected in the file
    # see section 6.4 of the document describing the architecture for the formats of request and answer
    dbsl_response = requests.post(_URL_DB_SL_annotate,
                                  data={
                                      "text": content,
                                      "confidence": confPar,
                                      "support": supPar
                                  },
                                  headers={
                                      "accept": "application/json",
                                      "content-type": "application/x-www-form-urlencoded"
                                  })

    if (dbsl_response.status_code != 200):
        raise Exception("DBpedia SpotLight connection error: " +
                        _URL_DB_SL_annotate)

    # the previous call is synchronous: it only returns after receiving the answer, which is now parsed as JSON
    try:
        dbsl_json = dbsl_response.json()
        dbsl_json["Resources"]  # if no entity is detected, accessing this key raises an exception
    except Exception:
        _Print("No entity detected in the file")
        return {'byUri': {}, 'byType': {}, 'byOffset': {}}

    _Print("Detected", len(dbsl_json["Resources"]), "entities")

    # create a _DBManager instance to parse the results
    dbpediaManager = _DBManager()
    dbpediaManager.scanEntities(dbsl_json)
    allDicts = dbpediaManager.getDictionaries()

    byUri = allDicts["byUri"]
    byType = allDicts["byType"]
    byOffset = allDicts["byOffset"]
    byuriplana = [item for sublist in byUri.values() for item in sublist]
    _Print(len(byUri.keys()), len(byuriplana), len(byType.keys()),
           len(byOffset.keys()))

    return allDicts
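The JSON returned by the annotate endpoint nests the detected entities under the "Resources" key; each resource carries fields such as "@URI", "@surfaceForm" and "@offset" (the same keys used by the other examples). A small sketch of walking that structure, independent of _DBManager:

def listDetectedEntities(dbsl_json):
    # illustrative helper: print the entities found in a DBpedia Spotlight response
    for resource in dbsl_json.get("Resources", []):
        uri = resource["@URI"]               # DBpedia URI of the detected entity
        surface = resource["@surfaceForm"]   # text span matched in the document
        offset = int(resource["@offset"])    # character offset of the match
        print(offset, surface, "->", uri)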
Code Example #9
def processS2List(fileList, confidence=0.5, support=1):

    print("\nS2: Processing list of .s files")

    numFiles = 0
    numProcessed = 0

    for sFullFilename in fileList:
        if not sFullFilename.endswith(".s"):
            continue

        numFiles += 1
        _Print(numFiles, " S2: Processing file ", sFullFilename)

        if os.path.exists(sFullFilename + ".p"):
            _Print("P file already available in local DB: " + sFullFilename +
                   ".p")
            continue

        _Print("Creating .p file: " + sFullFilename + ".p")

        try:
            entities = findEntities(sFullFilename, confidence, support)
        except Exception as ex:
            print(
                "processS2List: findEntities raised exception (" + str(ex) +
                ")! Could not process ", sFullFilename)
            input("Continue?")
            continue

        time.sleep(2)
        with open(sFullFilename + ".p", "wb") as pfp:
            pickle.dump(entities, pfp)

        highlightedContent = _getContentMarked(sFullFilename, "s")
        _saveFile(sFullFilename + ".p.html", highlightedContent)

        numProcessed += 1

    return numProcessed
Code Example #10
def processS1List(foldername, fileList):

    print("\nS1: Processing list of .txt files to folder " + foldername)

    if not foldername.endswith("/"):
        foldername = foldername + "/"

    if not os.path.exists(foldername):  # create the CORPUS folder for output files if it does not exist
        os.makedirs(foldername)

    # create the folder to store output files if it does not exist
    spw_folder = foldername + _SPW_FOLDER
    if not os.path.exists(spw_folder):
        os.makedirs(spw_folder)

    numFiles = 0
    numProcessed = 0
    for filename in fileList:
        numFiles += 1
        _Print(numFiles, " S1: Processing file ", filename)

        final_name = filename[(1 + filename.rfind("/")):]
        basename = spw_folder + final_name

        if os.path.exists(basename + ".s"):  # this is case insensitive
            _Print("S file already available in local DB: " + basename + ".s")
            continue

        _Print("Creating .s file: " + basename + ".s")

        with open(filename, 'r') as content_file:
            content = content_file.read()  # the original content of the file is read

            # process the textual content
            # returns a 4-tuple (changed text, HTML text with highlighted changes, no-changes report, HTML no-changes report)
            result = _processContent(content)

            if result is None:
                _Print("no change, saving .s", basename + ".s")
                _saveFile(basename + ".s", content)  # store the unchanged result in a file with '.s' extension
                _saveFile(basename + ".s.html", content)  # store the unchanged result in an HTML report file
            else:
                _Print("changes, saving .s", basename + ".s")
                _saveFile(basename + ".s", result[0])  # store the changed result in a file with '.s' extension
                _saveFile(basename + ".s.html", result[1])  # store the changed result in an HTML report file

                # store reports about studied changes that were finally not applied
                if result[2] != "":
                    _saveFile(basename + ".s.nr", result[2])
                    _saveFile(basename + ".s.nr.html", result[3])

        numProcessed += 1

    return numProcessed
Code Example #11
File: px_aux_add_suffix.py Project: agilll/Plethora
def processContent(content):

    # this is an offset dict, keys are offsets, values the detected matches (object with name and EVENT) in that offset
    wordsWithNumber = {}

    # this is a dict of names, keys are name of matches, values are sets with all the different EVENTS involving such name
    allSubstitutions = {}

    # first pass to content, to detect and store EVENTS

    # match is every case finding EVENTS
    for match in reg_WordWithRomanNumber.finditer(content):
        offset = match.start()  # the position of the EVENT
        # match.groups provides the different parts of the match
        tuplaGroups1 = match.groups()[:-1]  # remove the non-alphanumeric char ending the sequence
        tuplaGroups2 = (tuplaGroups1[0], " ") + tuplaGroups1[1:]  # add a ' ' after the name
        newFullWord = "".join(tuplaGroups2)  # join all parts, result is "name romannumber"
        _Print(match.groups(), " --> ", tuplaGroups2, " --> ", newFullWord)

        word = match.group(1)  # the name
        # add the match to the dict of offsets, key the offset, value an object with the name (word) and the match (fullWord)
        wordsWithNumber[offset] = {
            "fullWord": newFullWord.strip(),
            "word": word
        }  # strip removes spaces before and after the string

        # add the EVENT to the dict of names, a set with all the EVENT of such name
        if word not in allSubstitutions:
            allSubstitutions[word] = {newFullWord}
        else:
            allSubstitutions[word].add(newFullWord)

    _Print('\n-------- Results of the first pass --------')

    _Print("There are", len(wordsWithNumber),
           "EVENTS, proper names followed by an space and a roman number")

    # EVENTS have been detected. Let's go with changes

    # second pass to the content, to make modifications (changes after EVENT or before for unique NAMEs)
    # we study the content from offset to offset, and make modifications depending on current substitutions

    offsets = list(wordsWithNumber.keys())  # list of offsets of every EVENT
    offsets.sort()  # ascending order

    negReportTxt = ""  # text negative report (transformations not done)
    negReportHtml = ""  # html negative report

    if len(offsets) == 0:
        return None

    # dict to store current substitutions; key is the name, value is an object with the EVENT and its offset (not strictly needed)
    currentSubstitutions = {}

    # start with the initial text, before the first EVENT
    initial = content[0:offsets[0]]

    # convert the string to a list of words separated by non-alphanumeric chars
    # because of the capturing parentheses, the separator groups are also returned, that is,
    # words is a list containing everything in the string: the words and, between them, the non-alphanumeric groups separating them
    words = re.split(r'(\W+)', initial)
    wordsHTML = re.split(r'(\W+)', initial)  # the same for the HTML result

    for j in range(0, len(words)):
        if words[j] in allSubstitutions:
            if len(allSubstitutions[words[j]]) == 1:
                sustituto = list(allSubstitutions[words[j]])[0]  # get the only set member
                if changeAccordingContext(words, j):
                    # change is done
                    words[j] = sustituto
                    wordsHTML[j] = "<span style='color: green'><b>" + sustituto + "</b></span>"
                else:
                    # change is not done, and it is annotated in the negative report
                    negReportTxt += "Before: (" + sustituto + ")  -->  **"
                    negReportTxt += buildSecureReject(words, j)[0] + "**\n"
                    negReportHtml += "Before: (" + sustituto + ")  -->  **"
                    negReportHtml += buildSecureReject(words, j)[1] + "**<p>"

    # 'finalContent' contains the text after processing and  changes
    finalContent = "".join(words)
    finalContentHTML = "".join(wordsHTML).replace("\n", "<p>")

    # let's go with rest of text after the first offset

    for i in range(0, len(offsets)):  # range(0,n) goes from 0 to n-1
        o = wordsWithNumber[offsets[i]]  # the object of the i-th event; we study the text from here to the following one

        # add the event to the final content and to the dict of current substituions
        finalContent += o["fullWord"]
        finalContentHTML += o["fullWord"]

        currentSubstitutions[o["word"]] = {
            "sub": o["fullWord"],
            "offset": offsets[i]
        }  # if that word already existed, it is changed by the new one

        # find the limit of the text fragment, from the current event to the following one (or the end of the text)
        if i + 1 == len(offsets):
            limit = len(content)  # this was the last event, so the limit is the length of the original text
        else:
            limit = offsets[i + 1]  # the limit is the offset of the following event

        # take the string from the current offset + len(added) to the limit
        currentSubstring = content[offsets[i] + len(o["fullWord"]):limit]

        # convert that string to a list of words separated by non-alphanumeric chars
        # because of the capturing parentheses, the separator groups are also returned, that is,
        # words is a list containing everything in the string: the words and, between them, the non-alphanumeric groups separating them
        words = re.split(r'(\W+)', currentSubstring)
        wordsHTML = re.split(r'(\W+)', currentSubstring)

        # study each one of the words in list, that is, of the string after the current event to limit
        for j in range(0, len(words)):
            # if current word is in the list of substitutions, change it
            if words[j] in currentSubstitutions:
                sustituto = currentSubstitutions[words[j]]["sub"]
                if changeAccordingContext(words, j):
                    # change is done
                    words[j] = sustituto
                    wordsHTML[j] = "<span style='color: green'><b>" + sustituto + "</b></span>"
                else:
                    # change is not done, and it is annotated in the negative report
                    negReportTxt += "After: (" + sustituto + ")  -->  **"
                    negReportTxt += buildSecureReject(words, j)[0] + "**\n"
                    negReportHtml += "After: (" + sustituto + ")  -->  **"
                    negReportHtml += buildSecureReject(words, j)[1] + "**<p>"
            else:
                if words[j] in allSubstitutions:
                    if len(allSubstitutions[words[j]]) == 1:
                        sustituto = list(allSubstitutions[words[j]])[0]
                        if changeAccordingContext(words, j):
                            # change is done
                            words[j] = sustituto
                            wordsHTML[j] = "<span style='color: green'><b>" + sustituto + "</b></span>"
                        else:
                            # change is not done, and it is annotated in the negative report
                            negReportTxt += "Before: (" + sustituto + ")  -->  **"
                            negReportTxt += buildSecureReject(words, j)[0] + "**\n"
                            negReportHtml += "Before: (" + sustituto + ")  -->  **"
                            negReportHtml += buildSecureReject(words, j)[1] + "**<p>"

        # rebuild the studied fragment from the list of words, and add it to the final content
        finalContent += "".join(words)
        finalContentHTML += "".join(wordsHTML).replace("\n", "<p>")

    return (finalContent, finalContentHTML, negReportTxt, negReportHtml)
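Both passes rely on re.split with a capturing group: because the separators are captured, they are kept in the resulting list, so individual words can be replaced in place and the list joined back without losing the original punctuation and spacing. A short demonstration:

import re

# the capturing group keeps the separators, so "".join() restores the text exactly
parts = re.split(r'(\W+)', "Henry IV, king of England")
# parts == ['Henry', ' ', 'IV', ', ', 'king', ' ', 'of', ' ', 'England']
print("".join(parts) == "Henry IV, king of England")   # True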
Code Example #12
def getCategoriesInText(texto):
    import requests
    from px_aux import URL_DB_SL_annotate as _URL_DB_SL_annotate

    result = {}

    # make a query to DB-SL with texto
    try:
        objParams = {"text": texto, "confidence": 0.5, "support": 1}
        # annotateTextRequest = requests.get(_URL_DB_SL_annotate, params=objParams, headers={"Accept": "application/json"})
        annotateTextRequest = requests.post(
            _URL_DB_SL_annotate,
            data=objParams,
            headers={"Accept": "application/json"})
    except Exception as e:
        print("ERROR getCategoriesInText(): Problem querying DB-SL", str(e))
        result["error"] = "Problem querying DB-SL --> " + str(e)
        return result

    try:
        dbpediaText = annotateTextRequest.json()
    except Exception as e:
        print("ERROR getCategoriesInText(): Problem jsoning DB-SL response:",
              annotateTextRequest.content)
        result[
            "error"] = "Problem with json DB-SL response: the query does not return the expected JSON --> " + str(
                e)
        return result

    dbpediaManager = DBManager()

    try:
        dbpediaManager.scanEntities(dbpediaText)
    except Exception as e:
        print("ERROR getCategoriesInText(): Problem scanning DB-SL response: error in scanEntities --> " + str(e))
        result["error"] = "Problem with DB-SL: error in scanEntities --> " + str(e)
        return result

    entities = dbpediaManager.getEntitiesAfterOffset(0)

    if len(entities) == 0:
        print("Warning getCategoriesInText(): No entities in text")

    print("\nInitially, there are entities:", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # filter duplicated entities (same entity identified in different parts of the text)
    uniqueEntities = []
    for entity in entities:
        if entity["@URI"] in list(map(lambda x: x["@URI"], uniqueEntities)):
            continue
        uniqueEntities.append(entity)

    entities = uniqueEntities

    print("\nBut unique entities: ", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # filter entities probably erroneously identified
    # a right entity is required to share wikicats with some other entity in the set
    _Print("\nFiltering by wikicats sharing")
    rightEntities = []
    for entity in entities:
        wki = entity["wikicats"]  # wikicats of this entity
        wkic = []  # wikicats of all the entities in the set except this one
        for ej in entities:
            if entity["@URI"] == ej["@URI"]:
                continue

            wkic.extend(ej["wikicats"])
        intersec = set(wkic).intersection(wki)  # is there intersection?
        if len(intersec) > 0:
            rightEntities.append(entity)
        else:
            _Print("Discarded entity: ", entity["@URI"])

    entities = rightEntities

    print("\nAfter the filtering by wikicat sharing there are:", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # a right entity is required to share subjects with some other entity in the set
    _Print("\nFiltering by subject sharing")
    rightEntities = []
    for entity in entities:
        sbi = entity["subjects"]  # subjects of this entity
        sbic = []  # subjects of all entities except this
        for ej in entities:
            if entity["@URI"] == ej["@URI"]:
                continue
            sbic.extend(ej["subjects"])
        intersec = set(sbic).intersection(sbi)  # is there intersection?
        if len(intersec) > 0:
            rightEntities.append(entity)
        else:
            _Print("Discarded entity: ", entity["@URI"])

    entities = rightEntities

    print("\nAfter the filtering by subject sharing there are", len(entities))
    for entity in entities:
        _Print(entity["@URI"])

    # to return the list of the wikicats of the entities identified in the text
    wikicats = []
    for entity in entities:
        wikicats.extend(entity["wikicats"])

    setWikicats = list(set(wikicats))  # removes duplicates
    result["wikicats"] = setWikicats

    # from collections import Counter
    # counts = Counter(wikicats)
    # repetidas = [value for value, count in counts.items() if count > 1]
    #
    # print("\nwikicats repetidas = ", repetidas)
    # print("\nwikicats unicas = ", set(wikicats)-set(repetidas))

    # to return the list of the subjects of the entities identified in the text
    subjects = []
    for entity in entities:
        subjects.extend(entity["subjects"])

    subjects = list(set(subjects))  # removes duplicates
    result["subjects"] = [
        s.split(':')[-1] for s in subjects
    ]  # original format = http://dbpedia.org/resource/Category:Ionian_Revolt, change to Ionian_Revolt

    # to return the list of URIs of the entities identified in the text
    uris = []
    for entity in entities:
        listTypes = entity["combinedTypes"]
        if ("Person" in listTypes):
            uris.append(entity["@URI"])
            continue
        if ("Location" in listTypes) or ("Place" in listTypes) or (
                "City" in listTypes) or ("Country" in listTypes):
            uris.append(entity["@URI"])
            continue
        if "Event" in listTypes:
            uris.append(entity["@URI"])
            continue

    uris = list(set(uris))  # removes duplicates
    result["URIs_persons_places_events"] = uris

    return result
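A minimal usage sketch of getCategoriesInText (the input text is illustrative): on success the result carries the three lists built above, on failure an "error" key.

# illustrative call with a toy text
info = getCategoriesInText("The Ionian Revolt was a rebellion against the Persian Empire.")
if "error" in info:
    print("DB-SL problem:", info["error"])
else:
    print(len(info["wikicats"]), "wikicats,", len(info["subjects"]), "subjects")
    print(info["URIs_persons_places_events"])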
Code Example #13
def getContentAfterChanges(sfilename, pfilename):

    finalContent = ""
    finalHTMLContent = ""

    if not os.path.isfile(pfilename):
        print("getContentAfterChanges: " + pfilename + " not found!!")
        input("Continue?")
        return (finalContent, finalHTMLContent)

    with open(sfilename, 'r') as sfile:
        content = sfile.read()
    with open(pfilename, 'rb') as pfile:
        dicsEntities = pickle.load(pfile)

    currentPosition = 0  # marks the position in the original file

    offsets = list(dicsEntities["byOffset"].keys())
    if offsets == []:
        return (content, content)

    # new offset for every entity identified in the .w file, it is necessary to correct it wrt the .s as we change the text of the file
    nuevoOffset = 0  # marks the position in the result file
    # new dict byOffset with the updated offsets. NECESSARY?? maybe it is possible to update the old one directly
    newByOffset = {}

    # iteration follows the input order in dict, that it is the offset one from low to high
    for i in range(len(offsets)):
        o = offsets[i]
        entity = dicsEntities["byOffset"][o]

        if o != entity["@offset"]:
            _Print(
                o,
                "the offset index is different from the one included in the entity"
            )

        sf = entity["@surfaceForm"]
        nameEntity = entity["entityName"]

        text = content[currentPosition:int(o)]
        currentPosition += len(text)

        finalContent += text
        nuevoOffset += len(text)
        entity["@offset"] = nuevoOffset  # update offset
        # entity["@surfaceForm"] = nameEntity  # no actualizamos la surfaceForm, para conservarla. El ancla en el texto debe ser a partir de ahora entity["entityName"]
        newByOffset[nuevoOffset] = entity  # and save it in the new dict

        finalHTMLContent += text.replace("\n", "\n<br>")

        finalContent += nameEntity  # the entity name is copied in the output file
        nuevoOffset += len(nameEntity)

        # in the HTML file, write in blue if not modified, and in struck-through blue followed by green if modified
        if sf == nameEntity:
            finalHTMLContent += "<span style='color: blue'><b>" + nameEntity + "</b></span>"
        else:
            finalHTMLContent += "<span style='color: blue; text-decoration:line-through'>" + sf + "</span> <span style='color: green'><b>" + nameEntity + "</b></span>"

        # Now see how much to advance in the original file

        nameEntitySpaced = nameEntity.replace("_", " ")  # split the entity name into words

        # if equal,  advance the length
        if sf == nameEntitySpaced:
            currentPosition += len(sf)
        else:
            # if the sf is not a prefix of the entity name, continue processing the .s file from the end of the sf
            if not nameEntitySpaced.startswith(sf):
                currentPosition += len(sf)
            # if the sf is a prefix of the entity name, check whether the following chars contain the entity name
            else:
                # nameEntitySpacedRemaining = nameEntitySpaced[len(sf):]   # the rest of the entity name after the surface form
                # nextContent = content[currentPosition+len(sf):currentPosition+len(sf)+80]  # what follows the surface form in the original file

                # wordsSF = sf.split()
                # if len(wordsSF) > 1:
                # 	leadingSF = " ".join(wordsSF[0:-1])+" "
                # 	finalContent += leadingSF
                # 	currentPosition += len(leadingSF)

                nextContent = content[currentPosition:currentPosition + 80]
                if nextContent.startswith(nameEntitySpaced):  # if the following chars include the name of the entity, we skip over it
                    advanceTo = currentPosition + len(nameEntity)
                    if i + 1 < len(offsets):
                        if advanceTo > int(offsets[i + 1]):
                            currentPosition += len(sf)
                        else:
                            currentPosition += len(nameEntity)
                    else:
                        currentPosition += len(sf)
                else:
                    currentPosition += len(sf)

    dicsEntities["byOffset"] = newByOffset  # substitute the new byOffset

    # update byUri and byType from the byOffset
    (nu, nt) = rebuild(newByOffset)

    dicsEntities["byUri"] = nu
    dicsEntities["byType"] = nt
    with open(sfilename + ".w.p", "wb") as wpfp:
        pickle.dump(dicsEntities, wpfp)

    return (finalContent, finalHTMLContent)
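rebuild() is a project helper that is not shown here. Assuming it simply regroups the corrected byOffset entries into the byUri and byType dictionaries used elsewhere (entities listed under their "@URI" and under each of their types), a rough, hypothetical sketch might look like:

def rebuild(byOffset):
    # hypothetical sketch: regroup offset-indexed entities by URI and by type;
    # the real Plethora helper may use different keys or structures
    byUri, byType = {}, {}
    for entity in byOffset.values():
        byUri.setdefault(entity["@URI"], []).append(entity)
        for t in entity.get("combinedTypes", []):
            byType.setdefault(t, []).append(entity)
    return (byUri, byType)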