def getJobsZackExtracted(self, jobFilePath, outputPath=None, to_remove=to_remove, pattrn=pattrn):
    ''' Extract the best job titles according to their co-reference, in
    decreasing order of ngram size:
    chief executive officer IS BETTER THAN chief officer IS BETTER THAN officer

    :param jobFilePath: path to a utf8 file with one job title per line
    :param outputPath: if not None, the resulting set is also dumped there
    :param to_remove: tokens/patterns to discard (module-level default)
    :param pattrn: tokenization pattern (module-level default)
    :return: set of best job-title strings (the "<unk>" placeholder is skipped)
    '''
    setOfJobs = set()
    # count jobs co-reference (counting the (1-4)-gram token words in the job title)
    ngram_counts = self.getNgram_counts(jobFilePath, to_remove, pattrn)
    # get best possibility
    with codecs.open(jobFilePath, 'r', encoding='utf8') as openedFile:
        for jobTitle in openedFile:
            # BUGFIX: the original first called self.get_best(...) and then
            # immediately overwrote its result with get_best_modified(...);
            # the dead call has been removed
            bestOption = self.get_best_modified(jobTitle, to_remove, pattrn, ngram_counts)
            # add the 'best' job name to the final set
            if bestOption != "<unk>":
                setOfJobs.add(bestOption)
    # dump the output if the output path is specified
    if outputPath is not None:
        utilsOs.dumpRawLines(setOfJobs, outputPath, addNewline=True, rewrite=True)
    return setOfJobs
def saveNotFlaggedList():
    """Dump the anonymized paths of the NOT-FLAGGED files into an external list file."""
    rawPathList = getFilePathsLists([u'NOT-FLAGGED'])
    # anonymize every path before saving it
    anonymizedPaths = []
    for onePath in rawPathList:
        anonymizedPaths.append(b000path.anonymizePath(onePath))
    utilsOs.dumpRawLines(
        anonymizedPaths,
        u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/files.paths'
    )
    return None
def mergeAnnotatedFiles(pathToPrimary, pathOrListOfPathsToSecondary):
    """ Merge one or several secondary annotation folders into the primary one.

    :param pathToPrimary: path to the primary sampleAnnotation.tsv file or its folder
    :param pathOrListOfPathsToSecondary: one path (string) or a list of paths to
        the secondary annotation files/folders to merge into the primary
    """
    def dividePaths(pathAnnotFile):
        # accept either the annotation file path or its containing folder
        if u'sampleAnnotation.tsv' in pathAnnotFile:
            pathFolder = pathAnnotFile.replace(u'sampleAnnotation.tsv', u'')
        else:
            pathFolder = pathAnnotFile
            pathAnnotFile = u'{0}sampleAnnotation.tsv'.format(pathAnnotFile)
        return pathAnnotFile, pathFolder
    # get the path to the primary folder
    pathToPrimary, primaryFolder = dividePaths(pathToPrimary)
    # make secondary a list if it is a string (isinstance instead of type-is)
    if isinstance(pathOrListOfPathsToSecondary, str):
        pathOrListOfPathsToSecondary = [pathOrListOfPathsToSecondary]
    # open primary
    primaryRefPath = u'{0}sampleReference.tsv'.format(primaryFolder)
    primaryAnnotDf, primaryRefDf = utilsOs.getDataFrameFromArgs(pathToPrimary, primaryRefPath, header=False)
    primaryEnPath = u'{0}sample.en'.format(primaryFolder)
    primaryFrPath = u'{0}sample.fr'.format(primaryFolder)
    primaryEnDf, primaryFrDf = utilsOs.getDataFrameFromArgs(primaryEnPath, primaryFrPath, header=False)
    # open the secondaries and merge
    for secondaryPath in pathOrListOfPathsToSecondary:
        pathToSec, secFolder = dividePaths(secondaryPath)
        # open secondary dataframe
        secAnnotDf, secRefDf = utilsOs.getDataFrameFromArgs(
            pathToSec, u'{0}sampleReference.tsv'.format(secFolder), header=False)
        secEnDf, secFrDf = utilsOs.getDataFrameFromArgs(
            u'{0}sample.en'.format(secFolder), u'{0}sample.fr'.format(secFolder), header=False)
        # concatenate the primary with the secondary
        primaryAnnotDf = utilsOs.concatenateDfsOrSeries([primaryAnnotDf, secAnnotDf])
        primaryRefDf = utilsOs.concatenateDfsOrSeries([primaryRefDf, secRefDf])
        primaryEnDf = utilsOs.concatenateDfsOrSeries([primaryEnDf, secEnDf])
        primaryFrDf = utilsOs.concatenateDfsOrSeries([primaryFrDf, secFrDf])
    # dump in the primary's path
    utilsOs.dumpDataFrame(primaryAnnotDf, pathToPrimary, header=False)
    utilsOs.dumpDataFrame(primaryRefDf, primaryRefPath, header=False)
    utilsOs.dumpDataFrame(primaryEnDf, primaryEnPath, header=False)
    utilsOs.dumpDataFrame(primaryFrDf, primaryFrPath, header=False)
    # bug fix to avoid the 1.0 and 0.0 transforming into 1 and 0 on dump:
    # rewrite bare integer annotation lines as floats
    with open(pathToPrimary) as annotFile:
        annotLines = annotFile.readlines()
    for aIndex, aLine in enumerate(annotLines):
        # the comparison is an exact match, so a direct assignment is equivalent
        # to the original replace() call
        if aLine == u'1\n':
            annotLines[aIndex] = u'1.0\n'
        elif aLine == u'0\n':
            annotLines[aIndex] = u'0.0\n'
    utilsOs.dumpRawLines(annotLines, pathToPrimary, addNewline=False)
def randomlyExtractAndDump(extractedSp, extractionSize, subsetName):
    """ Given a dict with all the heuristically extracted lines, randomly pick up
    to extractionSize lines per extractor type and dump them.

    :param extractedSp: dict {extractorType: {filePath: [lines]}}
    :param extractionSize: maximum number of lines to extract per extractor type
    :param subsetName: suffix used to build the output file names
    :return: the "path\\tline" list of the LAST extractor type processed
    """
    outputDict = {
        0: u'./003negativeNaiveExtractors/numberCoincidence/random100Nb{0}.tsv'.format(subsetName),
        1: u'./003negativeNaiveExtractors/fewTokens/random100few{0}.tsv'.format(subsetName),
        2: u'./003negativeNaiveExtractors/cognates/random100cog{0}.tsv'.format(subsetName)}
    # initialized here so the final return works even on an empty extractedSp
    dejaVu = []
    for extrType, fileDict in extractedSp.items():
        # maintain a census of which entries we have already used
        dejaVu = []
        # count the total lines
        print(u"- EXTRACTION TYPE : ", extrType, u'NUMBER OF FILES : ', len(fileDict))
        nbLines = 0
        for path, lineList in fileDict.items():
            nbLines += len(lineList)
        print(u'\tNUMBER OF EXTRACTED LINES : ', nbLines)
        dictPaths = list(fileDict.keys())
        # we stop if we achieve our limit
        while len(dejaVu) < extractionSize:
            # get the file path index if it's empty then abort
            rdmFileIndex = getRandomIndex(dictPaths)
            if rdmFileIndex is None:
                break
            # get the list of the lines
            lineList = fileDict[dictPaths[rdmFileIndex]]
            rdmLineIndex = getRandomIndex(lineList)
            # if it's empty, abort
            if rdmLineIndex is None:
                break
            # BUGFIX: the original membership test formatted the candidate with the
            # line INDEX while the appended entry used the line CONTENT, so
            # duplicates were never detected; both now use the same string
            candidate = u'{0}\t{1}'.format(dictPaths[rdmFileIndex], lineList[rdmLineIndex])
            while candidate in dejaVu:
                # NOTE(review): getRandomIndex may return None here on an empty
                # pick, which would raise — preserved from the original logic
                rdmFileIndex = getRandomIndex(dictPaths)
                lineList = fileDict[dictPaths[rdmFileIndex]]
                rdmLineIndex = getRandomIndex(lineList)
                candidate = u'{0}\t{1}'.format(dictPaths[rdmFileIndex], lineList[rdmLineIndex])
            # add to the deja vu
            dejaVu.append(candidate)
        # dump
        utilsOs.dumpRawLines(dejaVu, outputDict[extrType], addNewline=True, rewrite=True)
        dumpReferenceToLangFiles(dejaVu, outputDict[extrType])
    return dejaVu
def getANewSpWhereWeLeftOff(refPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    """ Return the next sentence pair to treat, according to the reference file.

    The reference file holds "path\\tindex" lines; the first line whose second
    column is an integer marks where we left off. Recurses with a refreshed
    reference when the current document is exhausted.

    :param refPath: path to the reference ("trRef") file; created empty if missing
    :return: None if no line with a numeric index remains, otherwise a tuple
        ([enSent, frSent], lastSeenPath, indexLn, refLns) where refLns is the
        in-memory reference content updated with the new index
    """
    # check if the ref file already exists
    if utilsOs.theFileExists(refPath) is False:
        utilsOs.createEmptyFile(refPath)
    # open the reference file
    lastSeenIndex, lastSeenPath = None, None
    with open(refPath) as ref:
        # first line
        refLns = ref.readlines()
        refIndex = 0
        for refLn in refLns:
            refList = refLn.replace(u"\n", u"").split(u"\t")
            # test if we have an index for the path
            try:
                lastSeenIndex = int(refList[1])
                lastSeenPath = refList[0]
                break
            # if there is no integral, then it saw all lns for that path
            except ValueError:
                pass
            # next ref index
            refIndex += 1
    # open the last seen file at the (last seen index + 1) and return the sp in the en and fr files
    if lastSeenIndex is None:
        return None
    with open(u"{0}.en".format(lastSeenPath)) as enFile:
        with open(u"{0}.fr".format(lastSeenPath)) as frFile:
            enLn = enFile.readline()
            frLn = frFile.readline()
            indexLn = 0
            # walk both files in lockstep until the next unseen line
            # NOTE(review): the loop condition only checks enLn — assumes the
            # .en and .fr files have the same number of lines; confirm
            while enLn:
                if indexLn == lastSeenIndex+1:
                    # replace the line with its next index and dump the ref file
                    refLns[refIndex] = u"{0}\t{1}\n".format(lastSeenPath, indexLn)
                    # return the sentence pair
                    return [enLn.replace(u"\n", u""), frLn.replace(u"\n", u"")], lastSeenPath, indexLn, refLns
                # next line
                enLn = enFile.readline()
                frLn = frFile.readline()
                indexLn += 1
    # if we went over the whole document and it ended, change the ref line, dump it and start over
    refLns[refIndex] = u"{0}\tdone\n".format(lastSeenPath)
    utilsOs.dumpRawLines(refLns, refPath, addNewline=False, rewrite=True)
    return getANewSpWhereWeLeftOff(refPath)
def getReliableJobTitles(self, jobAndPitchFilePath, lang=u'en', outputPath=None, includeJobsWithNApitch=True):
    '''
    Return a set containing the job titles that might be considered more reliable.
    Filter 1 keeps titles:
        - having less than 3 tokens
        - not having ampersand (&) or slash (/) signs
        - not having acronyms
    Filter 2 keeps titles:
        - present more than once (no hapax)
        - being the right language (en/fr in both 'job' and 'pitch')
    '''
    with codecs.open(jobAndPitchFilePath, 'r', encoding='utf8') as jobPitchFile:
        # first pass: structural filtering of the raw candidates
        candidatesDict = self.reliableFilter1(jobPitchFile)
        # second pass: frequency and language filtering
        setOfReliableJobs = self.reliableFilter2(candidatesDict, lang, includeJobsWithNApitch)
    # dump the output if the output path is specified
    if outputPath is not None:
        utilsOs.dumpRawLines(setOfReliableJobs, outputPath, addNewline=True, rewrite=True)
    return setOfReliableJobs
def repairHeuristicsScore(heuristicName, corpus=(u'ALIGNMENT-QUALITY', u'MISALIGNED', u'QUALITY', u'NOT-FLAGGED')):
    """ Rewrite the score files in order to correct some problems.
    Known heuristic names: u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell',
    u'url', u'mono', u'tabl', 'strBcks', 'punct', 'gibb'

    :param heuristicName: name of the heuristic whose score files must be repaired
    :param corpus: iterable of corpus folder names to walk (the default is now a
        tuple to avoid a mutable default argument; iteration is unchanged)
    """
    def _repairedRatio(scoreList, indexA, indexB):
        # recompute the score as smallest/greatest of the two counts;
        # returns None when both counts are 0 (line left untouched, as before)
        countA, countB = int(scoreList[indexA]), int(scoreList[indexB])
        if countA + countB == 0:
            return None
        return str(float(min(countA, countB)) / float(max(countA, countB)))
    basePath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/'
    for name in corpus:
        scorePath = u'{0}{1}/{2}/score.tsv'.format(basePath, name, heuristicName)
        with open(scorePath) as scoreFile:
            scoreLines = scoreFile.readlines()
        # line by line
        for lnIndex, scoreLn in enumerate(scoreLines):
            scoreList = scoreLn.replace(u'\n', u'').split(u'\t')
            if scoreList[0] != u'na':
                # change depending on heuristic (url and mono share the same
                # repair, only on different columns)
                if heuristicName in [u'url']:
                    newScore = _repairedRatio(scoreList, 3, 4)
                    if newScore is not None:
                        scoreList[0] = newScore
                        scoreLines[lnIndex] = u'{0}\n'.format(u'\t'.join(scoreList))
                elif heuristicName in [u'mono']:
                    newScore = _repairedRatio(scoreList, 1, 2)
                    if newScore is not None:
                        scoreList[0] = newScore
                        scoreLines[lnIndex] = u'{0}\n'.format(u'\t'.join(scoreList))
                elif heuristicName in [u'ion']:
                    # too few tokens to judge: mark the score as not-applicable
                    if int(scoreList[1]) + int(scoreList[2]) <= 2:
                        scoreList[0] = u'na'
                        scoreLines[lnIndex] = u'{0}\n'.format(u'\t'.join(scoreList))
        utilsOs.dumpRawLines(scoreLines, scorePath, addNewline=False, rewrite=True)
def modifyConfigAndIndexFiles(pathToTheExportationEnvironment):
    '''
    Given the path to the sigma.js exportation environment (ending in the folder
    "network/"), changes the config.json file and the index.html file so they
    show the graph the way intended.

    :param pathToTheExportationEnvironment: folder path ending in "network/"
    :raises ValueError: if the colours placeholder line is absent from index.html
        (the original raised an unhelpful NameError in that case)
    '''
    # copying config.json file
    configContent = {
        "type": "network",
        "version": "1.0",
        "data": "data.json",
        "logo": {"file": "", "link": "", "text": ""},
        "text": {"more": "", "intro": "", "title": ""},
        "legend": {"edgeLabel": "", "colorLabel": "", "nodeLabel": ""},
        "features": {"search": True, "groupSelectorAttribute": True, "hoverBehavior": "default"},
        "informationPanel": {"groupByEdgeDirection": True, "imageAttribute": False},
        "sigma": {
            "drawingProperties": {
                "defaultEdgeType": "curve",
                "defaultHoverLabelBGColor": "#002147",
                "defaultLabelBGColor": "#ddd",
                "activeFontStyle": "bold",
                "defaultLabelColor": "#000",
                "labelThreshold": 999,
                "defaultLabelHoverColor": "#fff",
                "fontStyle": "bold",
                "hoverFontStyle": "bold",
                "defaultLabelSize": 14},
            "graphProperties": {
                "maxEdgeSize": 2,
                "minEdgeSize": 2,
                "minNodeSize": 0.25,
                "maxNodeSize": 2.5},
            "mouseProperties": {"maxRatio": 20, "minRatio": 0.75}}}
    pathConfigJson = u'{0}config.json'.format(pathToTheExportationEnvironment)
    if utilsOs.theFileExists(pathConfigJson):
        os.remove(pathConfigJson)
    utilsOs.dumpDictToJsonFile(configContent, pathConfigJson)
    # getting the color information from the data file
    colorCommunityDict = {}
    dataDict = utilsOs.openJsonFileAsDict(u'{0}data.json'.format(pathToTheExportationEnvironment))
    for nodeDict in dataDict[u'nodes']:
        try:
            communityId = nodeDict[u'attributes'][u'community_lvl_0']
            if communityId not in colorCommunityDict:
                colorCommunityDict[communityId] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(
                    nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name_lvl_0'])
        # nodes without community/colour attributes are simply skipped
        except KeyError:
            pass
    # modifying the index.html file
    with open(u'{0}index.html'.format(pathToTheExportationEnvironment)) as indexFile:
        fileLines = indexFile.readlines()
    # find the placeholder line after which the colour legend must be inserted
    indexDivisor = None
    for index, line in enumerate(fileLines):
        if line == u'\t\t<dt class="colours"></dt>\n':
            indexDivisor = index + 1
            break
    if indexDivisor is None:
        # BUGFIX: was an unbound-variable NameError when the placeholder was missing
        raise ValueError(u'colours placeholder line not found in index.html')
    fileLines = (fileLines[:indexDivisor] + [u'\t\t<dd>\n'] + list(colorCommunityDict.values())
                 + [u'\t\t</dd>\n'] + fileLines[indexDivisor+1:])
    utilsOs.dumpRawLines(fileLines, u'{0}index.html'.format(pathToTheExportationEnvironment),
                         addNewline=False, rewrite=True)
def annotateFiles(listOfFilesPath=None, annotatedOutputFolder=u'./002manuallyAnnotated/', dumpSP=True):
    """ Given a list of paths, manually show and annotate the sentence pairs.

    :param listOfFilesPath: None (randomly select 100 docs), a single path
        string, a path to a .json file listing paths, or a list of paths
    :param annotatedOutputFolder: folder receiving the reference/annotation/
        sample dump files (appended to after every annotated pair)
    :param dumpSP: if True, also dump the en/fr sentences of each annotated pair
    """
    referencePathLine = []
    listOfAnnotations = []
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif type(listOfFilesPath) is str:
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
                                                noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [file for file in listOfFilesPath if file not in annotatedFiles]
    # print the annotator cheat sheet
    print(""""0 - badly aligned \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors \n1 - well aligned \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        fileSourcePath = u'{0}.fr'.format(filePath) if u'fr-en' in filePath else u'{0}.en'.format(filePath)
        fileTargetPath = u'{0}.en'.format(filePath) if u'fr-en' in filePath else u'{0}.fr'.format(filePath)
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource, duringSentSource, beforeSentTarget,
                                                    duringSentTarget, listOfAnnotations, lineLength=137)
                # save the reference
                # if the filepath is the reference
                if u'burtrad' in filePath:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                # otherwise we get it from a reference file
                else:
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [ln.replace(u'\n', u'') for ln in refFile.readlines()]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                    frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                    utilsOs.appendLineToFile(enSent, u'{0}sample.en'.format(annotatedOutputFolder),
                                             addNewLine=False)
                    utilsOs.appendLineToFile(frSent, u'{0}sample.fr'.format(annotatedOutputFolder),
                                             addNewLine=False)
                duringIndex = 1
                # for each line
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count if the lines that take the space of 2 lines
                    longLines = getNbLongLines([beforeSentSource, beforeSentTarget, duringSentSource,
                                                duringSentTarget, afterSentSource, afterSentTarget], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while True:
                        if annotatorGeneralInput in [u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3', u'1.0',
                                                     u'1.1', u'c', u'correct']:
                            break
                        else:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorGeneralInput = input(u'Repeat annotation: ')
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (aligned or not)
                        annotatorSpecificInput = input(u'Specific type annotation: ')
                        # BUGFIX: the input is a string, so the original test
                        # "annotatorGeneralInput == 0" was always False and the
                        # 4 misalignment sub-types were never offered
                        typeAnswers = [u'0', u'1', u'2', u'3'] if annotatorGeneralInput == u'0' else [u'0', u'1']
                        # make sure to have the right specific annotation
                        while True:
                            if annotatorSpecificInput in typeAnswers:
                                break
                            else:
                                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                                annotatorSpecificInput = input(u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(float(u'{0}.{1}'.format(annotatorGeneralInput,
                                                                         annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for e in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference to the file
                    # if the filepath is the reference
                    if u'burtrad' in filePath:
                        referencePathLine.append(u'{0}\t{1}'.format(filePath, duringIndex))
                    # otherwise we get it from a reference file
                    else:
                        with open(u'{0}.tsv'.format(filePath)) as refFile:
                            refLns = [ln.replace(u'\n', u'') for ln in refFile.readlines()]
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the file line by line, to be sure in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine,
                                         u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
                                         addNewline=True, rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations,
                                         u'{0}sampleAnnotation.tsv'.format(annotatedOutputFolder),
                                         addNewline=True, rewrite=True)
                    # dump the SP
                    if dumpSP is True:
                        enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                        frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                        utilsOs.appendLineToFile(enSent, u'{0}sample.en'.format(annotatedOutputFolder),
                                                 addNewLine=False)
                        utilsOs.appendLineToFile(frSent, u'{0}sample.fr'.format(annotatedOutputFolder),
                                                 addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
def launchForOneDay(tokLimit=4000, outputFolderPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/", coffeeBreak=1650):
    """ launches the deepL bot for one day's worth
    :param tokLimit: maximum number of tokens to treat in the day
    :param outputFolderPath: path to the folder where will be output the files
    :param coffeeBreak: time in seconds when to take a break and start a new deepL session
    :return: (tokCount, iterCount) — number of tokens translated and number of
        sentence pairs treated (see review note at the return statement)
    """
    start = utilsOs.countTime()
    # path to the referencer, indicating where we left off: path and last index worked
    referencerPath = u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"
    # info
    deepLUrl = u"https://www.deepl.com/translator"
    mUser, mPass, sUser, sPass = b000path.getDeepLProfileInfo()
    # for each user
    for user, passw in zip([sUser, mUser], [sPass, mPass]):
        tokCount = 0
        # open the driver
        session = webdriver.Firefox()
        session.get(deepLUrl)
        # random pauses to mimic a human user
        time.sleep(random.uniform(1.3, 3.1))
        # log to deepL
        session = authentificateBtUseSelenium(user, passw, session)
        # while we have not gone over the daily limit
        iterCount = 0
        while tokCount < (tokLimit-10):
            # get the sp
            sp, filePath, fileIndex, refLns = getANewSpWhereWeLeftOff(referencerPath)
            session, nbOfTok, enFrTranslAndAlt, frEnTranslAndAlt, timeEn, timeFr = translateSpGetResult(session, sp)
            # dump the referencer lines
            utilsOs.dumpRawLines(refLns, referencerPath, addNewline=False, rewrite=True)
            # dump original sp
            utilsOs.appendLineToFile(sp[0], u"{0}originalSent.en".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(sp[1], u"{0}originalSent.fr".format(outputFolderPath), addNewLine=True)
            # dump translation and variants
            utilsOs.appendLineToFile(enFrTranslAndAlt, u"{0}translated.en2fr".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(frEnTranslAndAlt, u"{0}translated.fr2en".format(outputFolderPath), addNewLine=True)
            # dump reference
            utilsOs.appendLineToFile(u"{0}\t{1}\n".format(filePath, fileIndex), u"{0}reference.tsv".format(outputFolderPath), addNewLine=False)
            # dump timestamp
            utilsOs.appendLineToFile(u"{0}\tlocal time: {1}".format(timeEn, transformTimeToLocalTime(timeEn)), u"{0}timestamp.en".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(u"{0}\tlocal time: {1}".format(timeFr, transformTimeToLocalTime(timeFr)), u"{0}timestamp.fr".format(outputFolderPath), addNewLine=True)
            # add number of tokens
            tokCount += nbOfTok
            # add nb of iterations
            iterCount += 1
            # take a coffee break if it's time: close the session and reopen a fresh one
            if coffeeBreak is not None and utilsOs.countTime(start) >= coffeeBreak:
                session.close()
                time.sleep(random.uniform(60, 80))
                start = utilsOs.countTime()
                # open the driver
                session = webdriver.Firefox()
                session.get(deepLUrl)
                time.sleep(random.uniform(1.3, 3.1))
                # log to deepL
                session = authentificateBtUseSelenium(user, passw, session)
                time.sleep(random.uniform(1.0, 1.5))
        # close the driver
        session.close()
        time.sleep(random.uniform(10.0, 15.0))
    # NOTE(review): tokCount and iterCount are reset for each user, so the
    # returned values only reflect the LAST user of the zip — confirm intended
    return tokCount, iterCount
def applyNormalisationGetResult(testFilePath, normOutPath=None, ororazeOutput=(True, True), useAbbrDict=False,
                                normalizationFunction=None, *args):
    '''
    If normalizationFunction is None, it will create the baseline; otherwise it
    will apply the normalization function, ororaze it and evaluate the output.

    :param testFilePath: utf8 tsv gold standard ("Id\\tOriginal\\tGoldStandard"
        lines after a header line)
    :param normOutPath: if not None, detailed per-comment tsv outputs and a
        ".results" summary are dumped alongside this path
    :param ororazeOutput: True, or a (apply, advanced) couple driving ororaZe
    :param useAbbrDict: False, True (default dict) or a custom abbreviation dict
    :param normalizationFunction: optional normalization callable
    :param args: extra positional arguments forwarded to normalizationFunction
    :return: dict with exact positives, total comments and their ratio
    '''
    positiveEvalCounter = 0
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        # get total number of comments (minus the header line)
        totalComments = utilsOs.countLines(gsFile) - 1
    # create an empty file for the norm
    if normOutPath is not None:
        normFile = utilsOs.createEmptyFile(
            normOutPath, headerLine=u'Id\tEvaluation\tErrorTokens\tOriginal\tOutput\tGoldStandard')
        # create a separate file for each column
        origFile = utilsOs.createEmptyFile(normOutPath.replace(u'.tsv', u'1Orig.tsv'),
                                           headerLine=u'Id\tOriginal')
        outFile = utilsOs.createEmptyFile(normOutPath.replace(u'.tsv', u'2Out.tsv'),
                                          headerLine=u'Id\tEvaluation\tOutput')
        goldFile = utilsOs.createEmptyFile(normOutPath.replace(u'.tsv', u'3Gold.tsv'),
                                           headerLine=u'Id\tGoldStandard')
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        # dispose of the header line
        header = gsFile.readline()
        # get first line
        line = gsFile.readline()
        # start an empty dejavuDict
        dejavuDict = {}
        # count and populate the norm
        while line:
            # get data
            lineList = (line.replace(u'\n', u'')).split(u'\t')
            commentId, originalComment, goldStandard = lineList
            # detect french feminin accord and fossilize the word by modifying its
            # structure to something unchanged by the normalization function
            # (BUGFIX: removed the dead "normOutput = str(originalComment)"
            # assignment whose value was immediately overwritten here)
            normOutput = frenchFemininAccordsCodification(originalComment, isInput=True)
            # apply orora solution to abbreviations
            if useAbbrDict != False:
                if useAbbrDict != True:
                    normOutput = ororaZeAbbreviations(normOutput, useAbbrDict)
                else:
                    normOutput = ororaZeAbbreviations(normOutput)
            # apply the normalization function
            if normalizationFunction is not None:
                normOutput, dejavuDict = normalizationFunction(normOutput.lower(), dejavuDict, *args)
            # reverse back the code for the feminin accord into its original form
            normOutput = frenchFemininAccordsCodification(normOutput, isInput=False)
            # get normalized output (comparisons with == True kept on purpose:
            # ororazeOutput may be a couple, not a bool)
            if ororazeOutput == True:
                normOutput = ororaZe(normOutput, advanced=True)
            elif type(ororazeOutput) is tuple or type(ororazeOutput) is list:
                if ororazeOutput[0] == True:
                    normOutput = ororaZe(normOutput, advanced=ororazeOutput[1])
            # evaluation if the normalized output corresponds to the gold standard
            positiveEvalCounter, evaluation = normalizationEvaluator(normOutput, goldStandard,
                                                                     positiveEvalCounter)
            # get the tokens that do not correspond exactly and their edit distance
            errorTokList = utilsString.getcorrespondingTokensAndEditDist(normOutput, goldStandard) if evaluation == 0 else u'na'
            # dump
            if normOutPath is not None:
                normFile.write(u'{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n'.format(
                    commentId, evaluation, errorTokList, originalComment, normOutput, goldStandard))
                # dump to column separate files
                origFile.write(u'{0}\t{1}\n'.format(commentId, originalComment))
                outFile.write(u'{0}\t{1}\t{2}\t{3}\n'.format(commentId, evaluation, errorTokList, normOutput))
                goldFile.write(u'{0}\t{1}\n'.format(commentId, goldStandard))
            # next line
            line = gsFile.readline()
    # guard against a header-only gold standard (ZeroDivisionError otherwise)
    ratio = float(positiveEvalCounter) / float(totalComments) if totalComments != 0 else 0.0
    if normOutPath is not None:
        # close the norm file
        normFile.close()
        # close the other files
        origFile.close()
        outFile.close()
        goldFile.close()
        # dump the results (BUGFIX: this was done unconditionally and crashed with
        # an AttributeError on normOutPath.replace when normOutPath was None)
        resultsPath = u'{0}.results'.format(normOutPath.replace(u'.tsv', u''))
        utilsOs.dumpRawLines([
            u'NORMALIZATION RESULTS',
            u'exact positives: {0}/{1}'.format(positiveEvalCounter, totalComments),
            u'ratio: {0}'.format(ratio)
        ], resultsPath)
    return {
        u'exact positives': positiveEvalCounter,
        u'total comments': totalComments,
        u'ratio': ratio
    }
def changeAnnotations(folderPathToReannotate, annotationTochange=(u'0.3', u'1.1')):
    """ Given a path where to find the annotation files, change the annotation
    (new) for the ones with a specific annotation (old).

    :param folderPathToReannotate: folder containing the annotation data files
    :param annotationTochange: annotation string(s) to re-annotate; a single
        string is accepted (the default is now a tuple to avoid a mutable
        default argument; iteration is unchanged)
    """
    # transform the annotation into a list if need be
    if isinstance(annotationTochange, str):
        annotationTochange = [annotationTochange]
    # get the annotation data
    sentEnList, sentFrList, sentRefList, sentAnnotList = getAnnotationData(folderPathToReannotate)
    # print the annotator cheat sheet
    printCheatSheet()
    # annotate only when we find the problematic old annotation
    for indexAnnot, oldAnnot in enumerate(list(sentAnnotList)):
        if oldAnnot in annotationTochange:
            src = sentEnList[indexAnnot] if u'en-fr' in sentRefList[indexAnnot] else sentFrList[indexAnnot]
            trgt = sentFrList[indexAnnot] if u'en-fr' in sentRefList[indexAnnot] else sentEnList[indexAnnot]
            print(u'{0} - {1}'.format(indexAnnot + 1, src))
            print(u'{0} - {1}'.format(indexAnnot + 1, trgt))
            # get the first part of the annotation (aligned or not)
            annotatorGeneralInput = input(u'Old annotation is {0}, what is the new one: '.format(oldAnnot))
            # make sure to have the right general annotation
            while True:
                if annotatorGeneralInput in [u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1', u'1.2',
                                             u'1.3', u'1.4', u'c', u'correction']:
                    break
                else:
                    utilsOs.moveUpAndLeftNLines(1, slowly=False)
                    annotatorGeneralInput = input(u'Repeat annotation: ')
            # BUGFIX: the validation list above accepts u'correction' but this
            # branch only tested u'c'/u'correct', so typing "correction" fell
            # through and was stored verbatim as an annotation
            if annotatorGeneralInput in [u'c', u'correct', u'correction']:
                annotatorGeneralInput, sentAnnotList = correctionToAnnotation(sentAnnotList)
            # if we still need to specify what type of alignment or misalignment
            if annotatorGeneralInput in [u'0', u'1']:
                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                # get the second part of the annotation (aligned or not)
                annotatorSpecificInput = input(u'Specific type annotation: ')
                # BUGFIX: the input is a string, so the original test
                # "annotatorGeneralInput == 0" was always False and the
                # misaligned sub-type list was never used
                typeAnswers = [u'0', u'1', u'2'] if annotatorGeneralInput == u'0' else [u'0', u'1', u'2',
                                                                                       u'3', u'4']
                # make sure to have the right specific annotation
                while True:
                    if annotatorSpecificInput in typeAnswers:
                        break
                    else:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        annotatorSpecificInput = input(u'Repeat type annotation: ')
                # save to the list of annotations
                sentAnnotList[indexAnnot] = u'{0}.{1}'.format(annotatorGeneralInput, annotatorSpecificInput)
            # if the right answer was given in the right format right away
            else:
                # save to the list of annotations
                sentAnnotList[indexAnnot] = str(annotatorGeneralInput)
            # remove the lines from the terminal before getting to the next pair
            utilsOs.moveUpAndLeftNLines(3, slowly=False)
            # erase all remainder of the previous sentences and go back up again
            for e in range(2):
                print(u' ' * (max([len(src), len(trgt)]) + 6))
            utilsOs.moveUpAndLeftNLines(2, slowly=False)
    # remove format problematic annotations
    sentAnnotList = [annot if annot != u'1.1.0' else u'1.1' for annot in sentAnnotList]
    sentAnnotList = [annot if annot != u'0.1.0' else u'0.1' for annot in sentAnnotList]
    # dump new annotation
    sentAnnotPath = u'{0}sampleAnnotation.tsv'.format(folderPathToReannotate)
    utilsOs.dumpRawLines(sentAnnotList, sentAnnotPath, addNewline=True, rewrite=True)
def annotateFilesAfterHeurAndSelection(inputFolderPath, outputFolderPath, dumpSP=True):
    """ Given a folder path, where the reference, en line and fr line are already
    selected, annotate the SPs.

    :param inputFolderPath: folder holding sampleReference.Paths, sample.en,
        sample.fr and scores.tsv (trailing slash added if missing)
    :param outputFolderPath: folder receiving the annotation dumps (trailing
        slash added if missing)
    :param dumpSP: if True, also dump the en/fr sentences and the score line of
        each annotated pair
    """
    # add a slash if needed
    if inputFolderPath[-1] != u'/':
        inputFolderPath = u'{0}/'.format(inputFolderPath)
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # get the selected reference file lines
    with open(u'{0}sampleReference.Paths'.format(inputFolderPath)) as refPathsFile:
        referenceLines = refPathsFile.readlines()
    # get the en and fr input lines
    with open(u'{0}sample.en'.format(inputFolderPath)) as enFile:
        enLns = enFile.readlines()
    with open(u'{0}sample.fr'.format(inputFolderPath)) as frFile:
        frLns = frFile.readlines()
    with open(u'{0}scores.tsv'.format(inputFolderPath)) as scFile:
        scLns = scFile.readlines()
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(u'{0}sampleReference.tsv'.format(outputFolderPath)):
        # get the already seen lines
        referencePathLine = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(outputFolderPath), noNewLineChar=True)
        listOfAnnotations = utilsOs.readAllLinesFromFile(
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath), noNewLineChar=True)
        # maintain only what we haven't seen
        annotatedFiles = set(referencePathLine)
        newRefLines = []
        for ind, file in enumerate(referenceLines):
            if file.replace(u'\n', u'') not in annotatedFiles:
                # keep the ORIGINAL index so en/fr/score lines stay aligned
                newRefLines.append([ind, file.replace(u'\n', u'')])
        referenceLines = newRefLines
        # print(referenceLines)
    else:
        referencePathLine = []
        listOfAnnotations = []
        referenceLines = [(ind, file.replace(u'\n', u'')) for ind, file in enumerate(referenceLines)]
    # print the annotator cheat sheet
    printCheatSheet()
    # open each file in EN and FR and show it in the terminal
    for tupleRef in referenceLines:
        indRef, refLn = tupleRef[0], tupleRef[1]
        print(u'############# {0} ##############'.format(refLn.replace(u'\n', u'')))
        # get the path for the source and target
        lnsSource = enLns if u'en-fr' in refLn else frLns
        lnsTarget = frLns if u'en-fr' in refLn else enLns
        # get the correct terminal line length
        lineLength = 137 - len(str(len(listOfAnnotations) + 1))
        # color in red the during lines
        redDuringSource = u'\033[1;31m{0}\033[0m'.format(lnsSource[indRef])
        # print the sentences
        print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
        print(u'{0} - {1}'.format(len(listOfAnnotations), lnsTarget[indRef]))
        print()
        # count the lines that take the space of 2 lines
        longLines = getNbLongLines([lnsSource[indRef], lnsTarget[indRef]], lineLength)
        # get the first part of the annotation (aligned or not)
        annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
        # make sure to have the right general annotation
        while True:
            if annotatorGeneralInput in [u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1', u'1.2',
                                         u'1.3', u'1.4', u'c', u'correction']:
                break
            else:
                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                annotatorGeneralInput = input(u'Repeat annotation: ')
        # NOTE(review): the validation list accepts u'correction' but this branch
        # tests u'correct' — typing "correction" falls through to float() below
        # and raises; confirm which spelling is intended
        if annotatorGeneralInput in [u'c', u'correct']:
            annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(listOfAnnotations)
        # save to the list of annotations
        listOfAnnotations.append(float(annotatorGeneralInput))
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # erase all remainder of the previous sentences and go back up again
        # NOTE(review): prints 14+longLines blank lines but only moves up
        # 7+longLines before and after — confirm the asymmetry is intended
        for e in range(14 + longLines):
            print(u' ' * (lineLength + 4))
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # append the reference to the file
        referencePathLine.append(refLn)
        # dump the file line by line, to be sure in case of error
        # dump the reference
        utilsOs.dumpRawLines(
            referencePathLine, u'{0}sampleReference.tsv'.format(outputFolderPath),
            addNewline=True, rewrite=True)
        # dump the annotation
        utilsOs.dumpRawLines(
            listOfAnnotations, u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            addNewline=True, rewrite=True)
        # dump the SP
        if dumpSP is True:
            enSent = lnsSource[indRef] if u'en-fr' in refLn else lnsTarget[indRef]
            frSent = lnsTarget[indRef] if u'en-fr' in refLn else lnsSource[indRef]
            utilsOs.appendLineToFile(enSent, u'{0}sample.en'.format(outputFolderPath), addNewLine=False)
            utilsOs.appendLineToFile(frSent, u'{0}sample.fr'.format(outputFolderPath), addNewLine=False)
            utilsOs.appendLineToFile(scLns[indRef], u'{0}scores.tsv'.format(outputFolderPath), addNewLine=False)
    # clear part of terminal
    utilsOs.moveUpAndLeftNLines(7, slowly=False)