def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    ###NEED TO ANALYSE IF REMOVE IT AND REPLACE IT WITH makeTokenCountDictFromText() DEFINITELY
    ######################################################################
    takes a corpus file, makes a dict of tokens with their normalized
    frequency and dumps the result in a json file
    VERY SIMILAR TO makeTokenCountDictFromText() BUT MORE HANDS-ON AND SELF-BUILT

    Every occurrence of a token adds 1/(number of lines read) to its value,
    so the values are frequencies relative to the number of lines, not raw
    counts. A token containing at least one uppercase character is also
    counted a second time under its lowercased form.

    NOTE(review): another function with this exact name is defined further
    down in this file; if both live at module level the later one wins.

    :param inputFilePath: path to the corpus file (one string per line)
    :param outputFilePath: path of the json dump; when None, a file named
        'tokens.json' placed next to the input file is used
    :return: dict mapping each token to its normalized frequency
    '''
    import utilsOs
    tokenDict = {}
    # second positional argument presumably strips the newline chars - TODO confirm
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    # weight of one occurrence, computed once; the guard keeps an empty file
    # from raising ZeroDivisionError now that the division is outside the loop
    occurrenceWeight = (1.0 / len(stringList)) if stringList else 0.0
    for string in stringList:
        # slashes are treated as token separators
        for token in naiveRegexTokenizer(string.replace(u'/', u' ')):
            tokenDict[token] = tokenDict.get(token, 0.0) + occurrenceWeight
            # we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                loweredToken = token.lower()
                tokenDict[loweredToken] = tokenDict.get(loweredToken, 0.0) + occurrenceWeight
    if outputFilePath is None:
        # swap only the last path component; a plain str.replace() would also
        # rewrite any folder whose name contains the file name
        folderPath, separator, _ = inputFilePath.rpartition(u'/')
        outputFilePath = utilsOs.safeFilePath(
            folderPath + separator + 'tokens.json')
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
def quadrigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    takes a corpus file, makes a dict of character 4grams with their
    normalized frequency and dumps the result in a json file

    All the lines are joined with a single space and every window of 4
    consecutive characters adds 1/(number of lines read) to its value, so
    the values are frequencies relative to the number of lines, not raw
    counts.

    :param inputFilePath: path to the corpus file (one string per line)
    :param outputFilePath: path of the json dump; when None, a file named
        'quadrigrams.json' placed next to the input file is used
    :return: dict mapping each character 4gram to its normalized frequency
    '''
    quadrigramDict = {}
    # second positional argument presumably strips the newline chars - TODO confirm
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    # weight of one occurrence, computed once; the guard keeps an empty file
    # from raising ZeroDivisionError now that the division is outside the loop
    occurrenceWeight = (1.0 / len(stringList)) if stringList else 0.0
    for start in range(len(langString) - 3):
        quadrigram = langString[start:start + 4]
        quadrigramDict[quadrigram] = quadrigramDict.get(
            quadrigram, 0.0) + occurrenceWeight
    if outputFilePath is None:
        # swap only the last path component; a plain str.replace() would also
        # rewrite any folder whose name contains the file name
        folderPath, separator, _ = inputFilePath.rpartition(u'/')
        outputFilePath = utilsOs.safeFilePath(
            folderPath + separator + 'quadrigrams.json')
    utilsOs.dumpDictToJsonFile(quadrigramDict, outputFilePath)
    return quadrigramDict
def trigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    takes a corpus file, makes a dict of character 3grams with their
    normalized frequency and dumps the result in a json file

    All the lines are joined with a single space and every window of 3
    consecutive characters adds 1/(number of lines read) to its value, so
    the values are frequencies relative to the number of lines, not raw
    counts.

    :param inputFilePath: path to the corpus file (one string per line)
    :param outputFilePath: path of the json dump; when None, a file named
        'trigrams.json' placed next to the input file is used
    :return: dict mapping each character 3gram to its normalized frequency
    '''
    import utilsOs
    trigramDict = {}
    # second positional argument presumably strips the newline chars - TODO confirm
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    # weight of one occurrence, computed once; the guard keeps an empty file
    # from raising ZeroDivisionError now that the division is outside the loop
    occurrenceWeight = (1.0 / len(stringList)) if stringList else 0.0
    for start in range(len(langString) - 2):
        trigram = langString[start:start + 3]
        trigramDict[trigram] = trigramDict.get(trigram, 0.0) + occurrenceWeight
    if outputFilePath is None:
        # swap only the last path component; a plain str.replace() would also
        # rewrite any folder whose name contains the file name
        folderPath, separator, _ = inputFilePath.rpartition(u'/')
        outputFilePath = utilsOs.safeFilePath(
            folderPath + separator + 'trigrams.json')
    utilsOs.dumpDictToJsonFile(trigramDict, outputFilePath)
    return trigramDict
def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    takes a corpus file, makes a dict of tokens with their normalized
    frequency and dumps the result in a json file

    Every occurrence of a token adds 1/(number of lines read) to its value,
    so the values are frequencies relative to the number of lines, not raw
    counts. A token containing at least one uppercase character is also
    counted a second time under its lowercased form.

    NOTE(review): a function with this exact name is also defined earlier in
    this file; if both live at module level this later one is the one that
    stays bound.

    :param inputFilePath: path to the corpus file (one string per line)
    :param outputFilePath: path of the json dump; when None, a file named
        'tokens.json' placed next to the input file is used
    :return: dict mapping each token to its normalized frequency
    '''
    tokenDict = {}
    # second positional argument presumably strips the newline chars - TODO confirm
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    # weight of one occurrence, computed once; the guard keeps an empty file
    # from raising ZeroDivisionError now that the division is outside the loop
    occurrenceWeight = (1.0 / len(stringList)) if stringList else 0.0
    for string in stringList:
        # slashes are treated as token separators
        for token in naiveRegexTokenizer(string.replace(u'/', u' ')):
            tokenDict[token] = tokenDict.get(token, 0.0) + occurrenceWeight
            # we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                loweredToken = token.lower()
                tokenDict[loweredToken] = tokenDict.get(loweredToken, 0.0) + occurrenceWeight
    if outputFilePath is None:
        # swap only the last path component; a plain str.replace() would also
        # rewrite any folder whose name contains the file name
        folderPath, separator, _ = inputFilePath.rpartition(u'/')
        outputFilePath = utilsOs.safeFilePath(
            folderPath + separator + 'tokens.json')
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
def annotateFiles(listOfFilesPath=None,
                  annotatedOutputFolder=u'./002manuallyAnnotated/',
                  dumpSP=True):
    """
    given a list of paths, manually show and annotate the sentence pairs

    For every file path, the '.en' and '.fr' versions are opened side by
    side. The first sentence pair is annotated through annotateFirstSP();
    every following pair is shown in the terminal between its previous and
    next pair and the annotator is prompted for an annotation. After each
    annotated pair the reference and annotation lists are dumped again, so
    that an error does not lose the work already done.

    :param listOfFilesPath: list of file paths (without the language
        extension), or a single path as a string, or the path to a '.json'
        file containing the paths; when None, 100 documents are randomly
        selected
    :param annotatedOutputFolder: folder prefix (must end with a slash)
        where 'sampleReference.tsv', 'sampleAnnotation.tsv', 'sample.en'
        and 'sample.fr' are written
    :param dumpSP: when exactly True, the annotated sentences are appended
        to the 'sample.en' and 'sample.fr' files
    :return: None
    """
    referencePathLine = []
    listOfAnnotations = []
    referenceDumpPath = u'{0}sampleReference.tsv'.format(annotatedOutputFolder)
    annotationDumpPath = u'{0}sampleAnnotation.tsv'.format(
        annotatedOutputFolder)
    enDumpPath = u'{0}sample.en'.format(annotatedOutputFolder)
    frDumpPath = u'{0}sample.fr'.format(annotatedOutputFolder)
    validGeneralAnswers = [
        u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3', u'1.0', u'1.1', u'c',
        u'correct'
    ]
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(
            b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif isinstance(listOfFilesPath, str):
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    # NOTE(review): the previously dumped references are only used as a
    # filter, they are not loaded back into referencePathLine - confirm that
    # the later dumps (rewrite=True) are not expected to preserve them
    if utilsOs.theFileExists(referenceDumpPath):
        refLines = utilsOs.readAllLinesFromFile(referenceDumpPath,
                                                noNewLineChar=True)
        annotatedFiles = {line.split(u'\t')[0] for line in refLines}
        listOfFilesPath = [
            path for path in listOfFilesPath if path not in annotatedFiles
        ]
    # print the annotator cheat sheet
    print(""""0 - badly aligned \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors \n1 - well aligned \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(
                u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        if u'fr-en' in filePath:
            fileSourcePath = u'{0}.fr'.format(filePath)
            fileTargetPath = u'{0}.en'.format(filePath)
        else:
            fileSourcePath = u'{0}.en'.format(filePath)
            fileTargetPath = u'{0}.fr'.format(filePath)
        # look at the extension only: a substring test on the whole path
        # would be fooled by any folder name containing '.en'
        sourceIsEnglish = fileSourcePath.endswith(u'.en')
        with open(fileSourcePath) as fileSource, \
                open(fileTargetPath) as fileTarget:
            # show the context of the annotated sentence
            beforeSentSource = fileSource.readline()
            duringSentSource = fileSource.readline()
            beforeSentTarget = fileTarget.readline()
            duringSentTarget = fileTarget.readline()
            # annotate the first sentence pair
            listOfAnnotations = annotateFirstSP(beforeSentSource,
                                                duringSentSource,
                                                beforeSentTarget,
                                                duringSentTarget,
                                                listOfAnnotations,
                                                lineLength=137)
            # save the reference
            # if the filepath is the reference we build the line ourselves,
            # otherwise we get it from a reference file, read only once
            pathIsReference = u'burtrad' in filePath
            refLns = None
            if pathIsReference:
                referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
            else:
                with open(u'{0}.tsv'.format(filePath)) as refFile:
                    refLns = [
                        ln.replace(u'\n', u'') for ln in refFile.readlines()
                    ]
                referencePathLine.append(refLns[0])
            # dump the first SP
            if dumpSP is True:
                enSent = beforeSentSource if sourceIsEnglish else beforeSentTarget
                frSent = beforeSentTarget if sourceIsEnglish else beforeSentSource
                utilsOs.appendLineToFile(enSent, enDumpPath, addNewLine=False)
                utilsOs.appendLineToFile(frSent, frDumpPath, addNewLine=False)
            duringIndex = 1
            # for each line (readline returns an empty string at end of file)
            while duringSentSource or duringSentTarget:
                currentIndex = len(listOfAnnotations)
                # get the correct terminal line length
                lineLength = 137 - len(str(currentIndex + 1))
                # get the sentences
                afterSentSource = fileSource.readline()
                afterSentTarget = fileTarget.readline()
                # color in red the during lines
                redDuringSource = u'\033[1;31m{0}\033[0m'.format(
                    duringSentSource)
                redDuringTarget = u'\033[1;31m{0}\033[0m'.format(
                    duringSentTarget)
                # print the sentences
                print(u'{0} - {1}'.format(currentIndex - 1, beforeSentSource))
                print(u'{0} - {1}'.format(currentIndex - 1, beforeSentTarget))
                print(u'{0} - {1}'.format(currentIndex, redDuringSource))
                print(u'{0} - {1}'.format(currentIndex, redDuringTarget))
                print(u'{0} - {1}'.format(currentIndex + 1, afterSentSource))
                print(u'{0} - {1}'.format(currentIndex + 1, afterSentTarget))
                print()
                # count the lines that take the space of 2 lines
                longLines = getNbLongLines([
                    beforeSentSource, beforeSentTarget, duringSentSource,
                    duringSentTarget, afterSentSource, afterSentTarget
                ], lineLength)
                # get the first part of the annotation (aligned or not)
                # and make sure to have the right general annotation
                annotatorGeneralInput = input(
                    u'Aligned-Misaligned annotation: ')
                while annotatorGeneralInput not in validGeneralAnswers:
                    utilsOs.moveUpAndLeftNLines(1, slowly=False)
                    annotatorGeneralInput = input(u'Repeat annotation: ')
                if annotatorGeneralInput in [u'c', u'correct']:
                    annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                        listOfAnnotations)
                # if we still need to specify what type of alignment or misalignment
                if annotatorGeneralInput in [u'0', u'1']:
                    utilsOs.moveUpAndLeftNLines(1, slowly=False)
                    # get the second part of the annotation (aligned or not)
                    annotatorSpecificInput = input(
                        u'Specific type annotation: ')
                    # the input is a string: comparing it with the integer 0
                    # was always False and made the types 0.2 and 0.3
                    # impossible to enter in two steps
                    if annotatorGeneralInput == u'0':
                        typeAnswers = [u'0', u'1', u'2', u'3']
                    else:
                        typeAnswers = [u'0', u'1']
                    # make sure to have the right specific annotation
                    while annotatorSpecificInput not in typeAnswers:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        annotatorSpecificInput = input(
                            u'Repeat type annotation: ')
                    # save to the list of annotations
                    listOfAnnotations.append(
                        float(u'{0}.{1}'.format(annotatorGeneralInput,
                                                annotatorSpecificInput)))
                # if the right answer was given in the right format right away
                else:
                    # save to the list of annotations
                    listOfAnnotations.append(float(annotatorGeneralInput))
                # remove the lines from the terminal before getting to the next pair
                utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                # erase all remainder of the previous sentences and go back up again
                for _ in range(14 + longLines):
                    print(u' ' * (lineLength + 4))
                utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                # next line source
                beforeSentSource = duringSentSource
                duringSentSource = afterSentSource
                # next line target
                beforeSentTarget = duringSentTarget
                duringSentTarget = afterSentTarget
                # append the reference of the pair we just annotated
                if pathIsReference:
                    referencePathLine.append(u'{0}\t{1}'.format(
                        filePath, duringIndex))
                else:
                    referencePathLine.append(refLns[duringIndex])
                # add 1 to index
                duringIndex += 1
                # dump the file line by line, to be sure in case of error
                # dump the reference
                utilsOs.dumpRawLines(referencePathLine,
                                     referenceDumpPath,
                                     addNewline=True,
                                     rewrite=True)
                # dump the annotation
                utilsOs.dumpRawLines(listOfAnnotations,
                                     annotationDumpPath,
                                     addNewline=True,
                                     rewrite=True)
                # dump the SP (after the shift above, the "before" sentences
                # are the pair that was just annotated)
                if dumpSP is True:
                    enSent = beforeSentSource if sourceIsEnglish else beforeSentTarget
                    frSent = beforeSentTarget if sourceIsEnglish else beforeSentSource
                    utilsOs.appendLineToFile(enSent,
                                             enDumpPath,
                                             addNewLine=False)
                    utilsOs.appendLineToFile(frSent,
                                             frDumpPath,
                                             addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
def annotateFilesAfterHeurAndSelection(inputFolderPath,
                                       outputFolderPath,
                                       dumpSP=True):
    """
    given a folder path, where the reference, en line and fr line are
    already selected, annotate the SPs

    The input folder must contain 'sampleReference.Paths', 'sample.en',
    'sample.fr' and 'scores.tsv', which are read in parallel (the same index
    is used in all four). Each not yet annotated sentence pair is shown in
    the terminal and the annotator is prompted for an annotation. After each
    pair the reference and annotation lists are dumped again in the output
    folder, so that an error does not lose the work already done.

    :param inputFolderPath: folder containing the selected sample files
    :param outputFolderPath: folder where 'sampleReference.tsv',
        'sampleAnnotation.tsv', 'sample.en', 'sample.fr' and 'scores.tsv'
        are written; if 'sampleReference.tsv' already exists there, the
        references it lists are skipped and the session is resumed
    :param dumpSP: when exactly True, the annotated sentences and their
        score line are appended to the output sample files
    :return: None
    """
    # add a slash if needed
    if inputFolderPath[-1] != u'/':
        inputFolderPath = u'{0}/'.format(inputFolderPath)
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    referenceDumpPath = u'{0}sampleReference.tsv'.format(outputFolderPath)
    annotationDumpPath = u'{0}sampleAnnotation.tsv'.format(outputFolderPath)
    # both spellings are accepted everywhere: the validation used to accept
    # only 'correction' while the correction branch only tested 'correct',
    # so 'correction' ended up in float() and raised a ValueError
    correctionAnswers = [u'c', u'correct', u'correction']
    validAnswers = [
        u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1', u'1.2', u'1.3',
        u'1.4'
    ] + correctionAnswers
    # get the selected reference file lines
    with open(u'{0}sampleReference.Paths'.format(
            inputFolderPath)) as refPathsFile:
        referenceLines = refPathsFile.readlines()
    # get the en and fr input lines
    with open(u'{0}sample.en'.format(inputFolderPath)) as enFile:
        enLns = enFile.readlines()
    with open(u'{0}sample.fr'.format(inputFolderPath)) as frFile:
        frLns = frFile.readlines()
    with open(u'{0}scores.tsv'.format(inputFolderPath)) as scFile:
        scLns = scFile.readlines()
    # pair every reference with its index in the input files
    indexedReferences = [(ind, ref.replace(u'\n', u''))
                         for ind, ref in enumerate(referenceLines)]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(referenceDumpPath):
        # get the already seen lines
        referencePathLine = utilsOs.readAllLinesFromFile(referenceDumpPath,
                                                         noNewLineChar=True)
        listOfAnnotations = utilsOs.readAllLinesFromFile(annotationDumpPath,
                                                         noNewLineChar=True)
        # maintain only what we haven't seen
        annotatedFiles = set(referencePathLine)
        indexedReferences = [(ind, ref) for ind, ref in indexedReferences
                             if ref not in annotatedFiles]
    else:
        referencePathLine = []
        listOfAnnotations = []
    # print the annotator cheat sheet
    printCheatSheet()
    # open each file in EN and FR and show it in the terminal
    for indRef, refLn in indexedReferences:
        print(u'############# {0} ##############'.format(
            refLn.replace(u'\n', u'')))
        # get the lines for the source and target
        if u'en-fr' in refLn:
            lnsSource, lnsTarget = enLns, frLns
        else:
            lnsSource, lnsTarget = frLns, enLns
        currentIndex = len(listOfAnnotations)
        # get the correct terminal line length
        lineLength = 137 - len(str(currentIndex + 1))
        # color in red the during lines
        redDuringSource = u'\033[1;31m{0}\033[0m'.format(lnsSource[indRef])
        # print the sentences
        print(u'{0} - {1}'.format(currentIndex, redDuringSource))
        print(u'{0} - {1}'.format(currentIndex, lnsTarget[indRef]))
        print()
        # count the lines that take the space of 2 lines
        longLines = getNbLongLines([lnsSource[indRef], lnsTarget[indRef]],
                                   lineLength)
        # get the annotation and make sure it is a valid one
        annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
        while annotatorGeneralInput not in validAnswers:
            utilsOs.moveUpAndLeftNLines(1, slowly=False)
            annotatorGeneralInput = input(u'Repeat annotation: ')
        if annotatorGeneralInput in correctionAnswers:
            annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                listOfAnnotations)
        # save to the list of annotations
        listOfAnnotations.append(float(annotatorGeneralInput))
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # erase all remainder of the previous sentences and go back up again
        # (14 lines are blanked but only 7 are climbed back here; the 7 left
        # are climbed back at the end of the iteration)
        for _ in range(14 + longLines):
            print(u' ' * (lineLength + 4))
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # append the reference to the file
        referencePathLine.append(refLn)
        # dump the file line by line, to be sure in case of error
        # dump the reference
        utilsOs.dumpRawLines(referencePathLine,
                             referenceDumpPath,
                             addNewline=True,
                             rewrite=True)
        # dump the annotation
        utilsOs.dumpRawLines(listOfAnnotations,
                             annotationDumpPath,
                             addNewline=True,
                             rewrite=True)
        # dump the SP
        if dumpSP is True:
            # whatever the direction, the english sentence always comes from
            # the en lines and the french one from the fr lines
            utilsOs.appendLineToFile(enLns[indRef],
                                     u'{0}sample.en'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(frLns[indRef],
                                     u'{0}sample.fr'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(scLns[indRef],
                                     u'{0}scores.tsv'.format(outputFolderPath),
                                     addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(7, slowly=False)