Esempio n. 1
0
def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
	###NEED TO ANALYSE IF REMOVE IT AND REPLACE IT WITH makeTokenCountDictFromText() DEFINITELY
	######################################################################
	takes a corpus file, makes a dict of tokens with their count
	and dumps the result in a json file
	VERY SIMILAR TO makeTokenCountDictFromText() BUT MORE HANDS-ON AND SELF-BUILT
	'''
    import utilsOs
    tokenDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    for string in stringList:
        tokenList = naiveRegexTokenizer(string.replace(u'/', u' '))
        for token in tokenList:
            tokenDict[token] = tokenDict.get(token,
                                             0.0) + (1.0 / len(stringList))
            #we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                tokenDict[token.lower()] = tokenDict.get(
                    token.lower(), 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'tokens.json'))
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
Esempio n. 2
0
def quadrigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
	takes a corpus file, makes a dict of 4grams with their cooccurrence
	and dumps the result in a json file
	'''
    quadrigramDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    for i in range(len(langString) - 3):
        quadrigramDict[langString[i:i + 4]] = quadrigramDict.get(
            langString[i:i + 4], 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'quadrigrams.json'))
    utilsOs.dumpDictToJsonFile(quadrigramDict, outputFilePath)
    return quadrigramDict
Esempio n. 3
0
def trigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
	takes a corpus file, makes a dict of character 3grams with their count
	and dumps the result in a json file
	'''
    import utilsOs
    trigramDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    for i in range(len(langString) - 2):
        trigramDict[langString[i:i + 3]] = trigramDict.get(
            langString[i:i + 3], 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'trigrams.json'))
    utilsOs.dumpDictToJsonFile(trigramDict, outputFilePath)
    return trigramDict
Esempio n. 4
0
def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
	takes a corpus file, makes a dict of tokens with their cooccurrence
	and dumps the result in a json file
	'''
    tokenDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    for string in stringList:
        tokenList = naiveRegexTokenizer(string.replace(u'/', u' '))
        for token in tokenList:
            tokenDict[token] = tokenDict.get(token,
                                             0.0) + (1.0 / len(stringList))
            #we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                tokenDict[token.lower()] = tokenDict.get(
                    token.lower(), 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(
                inputFilePath.split(u'/')[-1], 'tokens.json'))
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
def annotateFiles(listOfFilesPath=None,
                  annotatedOutputFolder=u'./002manuallyAnnotated/',
                  dumpSP=True):
    """ given a list of paths, manually show and annotate the sentence pairs """
    referencePathLine = []
    listOfAnnotations = []
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(
            b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif type(listOfFilesPath) is str:
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
            noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [
            file for file in listOfFilesPath if file not in annotatedFiles
        ]
    # print the annotator cheat sheet
    print(""""0 - badly aligned
        \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy
        \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography
        \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines
        \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors
        \n1 - well aligned
        \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections 
        \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(
                u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        fileSourcePath = u'{0}.fr'.format(
            filePath) if u'fr-en' in filePath else u'{0}.en'.format(filePath)
        fileTargetPath = u'{0}.en'.format(
            filePath) if u'fr-en' in filePath else u'{0}.fr'.format(filePath)
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource,
                                                    duringSentSource,
                                                    beforeSentTarget,
                                                    duringSentTarget,
                                                    listOfAnnotations,
                                                    lineLength=137)
                # save the reference
                # if the filepath is the reference
                if u'burtrad' in filePath:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                # otherwise we get it from a reference file
                else:
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [
                            ln.replace(u'\n', u'')
                            for ln in refFile.readlines()
                        ]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                    frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                    utilsOs.appendLineToFile(
                        enSent,
                        u'{0}sample.en'.format(annotatedOutputFolder),
                        addNewLine=False)
                    utilsOs.appendLineToFile(
                        frSent,
                        u'{0}sample.fr'.format(annotatedOutputFolder),
                        addNewLine=False)
                duringIndex = 1
                # for each line
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(
                        duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(
                        duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringTarget))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count if the lines that take the space of 2 lines
                    longLines = getNbLongLines([
                        beforeSentSource, beforeSentTarget, duringSentSource,
                        duringSentTarget, afterSentSource, afterSentTarget
                    ], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(
                        u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while True:
                        if annotatorGeneralInput in [
                                u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3',
                                u'1.0', u'1.1', u'c', u'correct'
                        ]:
                            break
                        else:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorGeneralInput = input(
                                u'Repeat annotation: ')
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                            listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (aligned or not)
                        annotatorSpecificInput = input(
                            u'Specific type annotation: ')
                        typeAnswers = [
                            u'0', u'1', u'2', u'3'
                        ] if annotatorGeneralInput == 0 else [u'0', u'1']
                        # make sure to have the right specific annotation
                        while True:
                            if annotatorSpecificInput in typeAnswers:
                                break
                            else:
                                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                                annotatorSpecificInput = input(
                                    u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(
                            float(u'{0}.{1}'.format(annotatorGeneralInput,
                                                    annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for e in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference to the file
                    # if the filepath is the reference
                    if u'burtrad' in filePath:
                        referencePathLine.append(u'{0}\t{1}'.format(
                            filePath, duringIndex))
                    # otherwise we get it from a reference file
                    else:
                        with open(u'{0}.tsv'.format(filePath)) as refFile:
                            refLns = [
                                ln.replace(u'\n', u'')
                                for ln in refFile.readlines()
                            ]
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the file line by line, to be sure in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine,
                                         u'{0}sampleReference.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations,
                                         u'{0}sampleAnnotation.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the SP
                    if dumpSP is True:
                        enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                        frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                        utilsOs.appendLineToFile(
                            enSent,
                            u'{0}sample.en'.format(annotatedOutputFolder),
                            addNewLine=False)
                        utilsOs.appendLineToFile(
                            frSent,
                            u'{0}sample.fr'.format(annotatedOutputFolder),
                            addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
def annotateFilesAfterHeurAndSelection(inputFolderPath,
                                       outputFolderPath,
                                       dumpSP=True):
    """ given a folder path, where the reference, en line and fr line are alreade selected, annotate the SPs """
    # add a slash if needed
    if inputFolderPath[-1] != u'/':
        inputFolderPath = u'{0}/'.format(inputFolderPath)
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # get the selected reference file lines
    with open(u'{0}sampleReference.Paths'.format(
            inputFolderPath)) as refPathsFile:
        referenceLines = refPathsFile.readlines()
    # get the en and fr input lines
    with open(u'{0}sample.en'.format(inputFolderPath)) as enFile:
        enLns = enFile.readlines()
    with open(u'{0}sample.fr'.format(inputFolderPath)) as frFile:
        frLns = frFile.readlines()
    with open(u'{0}scores.tsv'.format(inputFolderPath)) as scFile:
        scLns = scFile.readlines()
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(outputFolderPath)):
        # get the already seen lines
        referencePathLine = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            noNewLineChar=True)
        listOfAnnotations = utilsOs.readAllLinesFromFile(
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            noNewLineChar=True)
        # maintain only what we haven't seen
        annotatedFiles = set(referencePathLine)
        newRefLines = []
        for ind, file in enumerate(referenceLines):
            if file.replace(u'\n', u'') not in annotatedFiles:
                newRefLines.append([ind, file.replace(u'\n', u'')])
        referenceLines = newRefLines
        # print(referenceLines)
    else:
        referencePathLine = []
        listOfAnnotations = []
        referenceLines = [(ind, file.replace(u'\n', u''))
                          for ind, file in enumerate(referenceLines)]
    # print the annotator cheat sheet
    printCheatSheet()
    # open each file in EN and FR and show it in the terminal
    for tupleRef in referenceLines:
        indRef, refLn = tupleRef[0], tupleRef[1]
        print(u'############# {0} ##############'.format(
            refLn.replace(u'\n', u'')))
        # get the path for the source and target
        lnsSource = enLns if u'en-fr' in refLn else frLns
        lnsTarget = frLns if u'en-fr' in refLn else enLns
        # get the correct terminal line length
        lineLength = 137 - len(str(len(listOfAnnotations) + 1))
        # color in red the during lines
        redDuringSource = u'\033[1;31m{0}\033[0m'.format(lnsSource[indRef])
        # print the sentences
        print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
        print(u'{0} - {1}'.format(len(listOfAnnotations), lnsTarget[indRef]))
        print()
        # count the lines that take the space of 2 lines
        longLines = getNbLongLines([lnsSource[indRef], lnsTarget[indRef]],
                                   lineLength)
        # get the first part of the annotation (aligned or not)
        annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
        # make sure to have the right general annotation
        while True:
            if annotatorGeneralInput in [
                    u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1', u'1.2',
                    u'1.3', u'1.4', u'c', u'correction'
            ]:
                break
            else:
                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                annotatorGeneralInput = input(u'Repeat annotation: ')
        if annotatorGeneralInput in [u'c', u'correct']:
            annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                listOfAnnotations)
        # save to the list of annotations
        listOfAnnotations.append(float(annotatorGeneralInput))
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # erase all remainder of the previous sentences and go back up again
        for e in range(14 + longLines):
            print(u' ' * (lineLength + 4))
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # append the reference to the file
        referencePathLine.append(refLn)
        # dump the file line by line, to be sure in case of error
        # dump the reference
        utilsOs.dumpRawLines(
            referencePathLine,
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the annotation
        utilsOs.dumpRawLines(
            listOfAnnotations,
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the SP
        if dumpSP is True:
            enSent = lnsSource[indRef] if u'en-fr' in refLn else lnsTarget[
                indRef]
            frSent = lnsTarget[indRef] if u'en-fr' in refLn else lnsSource[
                indRef]
            utilsOs.appendLineToFile(enSent,
                                     u'{0}sample.en'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(frSent,
                                     u'{0}sample.fr'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(scLns[indRef],
                                     u'{0}scores.tsv'.format(outputFolderPath),
                                     addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(7, slowly=False)