Beispiel #1
0
def appendToDumpInGizaFormat(pathToEnFile, pathToFrFile, outPutPath, tokEnDict,
                             tokFrDict, spFreqDict):
    """
    Walk two line-aligned files in parallel and append every sentence pair
    to the output file as a three-line entry: the value stored in spFreqDict
    for the pair, the english line and the french line, both as returned by
    transformStringToGizaFormat.
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :param outPutPath: path to the file the entries are appended to
    :param tokEnDict: dict handed to transformStringToGizaFormat for english
    :param tokFrDict: dict handed to transformStringToGizaFormat for french
    :param spFreqDict: dict indexed by "<en line>***---***<fr line>"
    :return: None
    """
    with open(pathToEnFile) as enFile, open(pathToFrFile) as frFile:
        # the english file drives the iteration, the french one is read in lockstep
        for rawEn in enFile:
            enSent = rawEn.replace(u"\n", u"")
            frSent = frFile.readline().replace(u"\n", u"")
            # frequency of the sentence pair
            pairKey = u"{0}***---***{1}".format(enSent, frSent)
            pairFreq = spFreqDict[pairKey]
            # both sides of the pair in their transformed form
            enIds = transformStringToGizaFormat(enSent, tokEnDict, u"en",
                                                pathToEnFile)
            frIds = transformStringToGizaFormat(frSent, tokFrDict, u"fr",
                                                pathToFrFile)
            # dump the three-line entry
            gizaEntry = u"{0}\n{1}\n{2}".format(pairFreq, enIds, frIds)
            utilsOs.appendLineToFile(gizaEntry, outPutPath, addNewLine=True)
def delEmptyLinesAndDump(inPath, outPath):
    """ reads in parallel the extracted.fr, extracted.en, reference.tsv and scores.tsv
    files found under the inPath prefix and appends their lines to the files of the same
    name under the outPath prefix, leaving out every row whose french or english line
    is made only of spaces, tabs and newlines """
    templates = (u'{0}extracted.fr', u'{0}extracted.en', u'{0}reference.tsv', u'{0}scores.tsv')
    frIn, enIn, refIn, scIn = [template.format(inPath) for template in templates]
    frOut, enOut, refOut, scOut = [template.format(outPath) for template in templates]
    with open(frIn) as ff, open(enIn) as ef, open(refIn) as rf, open(scIn) as sf:
        # the french file drives the iteration, the 3 others are read in lockstep
        for frLn in ff:
            enLn = ef.readline()
            refLn = rf.readline()
            scLn = sf.readline()
            # nothing is left once the spaces, tabs and newlines are taken away: skip the row
            if not frLn.strip(u'\n\t ') or not enLn.strip(u'\n\t '):
                continue
            utilsOs.appendLineToFile(frLn, frOut, addNewLine=False)
            utilsOs.appendLineToFile(enLn, enOut, addNewLine=False)
            utilsOs.appendLineToFile(refLn, refOut, addNewLine=False)
            utilsOs.appendLineToFile(scLn, scOut, addNewLine=False)
    return None
def randomSPselectionForAnnotation(enPath, frPath, refPath, scPath, outputFolderPath, nbSp=150):
    """ draws nbSp distinct line indexes and, for each of them, appends the matching english,
    french, reference and score lines to sample.en, sample.fr, sampleReference.Paths and
    scores.tsv in the output folder (previous versions of these 4 files are deleted first) """
    # make sure the folder path ends in a slash
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    outEnPath = u'{0}sample.en'.format(outputFolderPath)
    outFrPath = u'{0}sample.fr'.format(outputFolderPath)
    outRefPath = u'{0}sampleReference.Paths'.format(outputFolderPath)
    outScPath = u'{0}scores.tsv'.format(outputFolderPath)
    # start from a clean slate
    for previousOutput in (outEnPath, outFrPath, outRefPath, outScPath):
        utilsOs.deleteAFile(previousOutput)
    # only the number of reference lines is needed, so count them without keeping them
    with open(refPath) as refFile:
        lengthRef = sum(1 for _ in refFile)
    alreadyDrawn = set()
    for _ in range(nbSp):
        # draw again until we get an index we have not used yet
        while True:
            rdmInd = getQuasiRandomIndexForcingOnSpecificRange(lengthRef, rangeMin=0, rangeMax=200000)
            if rdmInd not in alreadyDrawn:
                break
        alreadyDrawn.add(rdmInd)
        # fetch the 4 lines for that index
        refLn, enLn, frLn, scLn = getEnFrLnsForIndex(rdmInd, refPath, enPath, frPath, scPath)
        # dump them in the output folder
        utilsOs.appendLineToFile(enLn, outEnPath, addNewLine=False)
        utilsOs.appendLineToFile(frLn, outFrPath, addNewLine=False)
        utilsOs.appendLineToFile(refLn, outRefPath, addNewLine=False)
        utilsOs.appendLineToFile(scLn, outScPath, addNewLine=False)
    return None
def changeStructure():
    """ for every .tmx annotation file found under ./002manuallyAnnotated/oldOnes/MISALIGNED/,
    appends, line by line, the reference (original path and line index), the annotation and
    the matching english and french lines of the original corpus to the 4 sample*.tsv files
    of ./002manuallyAnnotated/ """
    referenceOut = u'./002manuallyAnnotated/sampleReference.tsv'
    annotationOut = u'./002manuallyAnnotated/sampleAnnotation.tsv'
    enOut = u'./002manuallyAnnotated/sampleEn.tsv'
    frOut = u'./002manuallyAnnotated/sampleFr.tsv'
    annotationFiles = utilsOs.goDeepGetFiles(
        u'./002manuallyAnnotated/oldOnes/MISALIGNED/', format=u'.tmx')
    for annotationPath in annotationFiles:
        # rebuild the path of the original corpus file from the annotation path
        pathAfterFlag = annotationPath.split(u'MISALIGNED/')[-1]
        origPath = u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/' + pathAfterFlag
        with open(annotationPath) as annotFile:
            annotLines = annotFile.readlines()
        with open(origPath + u'.en') as src:
            srcLines = src.readlines()
        with open(origPath + u'.fr') as trgt:
            trgtLines = trgt.readlines()
        for lineNb, annotation in enumerate(annotLines):
            # get both sentences before dumping anything for this line
            srcLn = srcLines[lineNb]
            trgtLn = trgtLines[lineNb]
            # reference, annotation, then the sentence pair
            toDump = (
                (u'{0}\t{1}\n'.format(origPath, lineNb), referenceOut),
                (annotation, annotationOut),
                (srcLn, enOut),
                (trgtLn, frOut),
            )
            for content, destination in toDump:
                utilsOs.appendLineToFile(content, destination, addNewLine=False)
def dumpReferenceToLangFiles(listOfRef, outputGeneralFilePath):
    """ given a list of references (original path, tab, line index),
    appends the referenced english and french lines to 2 language separated
    files named after the general output path (without its .tsv) """
    basePath = outputGeneralFilePath.replace(u'.tsv', u'')
    enOutputPath = u'{0}.en'.format(basePath)
    frOutputPath = u'{0}.fr'.format(basePath)
    for ref in listOfRef:
        # first field: path of the original file without extension, second field: line index
        refFields = ref.split(u'\t')
        with open(u'{0}.en'.format(refFields[0])) as enFile:
            enLines = enFile.readlines()
        with open(u'{0}.fr'.format(refFields[0])) as frFile:
            frLines = frFile.readlines()
        # pick the referenced line of each language and remove its newline
        enLine = enLines[int(refFields[1])].replace(u'\n', u'')
        frLine = frLines[int(refFields[1])].replace(u'\n', u'')
        utilsOs.appendLineToFile(enLine, enOutputPath, addNewLine=True)
        utilsOs.appendLineToFile(frLine, frOutputPath, addNewLine=True)
Beispiel #6
0
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """
    build (or reload from json) the token frequency dicts, the sentence pair
    frequency dict and the token id dicts, write the 2 vocabulary files and
    then dump the corpus through appendToDumpInGizaFormat
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :param overwrite: if True the dicts are recomputed and the 3 output files
        are recreated empty before being filled; otherwise the dicts are only
        recomputed when the english json dict does not exist yet
    :return: tuple with the paths to the english vocabulary, the french
        vocabulary, the giza format corpus, the english dict, the french dict
        and the sentence pair dict
    """

    def sortByFreqThenToken(freqDict):
        # highest frequency first, ties broken by the token itself
        return sorted(freqDict.items(),
                      key=lambda kv: (kv[1], kv[0]),
                      reverse=True)

    # prepare the output paths
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(
        pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")
    # compute everything from the corpus when asked to, or when no dict was dumped before
    mustCompute = overwrite is True or not os.path.isfile(outputEnDictPath)
    if mustCompute:
        # token frequencies and sentence pair frequencies
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        # the id dicts are made from the sorted tokens
        orderedKeysValuesEn = sortByFreqThenToken(enTokFreqDict)
        orderedKeysValuesFr = sortByFreqThenToken(frTokFreqDict)
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # keep a json copy of every dict
        dictsAndPaths = (
            (enTokFreqDict, outputEnDictPath),
            (frTokFreqDict, outputFrDictPath),
            (spFreqDict, outputSpDictPath),
            (enIdDict, outputEnIdDictPath),
            (frIdDict, outputFrIdDictPath),
        )
        for dictToDump, jsonPath in dictsAndPaths:
            utilsOs.dumpDictToJsonFile(dictToDump, jsonPath, overwrite)
    else:
        # reuse what was dumped by a previous run
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        orderedKeysValuesEn = sortByFreqThenToken(enTokFreqDict)
        orderedKeysValuesFr = sortByFreqThenToken(frTokFreqDict)
    # recreate the vocabulary files (with the UNK header) and the corpus file
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)
    # one vocabulary line per token, numbered from 2 since 1 is taken by UNK
    vocabularies = ((orderedKeysValuesEn, outputEnPath),
                    (orderedKeysValuesFr, outputFrPath))
    for orderedKeysValues, vocPath in vocabularies:
        for tokId, (token, freq) in enumerate(orderedKeysValues, start=2):
            vocLine = u"{0}\t{1}\t{2}".format(tokId, token, freq)
            utilsOs.appendLineToFile(vocLine, vocPath, addNewLine=True)
    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile,
                             outputPathGizaFormatCorpus, enIdDict, frIdDict,
                             spFreqDict)
    return (outputEnPath, outputFrPath, outputPathGizaFormatCorpus,
            outputEnDictPath, outputFrDictPath, outputSpDictPath)
def annotateFiles(listOfFilesPath=None,
                  annotatedOutputFolder=u'./002manuallyAnnotated/',
                  dumpSP=True):
    """
    given a list of paths, manually show and annotate the sentence pairs
    :param listOfFilesPath: list of file paths (without the language extension),
        or a single path as a string, or the path to a .json file holding the
        paths; if None, 100 documents are randomly selected
    :param annotatedOutputFolder: folder prefix (ending in a slash) where
        sampleReference.tsv, sampleAnnotation.tsv, sample.en and sample.fr are written
    :param dumpSP: if True, the annotated sentences are also appended to
        sample.en and sample.fr
    :return: None
    """
    referencePathLine = []
    listOfAnnotations = []
    # get the list containing the file paths
    if listOfFilesPath is None:
        listOfFilesPath = randomlySelectNDocsFromPath(
            b000path.getBtFolderPath(flagFolder=None), n=100)
        makeLocalFolderPaths(listOfFilesPath)
    elif isinstance(listOfFilesPath, str):
        if u'.json' in listOfFilesPath:
            listOfFilesPath = utilsOs.openJsonFileAsDict(listOfFilesPath)
        else:
            listOfFilesPath = [listOfFilesPath]
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder)):
        refLines = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(annotatedOutputFolder),
            noNewLineChar=True)
        annotatedFiles = set([line.split(u'\t')[0] for line in refLines])
        listOfFilesPath = [
            file for file in listOfFilesPath if file not in annotatedFiles
        ]
    # print the annotator cheat sheet
    print(""""0 - badly aligned
        \n\t0.0 - AMPLIFICATION: compensation, description, repetition or lang tendency to hypergraphy
        \n\t0.1 - ELISION: absence, omission, reduction or lang tendency to micrography
        \n\t0.2 - DISPLACEMENT: modification of the line order also modifying the order of the following lines
        \n\t0.3 - MISALIGNED and FOIBLE: alignment and quality errors
        \n1 - well aligned
        \n\t1.0 - ALIGNED and GOOD QUALITY: is aligned and shows no evident sing of translation imperfections 
        \n\t1.1 - FOIBLE: imperfection in the translation quality""")
    # open each file in EN and FR and show it in the terminal
    for filePath in listOfFilesPath:
        print(u'############# {0} ##############'.format(
            filePath.replace(
                u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/', u'')))
        # get the path for the source and target
        fileSourcePath = u'{0}.fr'.format(
            filePath) if u'fr-en' in filePath else u'{0}.en'.format(filePath)
        fileTargetPath = u'{0}.en'.format(
            filePath) if u'fr-en' in filePath else u'{0}.fr'.format(filePath)
        with open(fileSourcePath) as fileSource:
            with open(fileTargetPath) as fileTarget:
                # show the context of the annotated sentence
                beforeSentSource = fileSource.readline()
                duringSentSource = fileSource.readline()
                beforeSentTarget = fileTarget.readline()
                duringSentTarget = fileTarget.readline()
                # annotate the first sentence pair
                listOfAnnotations = annotateFirstSP(beforeSentSource,
                                                    duringSentSource,
                                                    beforeSentTarget,
                                                    duringSentTarget,
                                                    listOfAnnotations,
                                                    lineLength=137)
                # save the reference
                # if the filepath is the reference
                if u'burtrad' in filePath:
                    referencePathLine.append(u'{0}\t{1}'.format(filePath, 0))
                # otherwise we get it from a reference file
                else:
                    with open(u'{0}.tsv'.format(filePath)) as refFile:
                        refLns = [
                            ln.replace(u'\n', u'')
                            for ln in refFile.readlines()
                        ]
                    referencePathLine.append(refLns[0])
                # dump the first SP
                if dumpSP is True:
                    enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                    frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                    utilsOs.appendLineToFile(
                        enSent,
                        u'{0}sample.en'.format(annotatedOutputFolder),
                        addNewLine=False)
                    utilsOs.appendLineToFile(
                        frSent,
                        u'{0}sample.fr'.format(annotatedOutputFolder),
                        addNewLine=False)
                duringIndex = 1
                # for each line
                while duringSentSource or duringSentTarget:
                    # get the correct terminal line length
                    lineLength = 137 - len(str(len(listOfAnnotations) + 1))
                    # get the sentences
                    afterSentSource = fileSource.readline()
                    afterSentTarget = fileTarget.readline()
                    # color in red the during lines
                    redDuringSource = u'\033[1;31m{0}\033[0m'.format(
                        duringSentSource)
                    redDuringTarget = u'\033[1;31m{0}\033[0m'.format(
                        duringSentTarget)
                    # print the sentences
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) - 1, beforeSentTarget))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringSource))
                    print(u'{0} - {1}'.format(len(listOfAnnotations),
                                              redDuringTarget))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentSource))
                    print(u'{0} - {1}'.format(
                        len(listOfAnnotations) + 1, afterSentTarget))
                    print()
                    # count if the lines that take the space of 2 lines
                    longLines = getNbLongLines([
                        beforeSentSource, beforeSentTarget, duringSentSource,
                        duringSentTarget, afterSentSource, afterSentTarget
                    ], lineLength)
                    # get the first part of the annotation (aligned or not)
                    annotatorGeneralInput = input(
                        u'Aligned-Misaligned annotation: ')
                    # make sure to have the right general annotation
                    while True:
                        if annotatorGeneralInput in [
                                u'0', u'1', u'0.0', u'0.1', u'0.2', u'0.3',
                                u'1.0', u'1.1', u'c', u'correct'
                        ]:
                            break
                        else:
                            utilsOs.moveUpAndLeftNLines(1, slowly=False)
                            annotatorGeneralInput = input(
                                u'Repeat annotation: ')
                    if annotatorGeneralInput in [u'c', u'correct']:
                        annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                            listOfAnnotations)
                    # if we still need to specify what type of alignment or misalignment
                    if annotatorGeneralInput in [u'0', u'1']:
                        utilsOs.moveUpAndLeftNLines(1, slowly=False)
                        # get the second part of the annotation (aligned or not)
                        annotatorSpecificInput = input(
                            u'Specific type annotation: ')
                        # the general annotation is a string (it comes from input()),
                        # so it must be compared with u'0' and not with the int 0,
                        # otherwise the sub-types 2 and 3 of a misalignment are never accepted
                        typeAnswers = [
                            u'0', u'1', u'2', u'3'
                        ] if annotatorGeneralInput == u'0' else [u'0', u'1']
                        # make sure to have the right specific annotation
                        while True:
                            if annotatorSpecificInput in typeAnswers:
                                break
                            else:
                                utilsOs.moveUpAndLeftNLines(1, slowly=False)
                                annotatorSpecificInput = input(
                                    u'Repeat type annotation: ')
                        # save to the list of annotations
                        listOfAnnotations.append(
                            float(u'{0}.{1}'.format(annotatorGeneralInput,
                                                    annotatorSpecificInput)))
                    # if the right answer was given in the right format right away
                    else:
                        # save to the list of annotations
                        listOfAnnotations.append(float(annotatorGeneralInput))
                    # remove the lines from the terminal before getting to the next pair
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # erase all remainder of the previous sentences and go back up again
                    for e in range(14 + longLines):
                        print(u' ' * (lineLength + 4))
                    utilsOs.moveUpAndLeftNLines(14 + longLines, slowly=False)
                    # next line source
                    beforeSentSource = duringSentSource
                    duringSentSource = afterSentSource
                    # next line target
                    beforeSentTarget = duringSentTarget
                    duringSentTarget = afterSentTarget
                    # append the reference to the file
                    # if the filepath is the reference
                    if u'burtrad' in filePath:
                        referencePathLine.append(u'{0}\t{1}'.format(
                            filePath, duringIndex))
                    # otherwise we get it from a reference file
                    else:
                        with open(u'{0}.tsv'.format(filePath)) as refFile:
                            refLns = [
                                ln.replace(u'\n', u'')
                                for ln in refFile.readlines()
                            ]
                        referencePathLine.append(refLns[duringIndex])
                    # add 1 to index
                    duringIndex += 1
                    # dump the file line by line, to be sure in case of error
                    # dump the reference
                    utilsOs.dumpRawLines(referencePathLine,
                                         u'{0}sampleReference.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the annotation
                    utilsOs.dumpRawLines(listOfAnnotations,
                                         u'{0}sampleAnnotation.tsv'.format(
                                             annotatedOutputFolder),
                                         addNewline=True,
                                         rewrite=True)
                    # dump the SP (after the shift above, the "before" sentences
                    # are the pair that has just been annotated)
                    if dumpSP is True:
                        enSent = beforeSentSource if u'.en' in fileSourcePath else beforeSentTarget
                        frSent = beforeSentTarget if u'.en' in fileSourcePath else beforeSentSource
                        utilsOs.appendLineToFile(
                            enSent,
                            u'{0}sample.en'.format(annotatedOutputFolder),
                            addNewLine=False)
                        utilsOs.appendLineToFile(
                            frSent,
                            u'{0}sample.fr'.format(annotatedOutputFolder),
                            addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(2, slowly=False)
Beispiel #8
0
def launchForOneDay(tokLimit=4000,
                    outputFolderPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/",
                    coffeeBreak=1650):
    """
    launches the deepL bot, one profile after the other, translating sentence
    pairs until each profile gets close to its token limit
    :param tokLimit: maximum number of tokens to treat with each profile
    :param outputFolderPath: path to the folder where will be output the files
    :param coffeeBreak: time in seconds after which the session is closed and
        a new one is opened (None for no break)
    :return: (tokCount, iterCount): number of tokens translated and number of
        sentence pairs treated with the last profile
    """
    start = utilsOs.countTime()
    # path to the referencer, indicating where we left off: path and last index worked
    referencerPath = u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"
    deepLUrl = u"https://www.deepl.com/translator"
    mUser, mPass, sUser, sPass = b000path.getDeepLProfileInfo()

    def openLoggedSession(userName, password):
        # start a browser on the deepL page, wait a little and log in
        browser = webdriver.Firefox()
        browser.get(deepLUrl)
        time.sleep(random.uniform(1.3, 3.1))
        return authentificateBtUseSelenium(userName, password, browser)

    # the s profile goes first, then the m profile
    for user, passw in ((sUser, sPass), (mUser, mPass)):
        tokCount = 0
        iterCount = 0
        session = openLoggedSession(user, passw)
        # stop 10 tokens short of the limit
        while tokCount < (tokLimit-10):
            # get the next sp and translate it both ways
            sp, filePath, fileIndex, refLns = getANewSpWhereWeLeftOff(referencerPath)
            session, nbOfTok, enFrTranslAndAlt, frEnTranslAndAlt, timeEn, timeFr = translateSpGetResult(session, sp)
            # remember where we are
            utilsOs.dumpRawLines(refLns, referencerPath, addNewline=False, rewrite=True)
            # original sp
            utilsOs.appendLineToFile(sp[0], u"{0}originalSent.en".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(sp[1], u"{0}originalSent.fr".format(outputFolderPath), addNewLine=True)
            # translation and variants
            utilsOs.appendLineToFile(enFrTranslAndAlt, u"{0}translated.en2fr".format(outputFolderPath), addNewLine=True)
            utilsOs.appendLineToFile(frEnTranslAndAlt, u"{0}translated.fr2en".format(outputFolderPath), addNewLine=True)
            # reference
            referenceLine = u"{0}\t{1}\n".format(filePath, fileIndex)
            utilsOs.appendLineToFile(referenceLine, u"{0}reference.tsv".format(outputFolderPath), addNewLine=False)
            # timestamps
            stampEn = u"{0}\tlocal time: {1}".format(timeEn, transformTimeToLocalTime(timeEn))
            utilsOs.appendLineToFile(stampEn, u"{0}timestamp.en".format(outputFolderPath), addNewLine=True)
            stampFr = u"{0}\tlocal time: {1}".format(timeFr, transformTimeToLocalTime(timeFr))
            utilsOs.appendLineToFile(stampFr, u"{0}timestamp.fr".format(outputFolderPath), addNewLine=True)
            # update the counters
            tokCount += nbOfTok
            iterCount += 1
            # time for a break: close the session, wait, and come back with a new one
            if coffeeBreak is not None and utilsOs.countTime(start) >= coffeeBreak:
                session.close()
                time.sleep(random.uniform(60, 80))
                start = utilsOs.countTime()
                session = openLoggedSession(user, passw)
            time.sleep(random.uniform(1.0, 1.5))
        # done with this profile
        session.close()
        time.sleep(random.uniform(10.0, 15.0))
    return tokCount, iterCount
Beispiel #9
0
 # build the path of each tmx file by prefixing its name with the command path
 tmxFilePaths = [
     u'{0}{1}'.format(commandPath, fn)
     for fn in tmxFileNames
 ]
 # get the metadata from the tmxFile
 for tmxData, tmxPath in zip(
         getTmxFlaggedData(tmxFilePaths), tmxFilePaths):
     # if there is no flagging data in the tmx, pass
     if tmxData is None:
         pass
     # get and dump the flagging data
     else:
         (flagType, index, enSent, frSent, segmentNb,
          flagDate) = tmxData
         # english and french sentences of the flagged pair
         utilsOs.appendLineToFile(
             enSent,
             u'{0}problematic/extracted.en'.format(
                 btExtractPath))
         utilsOs.appendLineToFile(
             frSent,
             u'{0}problematic/extracted.fr'.format(
                 btExtractPath))
         # reference: command path and index, tab separated
         utilsOs.appendLineToFile(
             u'{0}\t{1}'.format(commandPath, index),
             u'{0}problematic/referenceDC24Corpus.tsv'.
             format(btExtractPath))
         # remaining metadata: flag type, segment number and flag date
         utilsOs.appendLineToFile(
             u'{0}\t{1}\t{2}'.format(
                 flagType, segmentNb, flagDate),
             u'{0}problematic/other.tsv'.format(
                 btExtractPath))
         # map the DC24 to the archive1 files
def makeAMixOf2Annotations(inputAnnotPath1, inputAnnotPath2, outputMixPath):
    """
    Given 2 annotation folders, appends to the mix folder a random half of
    the rows of each one (the same row indexes are taken from the 6 files
    of a folder, so the files stay aligned).
    :param inputAnnotPath1: path to the first annotation folder
    :param inputAnnotPath2: path to the second annotation folder
    :param outputMixPath: path to the mix annotation folder
    :return: None
    """
    # the 6 files every annotation folder is expected to hold
    fileTemplates = (
        u'{0}sample.en',
        u'{0}sample.fr',
        u'{0}sampleAnnotation.tsv',
        u'{0}sampleReference.tsv',
        u'{0}scores.tsv',
        u'{0}scoresAndMetaData.tsv',
    )
    # make sure the paths end in a slash
    if inputAnnotPath1[-1] != u'/':
        inputAnnotPath1 = u'{0}/'.format(inputAnnotPath1)
    if inputAnnotPath2[-1] != u'/':
        inputAnnotPath2 = u'{0}/'.format(inputAnnotPath2)
    if outputMixPath[-1] != u'/':
        outputMixPath = u'{0}/'.format(outputMixPath)
    for inPath in (inputAnnotPath1, inputAnnotPath2):
        # read the lines of the 6 files of the folder
        linesPerFile = []
        for template in fileTemplates:
            with open(template.format(inPath)) as inFile:
                linesPerFile.append(inFile.readlines())
        # the english sample gives the number of rows
        nbRows = len(linesPerFile[0])
        pickedRows = set()
        while len(pickedRows) < nbRows // 2:
            rowInd = randint(0, nbRows - 1)
            # already taken, draw again
            if rowInd in pickedRows:
                continue
            pickedRows.add(rowInd)
            # dump that row of every file to the file of the same name in the mix folder
            for template, lines in zip(fileTemplates, linesPerFile):
                utilsOs.appendLineToFile(lines[rowInd],
                                         template.format(outputMixPath),
                                         addNewLine=False)
def annotateFilesAfterHeurAndSelection(inputFolderPath,
                                       outputFolderPath,
                                       dumpSP=True):
    """ Interactively annotate the already selected sentence pairs (SPs).

    Reads `sampleReference.Paths`, `sample.en`, `sample.fr` and `scores.tsv`
    from the input folder, shows each pair in the terminal and asks the
    annotator for an alignment annotation. After every answer the output
    files `sampleReference.tsv` and `sampleAnnotation.tsv` are rewritten, so
    an interrupted session can be resumed: references already present in the
    output `sampleReference.tsv` are not shown again.

    :param inputFolderPath: folder containing the selected reference, en, fr
        and score files (a trailing slash is added if missing)
    :param outputFolderPath: folder where the annotation files are dumped
        (a trailing slash is added if missing)
    :param dumpSP: when True, the en line, fr line and score line of each
        annotated pair are also appended to `sample.en`, `sample.fr` and
        `scores.tsv` in the output folder
    """
    # add a slash if needed
    if inputFolderPath[-1] != u'/':
        inputFolderPath = u'{0}/'.format(inputFolderPath)
    if outputFolderPath[-1] != u'/':
        outputFolderPath = u'{0}/'.format(outputFolderPath)
    # answers accepted from the annotator; the correction commands are kept
    # in a single place so the validation and the dispatch cannot disagree
    correctionCommands = (u'c', u'correction')
    validAnnotations = (u'0', u'1', u'0.0', u'0.1', u'0.2', u'1.0', u'1.1',
                        u'1.2', u'1.3', u'1.4') + correctionCommands
    # get the selected reference file lines
    with open(u'{0}sampleReference.Paths'.format(
            inputFolderPath)) as refPathsFile:
        rawReferenceLines = refPathsFile.readlines()
    # get the en and fr input lines
    with open(u'{0}sample.en'.format(inputFolderPath)) as enFile:
        enLns = enFile.readlines()
    with open(u'{0}sample.fr'.format(inputFolderPath)) as frFile:
        frLns = frFile.readlines()
    with open(u'{0}scores.tsv'.format(inputFolderPath)) as scFile:
        scLns = scFile.readlines()
    # get rid of the files we have already annotated
    if utilsOs.theFileExists(
            u'{0}sampleReference.tsv'.format(outputFolderPath)):
        # get the already seen lines
        referencePathLine = utilsOs.readAllLinesFromFile(
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            noNewLineChar=True)
        listOfAnnotations = utilsOs.readAllLinesFromFile(
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            noNewLineChar=True)
    else:
        referencePathLine = []
        listOfAnnotations = []
    # maintain only what we haven't seen, remembering the original index of
    # each reference so it can be matched with the en/fr/score lines
    annotatedFiles = set(referencePathLine)
    referenceLines = []
    for ind, rawRefLn in enumerate(rawReferenceLines):
        cleanRefLn = rawRefLn.replace(u'\n', u'')
        if cleanRefLn not in annotatedFiles:
            referenceLines.append((ind, cleanRefLn))
    # print the annotator cheat sheet
    printCheatSheet()
    # open each file in EN and FR and show it in the terminal
    for indRef, refLn in referenceLines:
        print(u'############# {0} ##############'.format(refLn))
        # get the source and target lines depending on the direction
        isEnToFr = u'en-fr' in refLn
        lnsSource = enLns if isEnToFr else frLns
        lnsTarget = frLns if isEnToFr else enLns
        # get the correct terminal line length
        lineLength = 137 - len(str(len(listOfAnnotations) + 1))
        # color the source line in red
        redDuringSource = u'\033[1;31m{0}\033[0m'.format(lnsSource[indRef])
        # print the sentences
        print(u'{0} - {1}'.format(len(listOfAnnotations), redDuringSource))
        print(u'{0} - {1}'.format(len(listOfAnnotations), lnsTarget[indRef]))
        print()
        # count the lines that take the space of 2 lines
        longLines = getNbLongLines([lnsSource[indRef], lnsTarget[indRef]],
                                   lineLength)
        # get the first part of the annotation (aligned or not)
        annotatorGeneralInput = input(u'Aligned-Misaligned annotation: ')
        # keep asking until we get a recognised annotation
        while annotatorGeneralInput not in validAnnotations:
            utilsOs.moveUpAndLeftNLines(1, slowly=False)
            annotatorGeneralInput = input(u'Repeat annotation: ')
        # every accepted correction command must trigger the correction,
        # otherwise it would reach the float() conversion below and crash
        if annotatorGeneralInput in correctionCommands:
            annotatorGeneralInput, listOfAnnotations = correctionToAnnotation(
                listOfAnnotations)
        # save to the list of annotations
        listOfAnnotations.append(float(annotatorGeneralInput))
        # remove the lines from the terminal before getting to the next pair
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # erase all remainder of the previous sentences and go back up again
        for _ in range(14 + longLines):
            print(u' ' * (lineLength + 4))
        utilsOs.moveUpAndLeftNLines(7 + longLines, slowly=False)
        # append the reference to the file
        referencePathLine.append(refLn)
        # dump the file line by line, to be sure in case of error
        # dump the reference
        utilsOs.dumpRawLines(
            referencePathLine,
            u'{0}sampleReference.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the annotation
        utilsOs.dumpRawLines(
            listOfAnnotations,
            u'{0}sampleAnnotation.tsv'.format(outputFolderPath),
            addNewline=True,
            rewrite=True)
        # dump the SP
        if dumpSP is True:
            # whatever the direction, the en sentence always comes from the
            # en file and the fr sentence from the fr file
            utilsOs.appendLineToFile(enLns[indRef],
                                     u'{0}sample.en'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(frLns[indRef],
                                     u'{0}sample.fr'.format(outputFolderPath),
                                     addNewLine=False)
            utilsOs.appendLineToFile(scLns[indRef],
                                     u'{0}scores.tsv'.format(outputFolderPath),
                                     addNewLine=False)
        # clear part of terminal
        utilsOs.moveUpAndLeftNLines(7, slowly=False)