Example #1
def prepareOutPutFile(pathToFile, fileName=u"source.vcb"):
    output = pathToFile.split(u"/")
    output = u"/".join(output[:-1]) if output[-1] != u"" else u"/".join(
        output[:-2])
    output = u"{0}/GIZA/".format(output)
    utilsOs.createEmptyFolder(output)
    output = u"{0}{1}".format(output, fileName)
    return output
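A hedged usage sketch, assuming this module and its utilsOs helper are importable; the input path below is hypothetical:
# hypothetical input; the function creates a GIZA/ folder next to the file
# and returns the path of the vocabulary file to be written there
vcbPath = prepareOutPutFile(u"/data/corpus/en-fr/file.en", fileName=u"source.vcb")
# vcbPath == u"/data/corpus/en-fr/GIZA/source.vcb"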
def makeLocalFolderPaths(listOfFilePaths):
    """ given a list of file paths, creates the equivalent in the local path """
    for filePath in listOfFilePaths:
        localFilePath = filePath.replace(
            u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/',
            u'./002manuallyAnnotated/')
        localFileList = localFilePath.split(u'/')
        folderPath = localFilePath.replace(localFileList[-1], u'')
        utilsOs.createEmptyFolder(folderPath)
Example #3
def launchTmop(inputFilePath, pharaohFilePath, tokFilePath, outputFolderPath,
               **kwargs):
    utilsOs.createEmptyFolder(outputFolderPath)
    ##########
    # get and modify the config file
    configDict = getConfigTemplate()
    configDict["options"]["input file"] = inputFilePath
    configDict["options"]["align file"] = pharaohFilePath
    configDict["options"]["token file"] = tokFilePath
    configDict["options"]["output folder"] = outputFolderPath
    ##########
    # add a policy
    configDict["policies"].append(["FourNo", "on"])
    configDict["policies"].append(["ThreeNo", "on"])
    configDict["policies"].append(["TwoNo", "on"])
    ##########
    # turn off certain heuristics to launch only the interesting ones
    # # configDict["filters"][0][1] = "off" # "SampleFilter"
    # configDict["filters"][1][1] = "off" # "LengthStats"
    # # configDict["filters"][2][1] = "off" # "LengthRatio"
    # # configDict["filters"][3][1] = "off" # "ReverseLengthRatio"
    # # configDict["filters"][4][1] = "off" # "WordRatio"
    # # configDict["filters"][5][1] = "off" # "ReverseWordRatio"
    # # configDict["filters"][6][1] = "off" # "WordLength"
    # # configDict["filters"][7][1] = "off" # "TagFinder"
    # # configDict["filters"][8][1] = "off" # "RepeatedChars"
    # # configDict["filters"][9][1] = "off" # "RepeatedWords"
    # # configDict["filters"][10][1] = "off" # "Lang_Identifier"
    # configDict["filters"][11][1] = "off" # "AlignedProportion"
    # # configDict["filters"][12][1] = "off" # "BigramAlignedProportion"
    # configDict["filters"][13][1] = "off" # "NumberOfUnalignedSequences"
    # configDict["filters"][14][1] = "off" # "LongestAlignedSequence"
    # configDict["filters"][15][1] = "off" # "LongestUnalignedSequence"
    # configDict["filters"][16][1] = "off" # "AlignedSequenceLength"
    # configDict["filters"][17][1] = "off" # "UnalignedSequenceLength"
    # configDict["filters"][18][1] = "off" # "FirstUnalignedWord"
    # configDict["filters"][19][1] = "off" # "LastUnalignedWord"
    # configDict["filters"][20][1] = "off" # "WE_Average"
    # configDict["filters"][21][1] = "off" # "WE_Median"
    # configDict["filters"][22][1] = "off" # "WE_BestAlignScore"
    # configDict["filters"][23][1] = "off" # "WE_ScoreOtherAlignment"
    # configDict["filters"][24][1] = "off" # "WE_ScoreAlign_BestForRest"
    for k, v in kwargs.items():
        configDict["options"][k] = v
    # dump the config.json file
    tmopFolder = "/data/rali5/Tmp/alfonsda/workRali/004tradBureau/TMOP-master"
    utilsOs.dumpDictToJsonFile(configDict,
                               "{0}/config.json".format(tmopFolder),
                               overwrite=True)
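For reference, a hedged sketch of the dictionary shape getConfigTemplate() presumably returns, inferred only from the keys accessed above; the concrete filter names and defaults are assumptions:
# assumed structure, for illustration only
configTemplateSketch = {
    "options": {
        "input file": "",
        "align file": "",
        "token file": "",
        "output folder": "",
    },
    # each policy entry is a [name, "on"/"off"] pair
    "policies": [],
    # each filter entry is a [name, "on"/"off"] pair, indexed as in the comments above
    "filters": [["SampleFilter", "on"], ["LengthStats", "on"], ["LengthRatio", "on"]],
}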
                        time.sleep(1)
                        dwnldButton = dwnldLine.find_elements(
                            By.TAG_NAME, u'a')[0]
                        time.sleep(0.8)
                        dwnldButton.click()
                        # rename the downloaded file
                        oldPathAndName = u'{0}{1}'.format(
                            tempPath, listOfNames[indexLine][0])
                        newPathAndName = u'{0}{1}'.format(
                            tempPath, listOfNames[indexLine][1])
                        try:
                            os.rename(oldPathAndName, newPathAndName)
                            # organize the file in the right folder
                            outputFolderPath = u'{0}{1}/{2}/'.format(
                                tempPath, clntName, contextDirection)
                            utilsOs.createEmptyFolder(outputFolderPath)
                            shutil.move(
                                newPathAndName,
                                u'{0}{1}'.format(outputFolderPath,
                                                 listOfNames[indexLine][1]))
                            # save it to the deja vu docs list
                            dejavuDocs.add(listOfNames[indexLine][1])
                        except FileNotFoundError:
                            pass

                ##################################################
                # get the download links
                # englishDownloadLink = driver.find_element_by_id('context-document-en')
                # frenchDownloadLink = driver.find_element_by_id('context-document-fr')
                #
                # englishDownloadLink.click()
Example #5
def applyMgiza(mgizaMasterEnvPath, pathToEnFile, pathToFrFile, overwrite=True):
    """
    Use Mgiza++ on the bilingual files
    :param pathToEnFile:
    :param pathToFrFile:
    :param overwrite:
    :return:
    """
    # make sure the mgiza environment folder is right
    mgizaSplit = mgizaMasterEnvPath[:-1] if mgizaMasterEnvPath[
        -1] == u"/" else mgizaMasterEnvPath
    mgizaMasterEnvPath = u"{0}/".format(
        mgizaMasterEnvPath
    ) if mgizaMasterEnvPath[-1] != u"/" else mgizaMasterEnvPath
    mgizaSplit = mgizaSplit.split(u"/")
    if mgizaSplit[-1] == u"mgizapp":
        pass
    elif mgizaSplit[-1] == u"mgiza-master":
        mgizaMasterEnvPath = u"{0}mgizapp/".format(mgizaMasterEnvPath)
    # make paths to specific Mgiza tool scripts to use in terminal
    mgizaCom = u"{0}bin/mgiza".format(mgizaMasterEnvPath)
    mkclsCom = u"{0}bin/mkcls".format(mgizaMasterEnvPath)
    snt2coocCom = u"{0}bin/snt2cooc".format(mgizaMasterEnvPath)
    # make the vocabulary, sentence and frequency files
    vcbEnPath, vcbFrPath, sentPath, enFreqPath, frFreqPath, spFreqPath = reformatFilesPreGiza(
        pathToEnFile, pathToFrFile, overwrite)
    # generalPath = u"{0}MGIZA/".format(sentPath.replace(u"sentenceFile.giza", ""))
    generalPath = u"{0}".format(sentPath.replace(u"sentenceFile.giza", ""))
    utilsOs.createEmptyFolder(generalPath)
    # configure the nb of cpus
    subprocess.run([
        mgizaCom,
        u"{0}119-12-05.194630.alfonsda.gizacfg".format(mgizaMasterEnvPath),
        u"-ncpus", u"1"
    ])
    # make classes for hmm and ibm4 models
    classesEnPath = u"{0}{1}.classes".format(generalPath,
                                             vcbEnPath.split(u"/")[-1])
    classesFrPath = u"{0}{1}.classes".format(generalPath,
                                             vcbFrPath.split(u"/")[-1])
    subprocess.run(
        [mkclsCom, u"-p{0}".format(vcbEnPath), u"-V{0}".format(classesEnPath)])
    subprocess.run(
        [mkclsCom, u"-p{0}".format(vcbFrPath), u"-V{0}".format(classesFrPath)])
    # make the sentence coocurrence files
    coocurrencePath = u"{0}{1}.cooc".format(generalPath,
                                            sentPath.split(u"/")[-1])
    subprocess.run(
        [snt2coocCom, coocurrencePath, vcbEnPath, vcbFrPath, sentPath])
    # run mgiza and output the files
    outputMgiza = u"{0}mgiza_output/".format(generalPath)
    utilsOs.createEmptyFolder(outputMgiza)
    outputMgiza = u"{0}{1}_{2}".format(
        outputMgiza,
        vcbEnPath.split(u"/")[-1].split(u".")[0],
        vcbFrPath.split(u"/")[-1].split(u".")[0])
    subprocess.run(
        [
            mgizaCom, u"-s", vcbEnPath, u"-t", vcbFrPath, u"-c", sentPath,
            u"-CoocurrenceFile", coocurrencePath, u"-m1", "5", u"-m2", u"5",
            u"-m3", u"5", u"-m4", u"5", u"-m5", u"5", u"-o", outputMgiza
        ]
    )  #u"-m1", "5", u"-m2", u"5", u"-m3", u"5", u"-m4", u"5", u"-m5", u"5", u"-m6", u"5", u"-mh", u"5"
    pharaohFilePath, tokFilePath = joinIntoPharaohFormat(outputMgiza)
    return pharaohFilePath, tokFilePath
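A hedged usage sketch, assuming the mgiza-master checkout and the corpus files exist at these hypothetical paths and that the module's helpers (utilsOs, reformatFilesPreGiza, joinIntoPharaohFormat) are importable:
# hypothetical paths, for illustration only
pharaohPath, tokPath = applyMgiza(u"/path/to/mgiza-master/",
                                  u"/path/to/corpus/file.en",
                                  u"/path/to/corpus/file.fr",
                                  overwrite=True)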
def extractMisalignedSP(pathToSrcTrgtFiles,
                        extractionSize=100,
                        typeOfExtractors=[0, 1, 2]):
    """ given a path to the original source and target files, and the types of
    extractors to be used returns SP (sentence pairs) extracted as misaligned
    extractor types:
    - 0 : same number presence in src and trgt
    - 1 : 4 or less than 4 tokens
    - 2 : """
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0

    # get name of subset
    for subset in [
            u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY'
    ]:
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # number coincidence block
    output0Path = u'./003negativeNaiveExtractors/numberCoincidence/'
    utilsOs.createEmptyFolder(output0Path)
    # few tokens block
    output1Path = u'./003negativeNaiveExtractors/fewTokens/'
    utilsOs.createEmptyFolder(output1Path)
    # cognates block
    output2Path = u'./003negativeNaiveExtractors/cognates/'
    utilsOs.createEmptyFolder(output2Path)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        srcFilePath = u'{0}.en'.format(
            filePath) if u'en-fr' in filePath else u'{0}.fr'.format(filePath)
        trgtFilePath = u'{0}.fr'.format(
            filePath) if u'en-fr' in filePath else u'{0}.en'.format(filePath)
        # open line by line and apply extractors
        try:
            with open(srcFilePath) as srcFile:
                with open(trgtFilePath) as trgtFile:
                    srcLines = srcFile.readlines()
                    trgtLines = trgtFile.readlines()
                    for srcLnIndex, srcLn in enumerate(srcLines):
                        trgtLn = trgtLines[srcLnIndex]
                        # tokenize
                        srcLn = srcLn.lower().replace(u' pm', u'pm')
                        trgtLn = trgtLn.lower().replace(u' pm', u'pm')
                        addSeparators = [
                            u'.', u',', u':', u'/', u'-', u"''", u"'"
                        ]
                        srcTokens = utilsString.nltkTokenizer(
                            srcLn, addSeparators)
                        trgtTokens = utilsString.nltkTokenizer(
                            trgtLn, addSeparators)
                        # apply the extractors
                        if 0 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                nbMismatch, 0.75, srcTokens, trgtTokens,
                                extractedSp, filePath, 0, int(srcLnIndex))
                        if 1 in typeOfExtractors:
                            # get context scores and location in doc
                            cntxtScores = getContextScores(
                                srcLnIndex, srcLines, trgtLines)
                            docLoc = srcLnIndex / len(srcLines)
                            extractedSp, score = applyExtractor(
                                tableOfContents,
                                0.32,
                                srcTokens,
                                trgtTokens,
                                extractedSp,
                                filePath,
                                1,
                                int(srcLnIndex),
                                contextScores=cntxtScores,
                                placeInDocument=docLoc)
                        if 2 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                cognateCoincidence, 0.1, srcTokens, trgtTokens,
                                extractedSp, filePath, 2, int(srcLnIndex))
                    totalLines += len(srcLines)
        # some folders have no .en and .fr to each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        except FileNotFoundError:
            pass
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(
        extractedSp,
        pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
        overwrite=True)
    # randomly extract and dump the file path and the line index for the extracted SP
    randomlyExtractAndDump(extractedSp, 100, subsetName)
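As an illustration of the first extractor's idea ("same number presence in src and trgt"), a minimal self-contained sketch; nbMismatch above is the project's own implementation, this is only an assumed version of the underlying check:
import re

def sameNumbersPresent(srcLn, trgtLn):
    """ returns True if the source and target lines contain the same set of numbers,
    something expected of well-aligned sentence pairs """
    srcNumbers = set(re.findall(r"\d+", srcLn))
    trgtNumbers = set(re.findall(r"\d+", trgtLn))
    return srcNumbers == trgtNumbers

# e.g. sameNumbersPresent(u"article 12 , page 4", u"article 12 , page 40") -> False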
def applyHeuristicsOnNotFlaggedCorpus(filesIndexes,
                                      launchId,
                                      heuristicsList=None):
    """ given a corpus and heuristic indication, it applies the heuristic to that corpus and dumps the result """
    out = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/{0}/'.format(
        launchId)
    if heuristicsList is None:
        heuristicsList = [
            u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url',
            u'mono', u'tabl', u'strBcks', u'punct', u'gibb'
        ]
    starbucksExprDict, starbucksWordDict = utilsString.openEn2FrStarbucksDict()
    # make the folder
    utilsOs.createEmptyFolder(out)
    # reference file
    outputRefPath = u'{0}reference.tsv'.format(out)
    referenceAlreadyExists = utilsOs.theFileExists(outputRefPath)
    # get the list of ALL the file paths
    filePathList, subsetIndexes = getSubsetOfFiles(filesIndexes)
    # open the reference files
    with open(outputRefPath, u'a') as refFile:
        # for each tmx file
        for indexTmx, tmxFilePath in tqdm(enumerate(filePathList)):
            tmxFilePath = b000path.desAnonymizePath(tmxFilePath)
            fileNotFound = False
            # get the list of lines
            try:
                with open(u'{0}.en'.format(tmxFilePath)) as enFile:
                    enLines = enFile.readlines()
                with open(u'{0}.fr'.format(tmxFilePath)) as frFile:
                    frLines = frFile.readlines()
            except FileNotFoundError:
                print(u'FILE NOT FOUND IN : {0}'.format(tmxFilePath))
                fileNotFound = True
            if fileNotFound is False:
                # get each line
                for i in range(len(enLines)):
                    srcLn, trgtLn, enLn, frLn = getLines(
                        i, enLines, frLines, tmxFilePath)
                    # apply the heuristics
                    for heurName in heuristicsList:
                        heurFolder = u'{0}{1}/'.format(out, heurName)

                        # make the folder
                        utilsOs.createEmptyFolder(heurFolder)
                        # make the output files
                        outputScorePath = u'{0}score.tsv'.format(heurFolder)
                        # add the scores to the files
                        with open(outputScorePath, u'a') as scoreFile:
                            scoreFile.write(
                                getLnToWrite(
                                    heurName,
                                    srcLn,
                                    trgtLn,
                                    enLn,
                                    frLn,
                                    placeInDocument=float(i) /
                                    float(len(enLines)),
                                    starbucksExprDict=starbucksExprDict,
                                    starbucksWordDict=starbucksWordDict))
                    # if the reference output already exists, don't write on it
                    if not referenceAlreadyExists:
                        # write the ref line
                        refFile.write(u'{0}\t{1}\n'.format(
                            b000path.anonymizePath(tmxFilePath), i))
    return None
def applyHeuristicOnCorpus(corpus=None, heuristic=None, out=None):
    """ given a corpus and heuristic indication, it applies the heuristic to that corpus and dumps the result """
    if corpus is None:
        corpus = [u'ALIGNMENT-QUALITY', u'MISALIGNED', u'QUALITY']
    if heuristic is None:
        heuristic = [
            u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url',
            u'mono', u'tabl', u'strBcks', u'punct', u'gibb'
        ]
    if out is None:
        out = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/'
    # get the heuristic needed objects
    starbucksExprDict, starbucksWordDict = utilsString.openEn2FrStarbucksDict()
    fauxAmisEn = utilsString.openFauxAmisDict(enToFr=True,
                                              withDescription=False,
                                              reducedVersion=True)
    fauxAmisFr = utilsString.openFauxAmisDict(enToFr=False,
                                              withDescription=False,
                                              reducedVersion=True)
    stopWordsEnFrDict = utilsString.openEn2FrStopWordsDict()
    enLexicon = utilsString.getWiki1000MostCommonLexicon(u'en')
    frLexicon = utilsString.getWiki1000MostCommonLexicon(u'fr')
    # get the file paths and make sure we don't take into account the files we have already seen
    filePathList = getFilePathsLists(corpus)
    # start anew by erasing the previous files for the reference and scores
    for flag in corpus:
        # make the folder
        utilsOs.createEmptyFolder(u'{0}{1}/'.format(out, flag))
        outputRefPath = u'{0}{1}/reference.tsv'.format(out, flag)
        # erase content of previous reference file
        with open(outputRefPath, u'w') as refFile:
            refFile.write(u'')
        # erase content of previous score file
        for heurName in heuristic:
            # make the folder
            utilsOs.createEmptyFolder(u'{0}{1}/{2}/'.format(
                out, flag, heurName))
            # make the output files
            outputScorePath = u'{0}{1}/{2}/score.tsv'.format(
                out, flag, heurName)
            with open(outputScorePath, u'w') as scoreFile:
                scoreFile.write(u'')
    # for each file in the list
    for tmxFilePath in tqdm(filePathList):
        flag = getFlag(tmxFilePath)
        # get the list of lines
        with open(u'{0}.en'.format(tmxFilePath)) as enFile:
            enLines = enFile.readlines()
        with open(u'{0}.fr'.format(tmxFilePath)) as frFile:
            frLines = frFile.readlines()
        # get each line
        for i in range(len(enLines)):
            srcLn, trgtLn, enLn, frLn = getLines(i, enLines, frLines,
                                                 tmxFilePath)
            outputRefPath = u'{0}{1}/reference.tsv'.format(out, flag)
            # append to the file
            with open(outputRefPath, u'a') as refFile:
                # apply the heuristics
                for heurName in heuristic:
                    # make the output files
                    outputScorePath = u'{0}{1}/{2}/score.tsv'.format(
                        out, flag, heurName)
                    # append to the score files
                    with open(outputScorePath, u'a') as scoreFile:
                        scoreFile.write(
                            getLnToWrite(heurName,
                                         srcLn,
                                         trgtLn,
                                         enLn,
                                         frLn,
                                         placeInDocument=float(i) /
                                         float(len(enLines)),
                                         starbucksExprDict=starbucksExprDict,
                                         starbucksWordDict=starbucksWordDict,
                                         fauxAmisEn=fauxAmisEn,
                                         fauxAmisFr=fauxAmisFr,
                                         stopWordsEnFrDict=stopWordsEnFrDict,
                                         enLex=enLexicon,
                                         frLex=frLexicon))
                # dump to ref file
                refFile.write(u'{0}\t{1}\n'.format(
                    b000path.anonymizePath(tmxFilePath), i))
    return None
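A hedged usage sketch; the reduced corpus and heuristic lists are arbitrary examples, and the output goes to the hard-coded default folder above:
# apply only the number and length heuristics to the MISALIGNED subset
applyHeuristicOnCorpus(corpus=[u'MISALIGNED'], heuristic=[u'nb', u'len'])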