def randomlySelectNDocsFromPath(folderPath, n=100):
    """ pick n distinct .tmx file paths at random under a folder

    When the folder holds n files or fewer, the complete listing given by
    utilsOs.goDeepGetFiles is returned as-is and nothing is dumped.
    Otherwise the selection is returned as a set, after being dumped as a
    list to './randomSelected<n><domain>.json', where <domain> is a folder
    name taken from folderPath.
    """
    # every tmx file found under the folder
    tmxPaths = utilsOs.goDeepGetFiles(folderPath, format=u'.tmx')
    # not enough files to choose from : hand back the whole listing
    if n >= len(tmxPaths):
        return tmxPaths
    usedIndexes = set()
    pickedPaths = set()
    # draw n indexes, each one different from the ones already drawn
    for _ in range(n):
        position = getRandomIntNeverseenBefore(len(tmxPaths), usedIndexes)
        usedIndexes.add(position)
        pickedPaths.add(tmxPaths[position])
    # work out the domain name from the folder path
    if folderPath[-1] == u'/':
        # trailing slash : the domain is the segment right before it
        domain = folderPath[:-1].rsplit(u'/', 1)[-1]
    else:
        lastSegment = folderPath.rsplit(u'/', 1)[-1]
        if u'.' in lastSegment:
            # the last segment has a dot in it : remove it from the path
            # and use the segment that comes before
            trimmedPath = folderPath.replace(u'/{0}'.format(lastSegment), u'')
            domain = trimmedPath.rsplit(u'/', 1)[-1]
        else:
            domain = lastSegment
    # dump the selection
    outputPath = './randomSelected{0}{1}.json'.format(n, domain)
    utilsOs.dumpDictToJsonFile(list(pickedPaths),
                               pathOutputFile=outputPath,
                               overwrite=True)
    return pickedPaths
def changeStructure():
    """ split the old MISALIGNED annotation files into four line-aligned files

    For every annotated .tmx file, each annotation line is appended to
    sampleAnnotation.tsv, together with the matching reference
    (original path + line index) in sampleReference.tsv and the lines with
    the same index from the '.en' and '.fr' corpus files in sampleEn.tsv
    and sampleFr.tsv.
    """
    annotatedRoot = u'./002manuallyAnnotated/oldOnes/MISALIGNED/'
    corpusRoot = u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/'
    referenceOut = u'./002manuallyAnnotated/sampleReference.tsv'
    annotationOut = u'./002manuallyAnnotated/sampleAnnotation.tsv'
    englishOut = u'./002manuallyAnnotated/sampleEn.tsv'
    frenchOut = u'./002manuallyAnnotated/sampleFr.tsv'

    for annotationPath in utilsOs.goDeepGetFiles(annotatedRoot,
                                                 format=u'.tmx'):
        # rebuild the corpus path from what follows 'MISALIGNED/'
        relativePart = annotationPath.split(u'MISALIGNED/')[-1]
        origPath = corpusRoot + relativePart
        with open(annotationPath) as annotationFile:
            annotations = annotationFile.readlines()
        with open(origPath + u'.en') as englishFile:
            englishLines = englishFile.readlines()
        with open(origPath + u'.fr') as frenchFile:
            frenchLines = frenchFile.readlines()
        for lineNb, annotation in enumerate(annotations):
            englishLine = englishLines[lineNb]
            frenchLine = frenchLines[lineNb]
            # reference, annotation, then the sentence pair, in that order
            toAppend = (
                (u'{0}\t{1}\n'.format(origPath, lineNb), referenceOut),
                (annotation, annotationOut),
                (englishLine, englishOut),
                (frenchLine, frenchOut),
            )
            for content, outputPath in toAppend:
                utilsOs.appendLineToFile(content,
                                         outputPath,
                                         addNewLine=False)
def getPathWhereWeFind(stringToBeFound, verbose=True):
    """ look for a string in the source and target lines of the .tmx files
    listed under b000path.getBtFolderPath(flagFolder=u'a')

    Prints the .tmx path once per matching line when verbose is True, then
    prints the number of distinct matching source lines and the number of
    distinct matching paths. Nothing is returned.
    """
    def readAllLines(path):
        with open(path) as openedFile:
            return openedFile.readlines()

    matchingSrcLines = set()
    matchingPaths = set()
    tmxPaths = utilsOs.goDeepGetFiles(
        b000path.getBtFolderPath(flagFolder=u'a'), format=u'.tmx')
    for filePath in tmxPaths:
        # the language direction found in the path tells which side is source
        if u'en-fr' in filePath:
            srcFilePath = u'{0}.en'.format(filePath)
            trgtFilePath = u'{0}.fr'.format(filePath)
        else:
            srcFilePath = u'{0}.fr'.format(filePath)
            trgtFilePath = u'{0}.en'.format(filePath)
        try:
            srcLines = readAllLines(srcFilePath)
            trgtLines = readAllLines(trgtFilePath)
            for lineNb, srcLn in enumerate(srcLines):
                trgtLn = trgtLines[lineNb]
                # skip the pair when neither side contains the string
                if stringToBeFound not in srcLn and stringToBeFound not in trgtLn:
                    continue
                if verbose == True:
                    print(filePath)
                matchingSrcLines.add(srcLn)
                matchingPaths.add(filePath)
        except FileNotFoundError:
            # a .tmx file without its side files is ignored
            continue
    print(len(matchingSrcLines), len(matchingPaths))
# Example #4
# 0
def makeARefFile(rootFolder=u"/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/",
                 refFilePath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    """ write a reference file with one line per '.tmx.en' file found under
    rootFolder

    Each line is the file path with '.tmx.en' replaced by '.tmx', a tab and
    the value -1.

    :param rootFolder: folder given to utilsOs.goDeepGetFiles to look for the
        '.tmx.en' files
    :param refFilePath: path of the reference file to create
    :return: None; nothing is written when utilsOs.theFileExists reports
        that refFilePath is already there
    """
    # make sure the file does not exist yet
    if utilsOs.theFileExists(refFilePath) is True:
        return None
    utilsOs.createEmptyFile(refFilePath)
    # search the folder received as argument (its default is the corpus root)
    listOfFiles = utilsOs.goDeepGetFiles(rootFolder, format=u".tmx.en")
    with open(refFilePath, u"a") as refFile:
        for filePath in listOfFiles:
            refFile.write(u"{0}\t-1\n".format(filePath.replace(u".tmx.en", u".tmx")))
def extractMisalignedSP(pathToSrcTrgtFiles,
                        extractionSize=100,
                        typeOfExtractors=[0, 1, 2]):
    """ given a path to the original source and target files, and the types of
    extractors to be used, apply them to every SP (sentence pair), dump what
    they extracted and dump a random sample of it

    extractor types:
    - 0 : nbMismatch, called with the value 0.75
    - 1 : tableOfContents, called with the value 0.32, the context scores of
          the line and its relative position in the document
    - 2 : cognateCoincidence, called with the value 0.1
    (the numeric values are presumably score thresholds - to be confirmed
    against applyExtractor)

    :param pathToSrcTrgtFiles: folder searched for '.tmx' files; the '.en'
        and '.fr' files named after each of them are the ones read
    :param extractionSize: size handed to randomlyExtractAndDump
    :param typeOfExtractors: extractor types to apply; only tested with 'in'
        and never modified here
    :return: None. The extracted SP are dumped to
        './003negativeNaiveExtractors/000extractedSp.json'. When the path
        contains none of the known subset names the random extraction is
        skipped, since no subset name can be given to it.
    """
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0

    # get name of subset (if several match, the last one is kept);
    # stays None when the path contains none of them
    subsetName = None
    for subset in [
            u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY'
    ]:
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # one output folder per extractor type (0, 1 and 2, in that order)
    for outputPath in [
            u'./003negativeNaiveExtractors/numberCoincidence/',
            u'./003negativeNaiveExtractors/fewTokens/',
            u'./003negativeNaiveExtractors/cognates/'
    ]:
        utilsOs.createEmptyFolder(outputPath)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        # the language direction found in the path tells which side is source
        if u'en-fr' in filePath:
            srcFilePath = u'{0}.en'.format(filePath)
            trgtFilePath = u'{0}.fr'.format(filePath)
        else:
            srcFilePath = u'{0}.fr'.format(filePath)
            trgtFilePath = u'{0}.en'.format(filePath)
        # open line by line and apply extractors
        try:
            with open(srcFilePath) as srcFile, open(trgtFilePath) as trgtFile:
                srcLines = srcFile.readlines()
                trgtLines = trgtFile.readlines()
            for srcLnIndex, srcLn in enumerate(srcLines):
                trgtLn = trgtLines[srcLnIndex]
                # tokenize
                srcLn = srcLn.lower().replace(u' pm', u'pm')
                trgtLn = trgtLn.lower().replace(u' pm', u'pm')
                # NOTE(review): built again for every line as in the
                # original, in case the tokenizer modifies it - confirm
                # before hoisting
                addSeparators = [
                    u'.', u',', u':', u'/', u'-', u"''", u"'"
                ]
                srcTokens = utilsString.nltkTokenizer(srcLn, addSeparators)
                trgtTokens = utilsString.nltkTokenizer(trgtLn, addSeparators)
                # apply the extractors (the returned score is not used here)
                if 0 in typeOfExtractors:
                    extractedSp, _ = applyExtractor(
                        nbMismatch, 0.75, srcTokens, trgtTokens,
                        extractedSp, filePath, 0, srcLnIndex)
                if 1 in typeOfExtractors:
                    # get context scores and location in doc
                    cntxtScores = getContextScores(
                        srcLnIndex, srcLines, trgtLines)
                    docLoc = srcLnIndex / len(srcLines)
                    extractedSp, _ = applyExtractor(
                        tableOfContents,
                        0.32,
                        srcTokens,
                        trgtTokens,
                        extractedSp,
                        filePath,
                        1,
                        srcLnIndex,
                        contextScores=cntxtScores,
                        placeInDocument=docLoc)
                if 2 in typeOfExtractors:
                    extractedSp, _ = applyExtractor(
                        cognateCoincidence, 0.1, srcTokens, trgtTokens,
                        extractedSp, filePath, 2, srcLnIndex)
            totalLines += len(srcLines)
        # some folders have no .en and .fr to each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        except FileNotFoundError:
            pass
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(
        extractedSp,
        pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
        overwrite=True)
    # without a subset name there is nothing valid to give to the random
    # extraction, so stop after the json dump instead of failing on an
    # unbound variable
    if subsetName is None:
        print(u'NO KNOWN SUBSET IN PATH, RANDOM EXTRACTION SKIPPED : ',
              pathToSrcTrgtFiles)
        return None
    # randomly extract and dump the file path and the line index for the extracted SP
    randomlyExtractAndDump(extractedSp, extractionSize, subsetName)