def randomlySelectNDocsFromPath(folderPath, n=100):
    """
    Pick n distinct '.tmx' file paths at random from under folderPath.

    When the folder holds n files or fewer, the complete listing obtained from
    utilsOs.goDeepGetFiles is returned unchanged and nothing is dumped.
    Otherwise the picked paths are dumped (as a list) to
    './randomSelected<n><domain>.json' and returned as a set.
    """
    allTmxPaths = utilsOs.goDeepGetFiles(folderPath, format=u'.tmx')
    nbOfPaths = len(allTmxPaths)
    # nothing to sample from when the folder has no more than n files
    if nbOfPaths <= n:
        return allTmxPaths
    usedIndexes = set()
    pickedPaths = set()
    for _ in range(n):
        # the helper is given the indexes already drawn
        # (presumably so it never returns the same one twice - confirm in its definition)
        pickedIndex = getRandomIntNeverseenBefore(nbOfPaths, usedIndexes)
        usedIndexes.add(pickedIndex)
        pickedPaths.add(allTmxPaths[pickedIndex])
    # the domain is the name of the last folder of the path
    lastSegment = folderPath.split(u'/')[-1]
    if folderPath[-1] == u'/':
        # trailing slash: the domain is the segment right before it
        domain = folderPath[:-1].split(u'/')[-1]
    elif u'.' in lastSegment:
        # the path ends with something that has a dot in it: drop it and take the segment before
        parentPath = folderPath.replace(u'/{0}'.format(lastSegment), u'')
        domain = parentPath.split(u'/')[-1]
    else:
        domain = lastSegment
    # dump the selection
    outputPath = './randomSelected{0}{1}.json'.format(n, domain)
    utilsOs.dumpDictToJsonFile(list(pickedPaths), pathOutputFile=outputPath, overwrite=True)
    return pickedPaths
def changeStructure():
    """
    Flatten the old per-file manual annotations into four line-aligned sample files.

    For every '.tmx' annotation file under './002manuallyAnnotated/oldOnes/MISALIGNED/',
    the matching '<corpus path>.en' and '<corpus path>.fr' files are read and, for each
    annotation line, one line is appended to each of:
    - sampleReference.tsv : '<corpus path>\t<line index>'
    - sampleAnnotation.tsv : the annotation line
    - sampleEn.tsv : the English line at the same index
    - sampleFr.tsv : the French line at the same index
    """
    oldAnnotationFolder = u'./002manuallyAnnotated/oldOnes/MISALIGNED/'
    corpusFolder = u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/'
    for annotationPath in utilsOs.goDeepGetFiles(oldAnnotationFolder, format=u'.tmx'):
        # rebuild the path of the original corpus file from the part after 'MISALIGNED/'
        relativePath = annotationPath.split(u'MISALIGNED/')[-1]
        origPath = corpusFolder + relativePath
        with open(annotationPath) as annotFile:
            annotations = annotFile.readlines()
        with open(origPath + u'.en') as enFile:
            enLines = enFile.readlines()
        with open(origPath + u'.fr') as frFile:
            frLines = frFile.readlines()
        for lineIndex, annotation in enumerate(annotations):
            # both sides are looked up before anything is written for this index
            enLine = enLines[lineIndex]
            frLine = frLines[lineIndex]
            dumps = (
                # the reference
                (u'{0}\t{1}\n'.format(origPath, lineIndex), u'./002manuallyAnnotated/sampleReference.tsv'),
                # the annotation
                (annotation, u'./002manuallyAnnotated/sampleAnnotation.tsv'),
                # the SP
                (enLine, u'./002manuallyAnnotated/sampleEn.tsv'),
                (frLine, u'./002manuallyAnnotated/sampleFr.tsv'),
            )
            for content, outputPath in dumps:
                utilsOs.appendLineToFile(content, outputPath, addNewLine=False)
def getPathWhereWeFind(stringToBeFound, verbose=True):
    """
    Look for a string in the source and target lines of every sentence-pair file
    found under the folder given by b000path.getBtFolderPath(flagFolder=u'a').

    The '.tmx' path is printed for every matching line pair when verbose equals True.
    At the end, the number of distinct matching source lines and the number of
    distinct paths containing a match are printed. Nothing is returned.
    """
    btFolder = b000path.getBtFolderPath(flagFolder=u'a')
    matchingSrcLines = set()
    matchingPaths = set()
    for tmxPath in utilsOs.goDeepGetFiles(btFolder, format=u'.tmx'):
        # the language direction is read from the path: english is the source in 'en-fr'
        if u'en-fr' in tmxPath:
            srcFilePath = u'{0}.en'.format(tmxPath)
            trgtFilePath = u'{0}.fr'.format(tmxPath)
        else:
            srcFilePath = u'{0}.fr'.format(tmxPath)
            trgtFilePath = u'{0}.en'.format(tmxPath)
        try:
            with open(srcFilePath) as srcFile:
                srcContent = srcFile.readlines()
            with open(trgtFilePath) as trgtFile:
                trgtContent = trgtFile.readlines()
            for lineIndex, srcLine in enumerate(srcContent):
                trgtLine = trgtContent[lineIndex]
                # skip the pairs where the string appears on neither side
                if stringToBeFound not in srcLine and stringToBeFound not in trgtLine:
                    continue
                if verbose == True:
                    print(tmxPath)
                matchingSrcLines.add(srcLine)
                matchingPaths.add(tmxPath)
        except FileNotFoundError:
            # tmx paths without a readable .en/.fr pair are ignored
            pass
    print(len(matchingSrcLines), len(matchingPaths))
def makeARefFile(rootFolder=u"/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/",
                 refFilePath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    """
    Create a reference file listing every '.tmx' path found under rootFolder.

    One line '<path>\t-1' is written per '.tmx.en' file found, with the
    '.tmx.en' ending turned back into '.tmx'.

    :param rootFolder: folder explored (through utilsOs.goDeepGetFiles) to find the '.tmx.en' files
    :param refFilePath: path of the reference file to create
    :return: None; nothing is done if refFilePath already exists
    """
    # make sure the file does not yet exist
    if utilsOs.theFileExists(refFilePath) is True:
        return None
    utilsOs.createEmptyFile(refFilePath)
    # explore the folder received as argument (its default value is the original corpus path)
    listOfFiles = utilsOs.goDeepGetFiles(rootFolder, format=u".tmx.en")
    with open(refFilePath, u"a") as refFile:
        refFile.writelines(
            u"{0}\t-1\n".format(filePath.replace(u".tmx.en", u".tmx"))
            for filePath in listOfFiles
        )
def extractMisalignedSP(pathToSrcTrgtFiles, extractionSize=100, typeOfExtractors=(0, 1, 2)):
    """
    Apply the naive misalignment extractors to every SP (sentence pair) found under a folder.

    For each '.tmx' path under pathToSrcTrgtFiles, the '<path>.en' and '<path>.fr' files are
    read (english is the source side when 'en-fr' is in the path, french otherwise), each
    line pair is lowercased, tokenized and given to the selected extractors.
    The accumulated dict is dumped to './003negativeNaiveExtractors/000extractedSp.json'
    and then handed to randomlyExtractAndDump.

    extractor types:
    - 0 : same number presence in src and trgt (nbMismatch, threshold 0.75)
    - 1 : 4 or less than 4 tokens (tableOfContents, threshold 0.32), also receives the
          context scores and the relative place of the line in the document
    - 2 : cognateCoincidence, threshold 0.1

    :param pathToSrcTrgtFiles: folder containing the original source and target files; it must
        contain one of '/ALIGNMENT-QUALITY', '/MISALIGNED', '/NOT-FLAGGED', '/QUALITY'
    :param extractionSize: size passed to randomlyExtractAndDump
    :param typeOfExtractors: collection of the extractor types (0, 1, 2) to apply
    :return: None, the results are only dumped to disk
    :raises ValueError: after the json dump, if no subset name is found in pathToSrcTrgtFiles
    :raises IndexError: if a target file has fewer lines than its source file
    """
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0
    # get name of subset (if several appear in the path, the last one of the list is kept)
    subsetName = None
    for subset in (u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY'):
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # create the output folder of each extractor type
    for outputFolder in (
            u'./003negativeNaiveExtractors/numberCoincidence/',
            u'./003negativeNaiveExtractors/fewTokens/',
            u'./003negativeNaiveExtractors/cognates/'):
        utilsOs.createEmptyFolder(outputFolder)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        if u'en-fr' in filePath:
            srcFilePath = u'{0}.en'.format(filePath)
            trgtFilePath = u'{0}.fr'.format(filePath)
        else:
            srcFilePath = u'{0}.fr'.format(filePath)
            trgtFilePath = u'{0}.en'.format(filePath)
        # some folders have no .en and .fr to each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        # only the opening of the files is guarded, so that an unrelated
        # FileNotFoundError raised while extracting is not silently ignored
        try:
            with open(srcFilePath) as srcFile, open(trgtFilePath) as trgtFile:
                srcLines = srcFile.readlines()
                trgtLines = trgtFile.readlines()
        except FileNotFoundError:
            continue
        nbSrcLines = len(srcLines)
        # open line by line and apply extractors
        for srcLnIndex, srcLn in enumerate(srcLines):
            trgtLn = trgtLines[srcLnIndex]
            # tokenize
            srcLn = srcLn.lower().replace(u' pm', u'pm')
            trgtLn = trgtLn.lower().replace(u' pm', u'pm')
            addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
            srcTokens = utilsString.nltkTokenizer(srcLn, addSeparators)
            trgtTokens = utilsString.nltkTokenizer(trgtLn, addSeparators)
            # apply the extractors (the score they return is not used here)
            if 0 in typeOfExtractors:
                extractedSp, _ = applyExtractor(
                    nbMismatch, 0.75, srcTokens, trgtTokens,
                    extractedSp, filePath, 0, srcLnIndex)
            if 1 in typeOfExtractors:
                # get context scores and location in doc
                cntxtScores = getContextScores(srcLnIndex, srcLines, trgtLines)
                docLoc = srcLnIndex / nbSrcLines
                extractedSp, _ = applyExtractor(
                    tableOfContents, 0.32, srcTokens, trgtTokens,
                    extractedSp, filePath, 1, srcLnIndex,
                    contextScores=cntxtScores, placeInDocument=docLoc)
            if 2 in typeOfExtractors:
                extractedSp, _ = applyExtractor(
                    cognateCoincidence, 0.1, srcTokens, trgtTokens,
                    extractedSp, filePath, 2, srcLnIndex)
        totalLines += nbSrcLines
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(
        extractedSp,
        pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
        overwrite=True)
    # without a subset name the random extraction cannot be named: fail with a clear message
    # (raised only now so the json dump above is still produced)
    if subsetName is None:
        raise ValueError(
            u'no subset name (/ALIGNMENT-QUALITY, /MISALIGNED, /NOT-FLAGGED, /QUALITY) '
            u'found in the path: {0}'.format(pathToSrcTrgtFiles))
    # randomly extract and dump the file path and the line index for the extracted SP
    randomlyExtractAndDump(extractedSp, extractionSize, subsetName)