def prepareOutPutFile(pathToFile, fileName=u"source.vcb"):
    """Create a GIZA output folder next to *pathToFile* and return the
    full path of *fileName* inside that folder.

    :param pathToFile: path to an existing file (or folder ending in "/")
    :param fileName: name of the output file to place in the GIZA folder
    :return: full path of the (not yet created) output file
    """
    parts = pathToFile.split(u"/")
    # drop the file name; when the path ends with "/", drop the empty tail too
    if parts[-1] == u"":
        parentDir = u"/".join(parts[:-2])
    else:
        parentDir = u"/".join(parts[:-1])
    gizaFolder = u"{0}/GIZA/".format(parentDir)
    utilsOs.createEmptyFolder(gizaFolder)
    return u"{0}{1}".format(gizaFolder, fileName)
def makeLocalFolderPaths(listOfFilePaths):
    """Given a list of file paths, create the equivalent folder tree under
    the local annotation directory."""
    remotePrefix = u'/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/'
    localPrefix = u'./002manuallyAnnotated/'
    for remotePath in listOfFilePaths:
        localPath = remotePath.replace(remotePrefix, localPrefix)
        # strip the file name so only the containing folder remains
        fileName = localPath.split(u'/')[-1]
        utilsOs.createEmptyFolder(localPath.replace(fileName, u''))
def launchTmop(inputFilePath, pharaohFilePath, tokFilePath, outputFolderPath, **kwargs):
    """Fill in the TMOP configuration template and dump it as config.json.

    :param inputFilePath: path to the bilingual input file
    :param pharaohFilePath: path to the pharaoh-format alignment file
    :param tokFilePath: path to the tokenized file
    :param outputFolderPath: folder where TMOP will write its results
    :param kwargs: extra "options" entries copied verbatim into the config
    :return: None (side effect: writes config.json in the TMOP folder)
    """
    utilsOs.createEmptyFolder(outputFolderPath)
    ##########
    # get and modify the config file
    configDict = getConfigTemplate()
    configDict["options"]["input file"] = inputFilePath
    configDict["options"]["align file"] = pharaohFilePath
    configDict["options"]["token file"] = tokFilePath
    configDict["options"]["output folder"] = outputFolderPath
    ##########
    # add a policy
    configDict["policies"].append(["FourNo", "on"])
    configDict["policies"].append(["ThreeNo", "on"])
    configDict["policies"].append(["TwoNo", "on"])
    ##########
    # turn off certain heuristics to launch only the interesting ones
    # NOTE(review): filters left "on" (assignments were commented out in the
    # original): 0 "SampleFilter", 2 "LengthRatio", 3 "ReverseLengthRatio",
    # 4 "WordRatio", 5 "ReverseWordRatio", 6 "WordLength", 7 "TagFinder",
    # 8 "RepeatedChars", 9 "RepeatedWords", 10 "Lang_Identifier",
    # 12 "BigramAlignedProportion"
    configDict["filters"][1][1] = "off"   # "LengthStats"
    configDict["filters"][11][1] = "off"  # "AlignedProportion"
    configDict["filters"][13][1] = "off"  # "NumberOfUnalignedSequences"
    configDict["filters"][14][1] = "off"  # "LongestAlignedSequence"
    configDict["filters"][15][1] = "off"  # "LongestUnalignedSequence"
    configDict["filters"][16][1] = "off"  # "AlignedSequenceLength"
    configDict["filters"][17][1] = "off"  # "UnalignedSequenceLength"
    configDict["filters"][18][1] = "off"  # "FirstUnalignedWord"
    configDict["filters"][19][1] = "off"  # "LastUnalignedWord"
    configDict["filters"][20][1] = "off"  # "WE_Average"
    configDict["filters"][21][1] = "off"  # "WE_Median"
    configDict["filters"][22][1] = "off"  # "WE_BestAlignScore"
    configDict["filters"][23][1] = "off"  # "WE_ScoreOtherAlignment"
    configDict["filters"][24][1] = "off"  # "WE_ScoreAlign_BestForRest"
    # BUG FIX: iterating a dict yields keys only, so "for k, v in kwargs"
    # raised a ValueError (or mangled 2-char keys); use .items() for pairs
    for k, v in kwargs.items():
        configDict["options"][k] = v
    # dump the config.json file
    tmopFolder = "/data/rali5/Tmp/alfonsda/workRali/004tradBureau/TMOP-master"
    utilsOs.dumpDictToJsonFile(configDict,
                               "{0}/config.json".format(tmopFolder),
                               overwrite=True)
# NOTE(review): fragment of a Selenium document-download routine — the
# enclosing function and loop headers are outside this view, so the
# indentation below is reconstructed at a single level; confirm against
# the full file. Names such as dwnldLine, listOfNames, indexLine, tempPath,
# clntName, contextDirection and dejavuDocs come from the enclosing scope.
time.sleep(1)
# first anchor of the download line is the download button
dwnldButton = dwnldLine.find_elements(By.TAG_NAME, u'a')[0]
time.sleep(0.8)
dwnldButton.click()
# rename the downloaded file
oldPathAndName = u'{0}{1}'.format(tempPath, listOfNames[indexLine][0])
newPathAndName = u'{0}{1}'.format(tempPath, listOfNames[indexLine][1])
try:
    os.rename(oldPathAndName, newPathAndName)
    # organize the file in the right folder
    outputFolderPath = u'{0}{1}/{2}/'.format(tempPath, clntName, contextDirection)
    utilsOs.createEmptyFolder(outputFolderPath)
    shutil.move(newPathAndName,
                u'{0}{1}'.format(outputFolderPath, listOfNames[indexLine][1]))
    # save it to the deja vu docs list
    dejavuDocs.add(listOfNames[indexLine][1])
# best-effort: if the browser did not finish the download, skip this file
except FileNotFoundError:
    pass
##################################################33
# get the download links
# englishDownloadLink = driver.find_element_by_id('context-document-en')
# frenchDownloadLink = driver.find_element_by_id('context-document-fr')
# # englishDownloadLink.click()
def applyMgiza(mgizaMasterEnvPath, pathToEnFile, pathToFrFile, overwrite=True):
    """ Use Mgiza++ on the bilingual files
    :param mgizaMasterEnvPath: path to the mgiza-master (or mgizapp) folder
    :param pathToEnFile: path to the english side of the corpus
    :param pathToFrFile: path to the french side of the corpus
    :param overwrite: if True, regenerate the pre-giza files even if present
    :return: (pharaohFilePath, tokFilePath) from joinIntoPharaohFormat
    """
    # make sure the mgiza environment folder is right
    mgizaSplit = mgizaMasterEnvPath[:-1] if mgizaMasterEnvPath[
        -1] == u"/" else mgizaMasterEnvPath
    mgizaMasterEnvPath = u"{0}/".format(
        mgizaMasterEnvPath
    ) if mgizaMasterEnvPath[-1] != u"/" else mgizaMasterEnvPath
    mgizaSplit = mgizaSplit.split(u"/")
    if mgizaSplit[-1] == u"mgizapp":
        pass
    elif mgizaSplit[-1] == u"mgiza-master":
        mgizaMasterEnvPath = u"{0}mgizapp/".format(mgizaMasterEnvPath)
    # make paths to specific Mgiza tool scripts to use in terminal
    # (mgizaMasterEnvPath is guaranteed to end with "/" at this point, so no
    # extra separator is needed — the original "/bin" variant produced "//bin")
    mgizaCom = u"{0}bin/mgiza".format(mgizaMasterEnvPath)
    mkclsCom = u"{0}bin/mkcls".format(mgizaMasterEnvPath)
    snt2coocCom = u"{0}bin/snt2cooc".format(mgizaMasterEnvPath)
    # make the vocabulary, sentence and frequency files
    vcbEnPath, vcbFrPath, sentPath, enFreqPath, frFreqPath, spFreqPath = reformatFilesPreGiza(
        pathToEnFile, pathToFrFile, overwrite)
    # generalPath = u"{0}MGIZA/".format(sentPath.replace(u"sentenceFile.giza", ""))
    generalPath = u"{0}".format(sentPath.replace(u"sentenceFile.giza", ""))
    utilsOs.createEmptyFolder(generalPath)
    # configure the nb of cpus
    subprocess.run([
        mgizaCom,
        u"{0}119-12-05.194630.alfonsda.gizacfg".format(mgizaMasterEnvPath),
        u"-ncpus", u"1"
    ])
    # make classes for hmm and ibm4 models
    classesEnPath = u"{0}{1}.classes".format(generalPath, vcbEnPath.split(u"/")[-1])
    classesFrPath = u"{0}{1}.classes".format(generalPath, vcbFrPath.split(u"/")[-1])
    subprocess.run(
        [mkclsCom, u"-p{0}".format(vcbEnPath), u"-V{0}".format(classesEnPath)])
    subprocess.run(
        [mkclsCom, u"-p{0}".format(vcbFrPath), u"-V{0}".format(classesFrPath)])
    # make the sentence coocurrence files
    # BUG FIX: the original format string u"{0}.cooc" silently dropped its
    # second argument, yielding a hidden file named ".cooc" in generalPath;
    # include the sentence-file name as intended
    coocurrencePath = u"{0}{1}.cooc".format(generalPath, sentPath.split(u"/")[-1])
    # BUG FIX: snt2cooc takes source AND target vocabularies (cf. Moses
    # train-model: snt2cooc out.cooc vcb1 vcb2 snt); the original passed the
    # english vocabulary twice
    subprocess.run(
        [snt2coocCom, coocurrencePath, vcbEnPath, vcbFrPath, sentPath])
    # run mgiza and output the files
    outputMgiza = u"{0}mgiza_output/".format(generalPath)
    utilsOs.createEmptyFolder(outputMgiza)
    outputMgiza = u"{0}{1}_{2}".format(
        outputMgiza,
        vcbEnPath.split(u"/")[-1].split(u".")[0],
        vcbFrPath.split(u"/")[-1].split(u".")[0])
    subprocess.run([
        mgizaCom, u"-s", vcbEnPath, u"-t", vcbFrPath, u"-c", sentPath,
        u"-CoocurrenceFile", coocurrencePath, u"-m1", "5", u"-m2", u"5",
        u"-m3", u"5", u"-m4", u"5", u"-m5", u"5", u"-o", outputMgiza
    ])
    # u"-m1", "5", u"-m2", u"5", u"-m3", u"5", u"-m4", u"5", u"-m5", u"5", u"-m6", u"5", u"-mh", u"5"
    pharaohFilePath, tokFilePath = joinIntoPharaohFormat(outputMgiza)
    return pharaohFilePath, tokFilePath
def extractMisalignedSP(pathToSrcTrgtFiles, extractionSize=100, typeOfExtractors=None):
    """ given a path to the original source and target files, and the types of
    extractors to be used, returns SP (sentence pairs) extracted as misaligned
    extractor types:
        - 0 : same number presence in src and trgt
        - 1 : 4 or less than 4 tokens
        - 2 : cognate coincidence

    :param pathToSrcTrgtFiles: root folder containing the .tmx/.en/.fr files
    :param extractionSize: how many SP to randomly extract and dump at the end
    :param typeOfExtractors: extractor ids to apply; defaults to [0, 1, 2]
    """
    # BUG FIX: the original default was the mutable list [0, 1, 2], which is
    # shared across calls; use None as sentinel and build the list here
    if typeOfExtractors is None:
        typeOfExtractors = [0, 1, 2]
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0
    # get name of subset
    # BUG FIX: subsetName was unbound (NameError at the final dump) when no
    # subset string matched the given path
    subsetName = None
    for subset in [
            u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY'
    ]:
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # make one output folder per extractor type (the original reused the name
    # output1Path for the first two folders; renamed for consistency)
    output0Path = u'./003negativeNaiveExtractors/numberCoincidence/'
    utilsOs.createEmptyFolder(output0Path)
    output1Path = u'./003negativeNaiveExtractors/fewTokens/'
    utilsOs.createEmptyFolder(output1Path)
    output2Path = u'./003negativeNaiveExtractors/cognates/'
    utilsOs.createEmptyFolder(output2Path)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        srcFilePath = u'{0}.en'.format(
            filePath) if u'en-fr' in filePath else u'{0}.fr'.format(filePath)
        trgtFilePath = u'{0}.fr'.format(
            filePath) if u'en-fr' in filePath else u'{0}.en'.format(filePath)
        # open line by line and apply extractors
        try:
            with open(srcFilePath) as srcFile:
                with open(trgtFilePath) as trgtFile:
                    srcLines = srcFile.readlines()
                    trgtLines = trgtFile.readlines()
                    for srcLnIndex, srcLn in enumerate(srcLines):
                        trgtLn = trgtLines[srcLnIndex]
                        # tokenize
                        srcLn = srcLn.lower().replace(u' pm', u'pm')
                        trgtLn = trgtLn.lower().replace(u' pm', u'pm')
                        addSeparators = [
                            u'.', u',', u':', u'/', u'-', u"''", u"'"
                        ]
                        srcTokens = utilsString.nltkTokenizer(
                            srcLn, addSeparators)
                        trgtTokens = utilsString.nltkTokenizer(
                            trgtLn, addSeparators)
                        # apply the extractors
                        if 0 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                nbMismatch, 0.75, srcTokens, trgtTokens,
                                extractedSp, filePath, 0, int(srcLnIndex))
                        if 1 in typeOfExtractors:
                            # get context scores and location in doc
                            cntxtScores = getContextScores(
                                srcLnIndex, srcLines, trgtLines)
                            docLoc = srcLnIndex / len(srcLines)
                            extractedSp, score = applyExtractor(
                                tableOfContents, 0.32, srcTokens, trgtTokens,
                                extractedSp, filePath, 1, int(srcLnIndex),
                                contextScores=cntxtScores,
                                placeInDocument=docLoc)
                        if 2 in typeOfExtractors:
                            extractedSp, score = applyExtractor(
                                cognateCoincidence, 0.1, srcTokens, trgtTokens,
                                extractedSp, filePath, 2, int(srcLnIndex))
                    totalLines += len(srcLines)
        # some folders have no .en and .fr to each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        except FileNotFoundError:
            pass
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(
        extractedSp,
        pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
        overwrite=True)
    # randomly extract and dump the file path and the line index for the extracted SP
    # GENERALIZATION: use the extractionSize parameter (the original ignored
    # it and hard-coded 100; the default keeps the old behavior)
    randomlyExtractAndDump(extractedSp, extractionSize, subsetName)
def applyHeuristicsOnNotFlaggedCorpus(filesIndexes, launchId, heuristicsList=None):
    """ given a corpus and heuristic indication, it applies the heuristic
    to that corpus and dumps the result

    :param filesIndexes: indexes selecting the subset of files to process
    :param launchId: identifier used to name the output folder of this run
    :param heuristicsList: heuristic codes to apply; None means all of them
    :return: None (side effects: appends to per-heuristic score.tsv files and
        to a reference.tsv mapping anonymized file path -> line index)
    """
    out = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/{0}/'.format(
        launchId)
    if heuristicsList is None:
        heuristicsList = [
            u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url',
            u'mono', u'tabl', u'strBcks', u'punct', u'gibb'
        ]
    starbucksExprDict, starbucksWordDict = utilsString.openEn2FrStarbucksDict()
    # make the folder
    utilsOs.createEmptyFolder(out)
    # reference file
    outputRefPath = u'{0}reference.tsv'.format(out)
    referenceAlreadyExists = utilsOs.theFileExists(outputRefPath)
    # get the list of ALL the file paths
    filePathList, subsetIndexes = getSubsetOfFiles(filesIndexes)
    # open the reference files
    with open(outputRefPath, u'a') as refFile:
        # for each tmx file
        for indexTmx, tmxFilePath in tqdm(enumerate(filePathList)):
            tmxFilePath = b000path.desAnonymizePath(tmxFilePath)
            fileNotFound = False
            # get the list of lines
            try:
                with open(u'{0}.en'.format(tmxFilePath)) as enFile:
                    enLines = enFile.readlines()
                with open(u'{0}.fr'.format(tmxFilePath)) as frFile:
                    frLines = frFile.readlines()
            # best-effort: some .tmx files have no .en/.fr companions
            except FileNotFoundError:
                print(u'FILE NOT FOUND IN : {0}'.format(tmxFilePath))
                fileNotFound = True
            if fileNotFound is False:
                # get each line
                for i in range(len(enLines)):
                    srcLn, trgtLn, enLn, frLn = getLines(
                        i, enLines, frLines, tmxFilePath)
                    # apply the heuristics
                    for heurName in heuristicsList:
                        heurFolder = u'{0}{1}/'.format(out, heurName)
                        # make the folder
                        utilsOs.createEmptyFolder(heurFolder)
                        # make the output files
                        outputScorePath = u'{0}score.tsv'.format(heurFolder)
                        # add the scores to the files
                        with open(outputScorePath, u'a') as scoreFile:
                            scoreFile.write(
                                getLnToWrite(
                                    heurName, srcLn, trgtLn, enLn, frLn,
                                    placeInDocument=float(i) /
                                    float(len(enLines)),
                                    starbucksExprDict=starbucksExprDict,
                                    starbucksWordDict=starbucksWordDict))
                    # if the reference output already exists, don't write on it
                    # NOTE(review): the source arrived with indentation
                    # collapsed — this block is placed after the heuristics
                    # loop (one ref line per sentence pair); confirm against
                    # the original file
                    if referenceAlreadyExists is True:
                        pass
                    else:
                        # write the ref line
                        refFile.write(u'{0}\t{1}\n'.format(
                            b000path.anonymizePath(tmxFilePath), i))
    return None
def applyHeuristicOnCorpus(corpus=None, heuristic=None, out=None):
    """ given a corpus and heuristic indication, it applies the heuristic
    to that corpus and dumps the result

    :param corpus: list of corpus flag names to process; None means the
        default three flagged subsets
    :param heuristic: heuristic codes to apply; None means all of them
    :param out: root output folder; None means the default rali5 path
    :return: None (side effects: rewrites per-flag reference.tsv and
        per-flag/per-heuristic score.tsv files)
    """
    if corpus is None:
        corpus = [u'ALIGNMENT-QUALITY', u'MISALIGNED', u'QUALITY']
    if heuristic is None:
        heuristic = [
            u'nb', u'cog', u'len', u'fa', u'ion', u'sw', u'spell', u'url',
            u'mono', u'tabl', u'strBcks', u'punct', u'gibb'
        ]
    if out is None:
        out = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/'
    # get the heuristic needed objects
    starbucksExprDict, starbucksWordDict = utilsString.openEn2FrStarbucksDict()
    fauxAmisEn = utilsString.openFauxAmisDict(enToFr=True,
                                              withDescription=False,
                                              reducedVersion=True)
    fauxAmisFr = utilsString.openFauxAmisDict(enToFr=False,
                                              withDescription=False,
                                              reducedVersion=True)
    stopWordsEnFrDict = utilsString.openEn2FrStopWordsDict()
    enLexicon = utilsString.getWiki1000MostCommonLexicon(u'en')
    frLexicon = utilsString.getWiki1000MostCommonLexicon(u'fr')
    # get the file paths and get sure we don't take into account the file we have already seen
    filePathList = getFilePathsLists(corpus)
    # start anew by erasing the previous files for the reference and scores
    for flag in corpus:
        # make the folder
        utilsOs.createEmptyFolder(u'{0}{1}/'.format(out, flag))
        outputRefPath = u'{0}{1}/reference.tsv'.format(out, flag)
        # erase content of previous reference file
        with open(outputRefPath, u'w') as refFile:
            refFile.write(u'')
        # erase content of previous score file
        for heurName in heuristic:
            # make the folder
            utilsOs.createEmptyFolder(u'{0}{1}/{2}/'.format(
                out, flag, heurName))
            # make the output files
            outputScorePath = u'{0}{1}/{2}/score.tsv'.format(
                out, flag, heurName)
            with open(outputScorePath, u'w') as scoreFile:
                scoreFile.write(u'')
    # for each file in the list
    for tmxFilePath in tqdm(filePathList):
        flag = getFlag(tmxFilePath)
        # get the list of lines
        with open(u'{0}.en'.format(tmxFilePath)) as enFile:
            enLines = enFile.readlines()
        with open(u'{0}.fr'.format(tmxFilePath)) as frFile:
            frLines = frFile.readlines()
        # get each line
        for i in range(len(enLines)):
            srcLn, trgtLn, enLn, frLn = getLines(i, enLines, frLines,
                                                 tmxFilePath)
            outputRefPath = u'{0}{1}/reference.tsv'.format(out, flag)
            # append to the file
            with open(outputRefPath, u'a') as refFile:
                # apply the heuristics
                for heurName in heuristic:
                    # make the output files
                    outputScorePath = u'{0}{1}/{2}/score.tsv'.format(
                        out, flag, heurName)
                    # append to the score files
                    with open(outputScorePath, u'a') as scoreFile:
                        scoreFile.write(
                            getLnToWrite(heurName, srcLn, trgtLn, enLn, frLn,
                                         placeInDocument=float(i) /
                                         float(len(enLines)),
                                         starbucksExprDict=starbucksExprDict,
                                         starbucksWordDict=starbucksWordDict,
                                         fauxAmisEn=fauxAmisEn,
                                         fauxAmisFr=fauxAmisFr,
                                         stopWordsEnFrDict=stopWordsEnFrDict,
                                         enLex=enLexicon,
                                         frLex=frLexicon))
                # dump to ref file
                # NOTE(review): the source arrived with indentation collapsed —
                # this write is placed after the heuristics loop (one ref line
                # per sentence pair); confirm against the original file
                refFile.write(u'{0}\t{1}\n'.format(
                    b000path.anonymizePath(tmxFilePath), i))
    return None