def randomlySelectNDocsFromPath(folderPath, n=100):
    """ given a folder path, returns a set of n randomly selected file paths """
    dejaVus = set()
    randomSelected = set()
    # get all the tmx files in the folder
    wholeFolderContent = utilsOs.goDeepGetFiles(folderPath, format=u'.tmx')
    # if there are fewer files in the folder than n, return them all
    if len(wholeFolderContent) <= n:
        return wholeFolderContent
    # get n randomly selected files from the whole
    for e in range(n):
        index = getRandomIntNeverseenBefore(len(wholeFolderContent), dejaVus)
        # add to the dejavus set and to the randomly selected set
        dejaVus.add(index)
        randomSelected.add(wholeFolderContent[index])
    # get the domain
    if folderPath[-1] == u'/':
        domain = folderPath[:-1].split(u'/')[-1]
    elif u'.' in folderPath.split(u'/')[-1]:
        path = folderPath.replace(u'/{0}'.format(folderPath.split(u'/')[-1]), u'')
        domain = path.split(u'/')[-1]
    else:
        domain = folderPath.split(u'/')[-1]
    # dump the set
    utilsOs.dumpDictToJsonFile(list(randomSelected),
                               pathOutputFile='./randomSelected{0}{1}.json'.format(n, domain),
                               overwrite=True)
    return randomSelected
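# A minimal usage sketch of the random selection above; the corpus path is
# hypothetical and only stands in for a real folder of .tmx files.
def _exampleRandomSelection():
    selected = randomlySelectNDocsFromPath(u'/path/to/tmxCorpus/', n=50)
    print(len(selected), u'files selected')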
def makeSampleFileHavingNJobTitles(pathInput, pathOutput, n=1000000, addJobDescription=False):
    ''' takes the real linkedIn data and makes a sample containing as many
    profiles as necessary to achieve n functions (job titles) '''
    dictJobTitlesData = {}
    # sample of all candidates data
    outputJson = utilsOs.createEmptyFile(u'%ssample.json' % (pathOutput))
    # we read the original json file line by line
    with codecs.open(pathInput, u'r', encoding=u'utf8') as jsonFile:
        while len(dictJobTitlesData) < n:
            jsonData = jsonFile.readline()
            # stop at the end of the file so we do not loop forever
            if not jsonData:
                break
            # we dump each line into the sample file
            outputJson.write(jsonData.replace(u'\r', ''))
            # we make a dict out of the string line
            jsonLine = utilsOs.convertJsonLineToDict(jsonData)
            if jsonLine != None:
                # we dump each job title into the job titles dict
                dictJobTitlesData = getJobData(jsonLine, dictJobTitlesData)
    # dumping dict content in json
    utilsOs.dumpDictToJsonFile(dictJobTitlesData, pathOutputFile=u'%sjobTitlesDataDict.json' % (pathOutput))
    # SIMPLIFIED DATA: dumping job title (and optional description) to a file
    dumpJobTitleAndDescription(dictJobTitlesData, u'%sjob+pitch.tsv' % (pathOutput), addJobDescription)
    # closing the files
    outputJson.close()
    return None
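# A minimal usage sketch of the sampling function above; the input json of
# linkedIn profiles and the output folder are hypothetical placeholders.
def _exampleMakeSample():
    makeSampleFileHavingNJobTitles(u'./linkedInProfiles.json', u'./sample/',
                                   n=1000, addJobDescription=True)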
def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    '''
    ### TO ANALYSE: candidate for removal, to be replaced with makeTokenCountDictFromText()
    ##############################################################################
    takes a corpus file, makes a dict of tokens with their count
    and dumps the result in a json file
    VERY SIMILAR TO makeTokenCountDictFromText() BUT MORE HANDS-ON AND SELF-BUILT
    NOTE: shadowed by the later definition of tokenDictMakerFromFile below
    '''
    import utilsOs
    tokenDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    for string in stringList:
        tokenList = naiveRegexTokenizer(string.replace(u'/', u' '))
        for token in tokenList:
            tokenDict[token] = tokenDict.get(token, 0.0) + (1.0 / len(stringList))
            # we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                tokenDict[token.lower()] = tokenDict.get(token.lower(), 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(inputFilePath.split(u'/')[-1], 'tokens.json'))
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
def dumpSetToJson(aSet, pathOutput):
    ''' dumps the given set as a json file mapping each element to None
    (flattens the content regardless of any hierarchy) '''
    jsonDict = {}
    for elem in aSet:
        jsonDict[elem] = None
    utilsOs.dumpDictToJsonFile(jsonDict, pathOutput)
    return
def applyOnNotFlaggedForNHours(n=1):
    schedule = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/heurSchedule.json'
    scheduleDict = utilsOs.openJsonFileAsDict(schedule)
    # apply for n hours
    for nId in list(scheduleDict.keys())[:n]:
        indexesToApply = scheduleDict[nId]
        applyHeuristicsOnNotFlaggedCorpus(indexesToApply, nId)
        # remove from the dict once we dump the scores
        del scheduleDict[nId]
    # save the remaining schedule dict
    utilsOs.dumpDictToJsonFile(scheduleDict, pathOutputFile=schedule, overwrite=True)
def makeHourlyIndexDict():
    folderPath = u'/data/rali5/Tmp/alfonsda/workRali/004tradBureau/006appliedHeuristics/NOT-FLAGGED/'
    with open(u'{0}files.paths'.format(folderPath)) as pathsFile:
        nbPaths = len(pathsFile.readlines())
    chunkId = 0
    lastIdx = 0
    aDict = {}
    for idx in range(0, nbPaths, 600):  # range(364949, nbPaths, 600)
        aDict[chunkId] = list(range(lastIdx, idx))
        lastIdx = idx
        chunkId += 1
    # last chunk: from the last chunk boundary up to the total number of paths
    aDict[chunkId] = list(range(lastIdx, nbPaths))
    schedule = u'{0}heurSchedule.json'.format(folderPath)
    utilsOs.dumpDictToJsonFile(aDict, pathOutputFile=schedule, overwrite=True)
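# A self-contained sketch of the 600-index chunking scheme used by
# makeHourlyIndexDict, on a small total so the output is easy to inspect;
# note that chunk 0 comes out empty because the loop starts at idx 0.
def _exampleHourlyChunking(total=1500):
    chunks = {}
    lastIdx, chunkId = 0, 0
    for idx in range(0, total, 600):
        chunks[chunkId] = list(range(lastIdx, idx))
        lastIdx = idx
        chunkId += 1
    chunks[chunkId] = list(range(lastIdx, total))
    # chunk sizes: {0: 0, 1: 600, 2: 600, 3: 300}
    print({k: len(v) for k, v in chunks.items()})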
def launchTmop(inputFilePath, pharaohFilePath, tokFilePath, outputFolderPath, **kwargs):
    utilsOs.createEmptyFolder(outputFolderPath)
    ##########
    # get and modify the config file
    configDict = getConfigTemplate()
    configDict["options"]["input file"] = inputFilePath
    configDict["options"]["align file"] = pharaohFilePath
    configDict["options"]["token file"] = tokFilePath
    configDict["options"]["output folder"] = outputFolderPath
    ##########
    # add a policy
    configDict["policies"].append(["FourNo", "on"])
    configDict["policies"].append(["ThreeNo", "on"])
    configDict["policies"].append(["TwoNo", "on"])
    ##########
    # turn off certain heuristics to launch only the interesting ones
    # # configDict["filters"][0][1] = "off"  # "SampleFilter"
    # configDict["filters"][1][1] = "off"  # "LengthStats"
    # # configDict["filters"][2][1] = "off"  # "LengthRatio"
    # # configDict["filters"][3][1] = "off"  # "ReverseLengthRatio"
    # # configDict["filters"][4][1] = "off"  # "WordRatio"
    # # configDict["filters"][5][1] = "off"  # "ReverseWordRatio"
    # # configDict["filters"][6][1] = "off"  # "WordLength"
    # # configDict["filters"][7][1] = "off"  # "TagFinder"
    # # configDict["filters"][8][1] = "off"  # "RepeatedChars"
    # # configDict["filters"][9][1] = "off"  # "RepeatedWords"
    # # configDict["filters"][10][1] = "off"  # "Lang_Identifier"
    # configDict["filters"][11][1] = "off"  # "AlignedProportion"
    # # configDict["filters"][12][1] = "off"  # "BigramAlignedProportion"
    # configDict["filters"][13][1] = "off"  # "NumberOfUnalignedSequences"
    # configDict["filters"][14][1] = "off"  # "LongestAlignedSequence"
    # configDict["filters"][15][1] = "off"  # "LongestUnalignedSequence"
    # configDict["filters"][16][1] = "off"  # "AlignedSequenceLength"
    # configDict["filters"][17][1] = "off"  # "UnalignedSequenceLength"
    # configDict["filters"][18][1] = "off"  # "FirstUnalignedWord"
    # configDict["filters"][19][1] = "off"  # "LastUnalignedWord"
    # configDict["filters"][20][1] = "off"  # "WE_Average"
    # configDict["filters"][21][1] = "off"  # "WE_Median"
    # configDict["filters"][22][1] = "off"  # "WE_BestAlignScore"
    # configDict["filters"][23][1] = "off"  # "WE_ScoreOtherAlignment"
    # configDict["filters"][24][1] = "off"  # "WE_ScoreAlign_BestForRest"
    for k, v in kwargs.items():
        configDict["options"][k] = v
    # dump the config.json file
    tmopFolder = "/data/rali5/Tmp/alfonsda/workRali/004tradBureau/TMOP-master"
    utilsOs.dumpDictToJsonFile(configDict, "{0}/config.json".format(tmopFolder), overwrite=True)
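# A minimal usage sketch of launchTmop; the paths are hypothetical and any
# extra keyword arguments are copied verbatim into the "options" section of
# the TMOP config.
def _exampleLaunchTmop():
    launchTmop(u'/path/to/corpus.en-fr', u'/path/to/corpus.pharaoh',
               u'/path/to/corpus.tok', u'/path/to/tmopOutput/')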
def makeDictFromTsvTrain(pathToTrainTsv, trainingDataColumnName, goldStandardColumnName,
                         trainedDict=None, outputDictFilePath=None, preOrorazeOrig=False,
                         alignMostSimilar=False):
    ''' given a path to a "train" file, applies a heuristic dict maker:
    it opens the file, searches for all possible general-language spell
    corrections and keeps the ones reappearing somewhere else in the corpus '''
    trainedDict = trainedDict if trainedDict != None else {}
    # open the train dataframe from the path
    trainDf = utilsOs.getDataFrameFromArgs(pathToTrainTsv)
    # get the specific data we want to use as train to populate our dict (original and gold standard)
    trainDataDf = trainDf[[trainingDataColumnName, goldStandardColumnName]]
    # get the gold standard data to which we compare the training data
    for indexRow, row in trainDataDf.iterrows():
        # get the elements not matching exactly
        if preOrorazeOrig == False:
            nonMatchingAlignment = getNonExactMatch((row[0]), row[1])
        # we pre-ororaze if asked, to limit the difference in segmentation and orora-syntax-oriented problems
        else:
            nonMatchingAlignment = getNonExactMatch(advancedOroraze(row[0]), row[1])
        # if the list is not empty
        if nonMatchingAlignment:
            for nonMatchTupl in nonMatchingAlignment:
                # use the original token as a key
                trainedDict[nonMatchTupl[0]] = trainedDict.get(nonMatchTupl[0], list()) + [nonMatchTupl[1]]
    # clean the dict
    for origKey, goldValList in dict(trainedDict).items():
        # eliminate all the elements in the dict that have multiple possible outputs or whose value is an empty symbol
        if len(goldValList) != 1 or goldValList[0] == u'∅':
            del trainedDict[origKey]
        else:
            trainedDict[origKey] = goldValList[0]
    # dump the dict
    if outputDictFilePath != None:
        utilsOs.dumpDictToJsonFile(trainedDict, outputDictFilePath, overwrite=True)
    return trainedDict
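# A minimal usage sketch of the train-dict maker above; the tsv path and the
# column names are hypothetical placeholders for a real train file.
def _exampleTrainDict():
    trainedDict = makeDictFromTsvTrain(u'./train.tsv', u'orig', u'gold',
                                       outputDictFilePath=u'./trainedDict.json')
    print(len(trainedDict), u'unambiguous corrections kept')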
def quadrigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    ''' takes a corpus file, makes a dict of character 4grams with their count
    (normalized by the number of lines) and dumps the result in a json file '''
    quadrigramDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    for i in range(len(langString) - 3):
        quadrigramDict[langString[i:i + 4]] = quadrigramDict.get(langString[i:i + 4], 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(inputFilePath.split(u'/')[-1], 'quadrigrams.json'))
    utilsOs.dumpDictToJsonFile(quadrigramDict, outputFilePath)
    return quadrigramDict
def trigramDictMakerFromFile(inputFilePath, outputFilePath=None):
    ''' takes a corpus file, makes a dict of character 3grams with their count
    (normalized by the number of lines) and dumps the result in a json file '''
    import utilsOs
    trigramDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    langString = u' '.join(stringList)
    for i in range(len(langString) - 2):
        trigramDict[langString[i:i + 3]] = trigramDict.get(langString[i:i + 3], 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(inputFilePath.split(u'/')[-1], 'trigrams.json'))
    utilsOs.dumpDictToJsonFile(trigramDict, outputFilePath)
    return trigramDict
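# A self-contained sketch of the sliding-window character n-gram counting used
# by the trigram and quadrigram makers above, on an in-memory string instead
# of a corpus file and with raw counts instead of line-normalized ones.
def _exampleCharNgramCount(text=u'the cat sat', n=3):
    ngramDict = {}
    for i in range(len(text) - (n - 1)):
        ngramDict[text[i:i + n]] = ngramDict.get(text[i:i + n], 0) + 1
    # e.g. for n=3: {u'the': 1, u'he ': 1, u'e c': 1, ...}
    print(ngramDict)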
def tokenDictMakerFromFile(inputFilePath, outputFilePath=None):
    ''' takes a corpus file, makes a dict of tokens with their count
    (normalized by the number of lines) and dumps the result in a json file '''
    tokenDict = {}
    stringList = utilsOs.readAllLinesFromFile(inputFilePath, True)
    for string in stringList:
        tokenList = naiveRegexTokenizer(string.replace(u'/', u' '))
        for token in tokenList:
            tokenDict[token] = tokenDict.get(token, 0.0) + (1.0 / len(stringList))
            # we also add the lowercase version if there is an uppercase in the token
            if any(c.isupper() for c in token):
                tokenDict[token.lower()] = tokenDict.get(token.lower(), 0.0) + (1.0 / len(stringList))
    if outputFilePath == None:
        outputFilePath = utilsOs.safeFilePath(
            inputFilePath.replace(inputFilePath.split(u'/')[-1], 'tokens.json'))
    utilsOs.dumpDictToJsonFile(tokenDict, outputFilePath)
    return tokenDict
def modifyConfigAndIndexFiles(pathToTheExportationEnvironment):
    ''' given the path to the sigma.js exportation environment (ending in the
    folder "network/"), changes the config.json file and the index.html file
    so they show the graph the way intended '''
    # copying config.json file
    configContent = {
        "type": "network",
        "version": "1.0",
        "data": "data.json",
        "logo": {"file": "", "link": "", "text": ""},
        "text": {"more": "", "intro": "", "title": ""},
        "legend": {"edgeLabel": "", "colorLabel": "", "nodeLabel": ""},
        "features": {"search": True, "groupSelectorAttribute": True, "hoverBehavior": "default"},
        "informationPanel": {"groupByEdgeDirection": True, "imageAttribute": False},
        "sigma": {
            "drawingProperties": {
                "defaultEdgeType": "curve",
                "defaultHoverLabelBGColor": "#002147",
                "defaultLabelBGColor": "#ddd",
                "activeFontStyle": "bold",
                "defaultLabelColor": "#000",
                "labelThreshold": 999,
                "defaultLabelHoverColor": "#fff",
                "fontStyle": "bold",
                "hoverFontStyle": "bold",
                "defaultLabelSize": 14},
            "graphProperties": {
                "maxEdgeSize": 2,
                "minEdgeSize": 2,
                "minNodeSize": 0.25,
                "maxNodeSize": 2.5},
            "mouseProperties": {"maxRatio": 20, "minRatio": 0.75}}}
    pathConfigJson = u'{0}config.json'.format(pathToTheExportationEnvironment)
    if utilsOs.theFileExists(pathConfigJson) == True:
        os.remove(pathConfigJson)
    utilsOs.dumpDictToJsonFile(configContent, pathConfigJson)
    # getting the color information from the data file
    colorCommunityDict = {}
    dataDict = utilsOs.openJsonFileAsDict(u'{0}data.json'.format(pathToTheExportationEnvironment))
    for nodeDict in dataDict[u'nodes']:
        try:
            if nodeDict[u'attributes'][u'community_lvl_0'] not in colorCommunityDict:
                colorCommunityDict[nodeDict[u'attributes'][u'community_lvl_0']] = \
                    u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(
                        nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name_lvl_0'])
            '''
            #####################################################
            #before I changed the names of the columns
            if nodeDict[u'attributes'][u'community'] not in colorCommunityDict:
                colorCommunityDict[nodeDict[u'attributes'][u'community']] = u'\t\t\t<div style="color: {0};">● {1}</div>\n'.format(nodeDict[u'color'], nodeDict[u'attributes'][u'infered_community_name'])
            '''
        except KeyError:
            pass
    # modifying the index.html file
    with open(u'{0}index.html'.format(pathToTheExportationEnvironment)) as indexFile:
        fileLines = indexFile.readlines()
    for index, line in enumerate(fileLines):
        if line == u'\t\t<dt class="colours"></dt>\n':
            indexDivisor = index + 1
            break
    fileLines = fileLines[:indexDivisor] + [u'\t\t<dd>\n'] + list(colorCommunityDict.values()) + \
        [u'\t\t</dd>\n'] + fileLines[indexDivisor + 1:]
    utilsOs.dumpRawLines(fileLines, u'{0}index.html'.format(pathToTheExportationEnvironment),
                         addNewline=False, rewrite=True)
def analyseNodeListStrDistance(nodeListPath, outputPath=None):
    ''' analyses the nodes in the node list and returns the stats concerning
    the similarities between node string labels '''
    import multiprocessing as mp
    pool = mp.Pool(processes=4)
    nodeSimilarsDict = {1: {}, 2: {}, 3: {}}
    nodeSetJobTitles = set()
    nodeSetSkills = set()
    # put the node labels in a set
    with open(nodeListPath) as nodeFile:
        nodeData = nodeFile.readline()
        while nodeData:
            # get the data for each row
            nodeDataList = nodeData.split(u'\t')
            # we make sure we are not in the header
            if nodeDataList[0] != u'Id':
                # save the node id/label in a set
                if u'__s' in nodeDataList[0]:
                    nodeSetJobTitles.add(nodeDataList[1])
                elif u'__t' in nodeDataList[0]:
                    nodeSetSkills.add(nodeDataList[1])
            # get next line
            nodeData = nodeFile.readline()
    # get the number and list of N-similar nodes for each job title node
    jobtitleResults = [
        pool.apply_async(getElemSimilarByEditDistanceOfN,
                         args=(original, nodeSetJobTitles, nodeSimilarsDict, True,
                               u'{0}__s'.format(original)))
        for original in nodeSetJobTitles
    ]
    # get the number and list of N-similar nodes for each skill node
    skillResults = [
        pool.apply_async(getElemSimilarByEditDistanceOfN,
                         args=(original, nodeSetSkills, nodeSimilarsDict, True,
                               u'{0}__t'.format(original)))
        for original in nodeSetSkills
    ]

    # merge all the obtained dicts together
    def merge_two_dicts(x, y):
        w = x.copy()
        for nb in range(1, 4):
            z = w[nb].copy()  # start with x's keys and values
            z.update(y[nb])  # modifies z with y's keys and values
            w[nb] = z
        return w

    # prepare the objects containing the results
    dictResults = {1: {}, 2: {}, 3: {}}
    for dictToBeAdded in tqdm(jobtitleResults + skillResults):
        dictResults = merge_two_dicts(dictResults, dictToBeAdded.get())
    # dump into a json file
    if outputPath != None:
        utilsOs.dumpDictToJsonFile(dictResults, outputPath, overwrite=True)
    # get the summary of the results
    countResultStrDistanceDict(dictResults)
    return dictResults
def getCanonLensInfo():
    canonDict = {}
    canonUrl = u'https://global.canon/en/c-museum/series_search.html?t=lens&s=ef'
    # open the selenium driver
    driver = webdriver.Firefox()
    driver.get(canonUrl)
    # get the lens keys
    for nb in range(2, 300):
        try:
            lensName = driver.find_element_by_xpath(
                u'/html/body/div[2]/div/div/main/div[4]/div/div[{0}]'.format(nb))
            lensUrl = (lensName.find_element_by_tag_name('a')).get_attribute(u'href')
            time.sleep(random.uniform(0.2, 0.5))
        except NoSuchElementException:
            try:
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight*2);')
                time.sleep(random.uniform(0.5, 0.8))
                lensName = driver.find_element_by_xpath(
                    u'/html/body/div[2]/div/div/main/div[4]/div/div[{0}]'.format(nb))
                lensUrl = (lensName.find_element_by_tag_name('a')).get_attribute(u'href')
            except NoSuchElementException:
                break
        canonDict[str(lensName.text).replace(u'\n', u' ')] = {u'url': lensUrl}
        # print(lensName.text.replace(u'\n', u' '))
    # open each lens url to get the specs
    for lName, lensDict in canonDict.items():
        print(lName)
        driver.get(lensDict[u'url'])
        time.sleep(random.uniform(0.8, 1.1))
        lensList = lName.split(u' ')
        # get the mount type and focal length
        lensMount = u'unk'
        focalLength = lensList[0] if lensList[0] != u'Extender' else lensList[1]
        for mount in [u'EF-S', u'EF-M', u'EF-R', u'FD', u'EF', u'TS-E', u'MP-E']:
            if mount in lName:
                lensMount = mount
                focalLength = focalLength.replace(mount, u'')
                break
        canonDict[lName][u'mount type'] = lensMount
        canonDict[lName][u'focal length'] = focalLength
        # aperture speed (f number)
        canonDict[lName][u'aperture speed'] = lensList[1] if u'f/' in lensList[1] else u'NaN'
        # AF
        af = u'unk'
        for afType in [u'USM', u'STM']:
            if afType in lName:
                af = afType
                break
        canonDict[lName][u'AF type'] = af
        # mark
        mark = u'I'
        for markNb in [u'II', u'III', u'IV', u'V', u'VI']:
            if markNb in lName:
                mark = markNb
                break
        canonDict[lName][u'mark'] = mark
        # IS
        canonDict[lName][u'image stabilization'] = u'IS' if u'IS' in lName else u'NaN'
        # lens type
        lensType = u'NaN'
        for lType in [u'Macro', u'Fisheye', u'Extender']:
            if lType in lName:
                lensType = lType
                break
        canonDict[lName][u'lens type'] = lensType
        # luxe series
        canonDict[lName][u'L series'] = u'L' if u'L' in lensList[1] else u'NaN'
        # get the rest of the specs: td[2] holds the value, td[1] the row label
        valueXpath = u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[2]'
        labelXpath = u'/html/body/div[2]/div/div/main/div/div[2]/div[3]/table/tbody/tr[{0}]/td[1]'
        row = 1
        canonDict[lName][u'release date'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        row += 1
        canonDict[lName][u'original price'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        row += 1
        canonDict[lName][u'nb of construction groups'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        row += 1
        canonDict[lName][u'nb of construction elements'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        if u'Diaphragm' in driver.find_element_by_xpath(labelXpath.format(row + 1)).text:
            row += 1
            canonDict[lName][u'nb of diaphragm blades'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        if u'Minimum' in driver.find_element_by_xpath(labelXpath.format(row + 1)).text:
            row += 1
            canonDict[lName][u'min aperture'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        if u'Closest' in driver.find_element_by_xpath(labelXpath.format(row + 1)).text:
            row += 1
            canonDict[lName][u'closest focus distance (m)'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        row += 1
        canonDict[lName][u'max magnification (x)'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        if u'Filter' in driver.find_element_by_xpath(labelXpath.format(row + 1)).text:
            row += 1
            canonDict[lName][u'filter diam (mm)'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        row += 1
        canonDict[lName][u'max diam X length (mm)'] = driver.find_element_by_xpath(valueXpath.format(row)).text
        row += 1
        canonDict[lName][u'weight (g)'] = driver.find_element_by_xpath(valueXpath.format(row)).text
    # dump
    utilsOs.dumpDictToJsonFile(canonDict, pathOutputFile='./data/canonLensInfo.json', overwrite=True)
    driver.close()
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """ makes 2 vocabulary files (occurrence dicts) in the format needed by giza++ or mgiza++,
    then reformats the corpus into the sentence-pair format needed by giza++ or mgiza++
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :return: the paths to the english and french vocabulary files, the giza-format
        corpus file, and the english, french and sentence-pair frequency dict files
    """
    # prepare the output paths
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")
    # if there is no file there yet, open the corpus files and count the frequency of each token
    if overwrite is True or os.path.isfile(outputEnDictPath) is False:
        # make the frequency dicts
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        # open the corpus files and count the frequency of the sentence pairs
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        # sort the dicts by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        # make the id dicts
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # dump dicts
        utilsOs.dumpDictToJsonFile(enTokFreqDict, outputEnDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frTokFreqDict, outputFrDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(spFreqDict, outputSpDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(enIdDict, outputEnIdDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frIdDict, outputFrIdDictPath, overwrite)
    # if the file already exists or if overwrite is false
    else:
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        # sort the dicts by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # dump the empty tok voc files
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)
    # dump the dicts in the tok voc files
    for indKv, kv in enumerate(orderedKeysValuesEn):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputEnPath, addNewLine=True)
    for indKv, kv in enumerate(orderedKeysValuesFr):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputFrPath, addNewLine=True)
    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile, outputPathGizaFormatCorpus,
                             enIdDict, frIdDict, spFreqDict)
    return (outputEnPath, outputFrPath, outputPathGizaFormatCorpus,
            outputEnDictPath, outputFrDictPath, outputSpDictPath)
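# A small sketch of the vocabulary line format dumped above: one token per
# line as "<id>\t<token>\t<frequency>", with ids starting at 2 because id 1
# is reserved for the UNK header line; the token counts here are made up.
def _exampleGizaVocabLines():
    orderedKeysValues = [(u'the', 1203), (u'of', 987)]
    print(u"1\tUNK\t0")
    for indKv, kv in enumerate(orderedKeysValues):
        print(u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1]))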
def extractMisalignedSP(pathToSrcTrgtFiles, extractionSize=100, typeOfExtractors=[0, 1, 2]):
    """ given a path to the original source and target files, and the types of
    extractors to be used, returns SP (sentence pairs) extracted as misaligned
    extractor types:
    - 0 : same number presence in src and trgt
    - 1 : 4 or fewer tokens
    - 2 : cognate coincidence
    """
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0
    # get name of subset
    subsetName = u''
    for subset in [u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY']:
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # extractor 0 output folder
    output0Path = u'./003negativeNaiveExtractors/numberCoincidence/'
    utilsOs.createEmptyFolder(output0Path)
    # extractor 1 output folder
    output1Path = u'./003negativeNaiveExtractors/fewTokens/'
    utilsOs.createEmptyFolder(output1Path)
    # extractor 2 output folder
    output2Path = u'./003negativeNaiveExtractors/cognates/'
    utilsOs.createEmptyFolder(output2Path)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        srcFilePath = u'{0}.en'.format(filePath) if u'en-fr' in filePath else u'{0}.fr'.format(filePath)
        trgtFilePath = u'{0}.fr'.format(filePath) if u'en-fr' in filePath else u'{0}.en'.format(filePath)
        # open line by line and apply extractors
        try:
            with open(srcFilePath) as srcFile:
                with open(trgtFilePath) as trgtFile:
                    srcLines = srcFile.readlines()
                    trgtLines = trgtFile.readlines()
                    for srcLnIndex, srcLn in enumerate(srcLines):
                        trgtLn = trgtLines[srcLnIndex]
                        # tokenize
                        srcLn = srcLn.lower().replace(u' pm', u'pm')
                        trgtLn = trgtLn.lower().replace(u' pm', u'pm')
                        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
                        srcTokens = utilsString.nltkTokenizer(srcLn, addSeparators)
                        trgtTokens = utilsString.nltkTokenizer(trgtLn, addSeparators)
                        # apply the extractors
                        if 0 in typeOfExtractors:
                            extractedSp, score = applyExtractor(nbMismatch, 0.75, srcTokens, trgtTokens,
                                                                extractedSp, filePath, 0, int(srcLnIndex))
                        if 1 in typeOfExtractors:
                            # get context scores and location in doc
                            cntxtScores = getContextScores(srcLnIndex, srcLines, trgtLines)
                            docLoc = srcLnIndex / len(srcLines)
                            extractedSp, score = applyExtractor(tableOfContents, 0.32, srcTokens, trgtTokens,
                                                                extractedSp, filePath, 1, int(srcLnIndex),
                                                                contextScores=cntxtScores, placeInDocument=docLoc)
                        if 2 in typeOfExtractors:
                            extractedSp, score = applyExtractor(cognateCoincidence, 0.1, srcTokens, trgtTokens,
                                                                extractedSp, filePath, 2, int(srcLnIndex))
                    totalLines += len(srcLines)
        # some folders have no .en and .fr file for each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        except FileNotFoundError:
            pass
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(extractedSp,
                               pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
                               overwrite=True)
    # randomly extract and dump the file path and the line index for the extracted SP
    randomlyExtractAndDump(extractedSp, extractionSize, subsetName)
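# A minimal usage sketch of the misalignment extractor above; the corpus root
# is hypothetical, and only the number-coincidence (0) and few-tokens (1)
# extractors are switched on.
def _exampleExtractMisaligned():
    extractMisalignedSP(u'/path/to/corpus_renamed/MISALIGNED/',
                        extractionSize=100, typeOfExtractors=[0, 1])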