import os
import json
import codecs

# project-local helper modules used throughout this file
import utilsOs
import utilsString
import utilsGraph


def makeARefFile(rootFolder=u"/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/",
                 refFilePath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    # make sure the file does not yet exist
    if utilsOs.theFileExists(refFilePath) is True:
        return None
    utilsOs.createEmptyFile(refFilePath)
    listOfFiles = utilsOs.goDeepGetFiles(rootFolder, format=u".tmx.en")
    with open(refFilePath, u"a") as refFile:
        for filePath in listOfFiles:
            refFile.write(u"{0}\t-1\n".format(filePath.replace(u".tmx.en", u".tmx")))
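
# Usage sketch (illustrative only; relies on the default corpus and reference paths above):
#   makeARefFile()  # writes one "<path>\t-1" line per .tmx file found under rootFolder
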
def dumpJobTitleAndDescription(jsonDict, pathOutputFile='./job+pitch.tsv', addJobDescription=False):
    '''
    Saves the basic linkedIn job data in a tsv file
    each line shows:
        - one job title (or variant)
        - its description(s) / personal pitch(es) / NA (not applicable)
    '''
    # file of job title names (and optional description)
    with utilsOs.createEmptyFile(pathOutputFile) as outputJobtitles:
        for jobTitle, jobData in jsonDict.items():
            # only the job title
            if addJobDescription == False:
                content = u'%s\n' % (jobTitle)
            # job title + description
            else:
                # if there are one or multiple specific descriptions of the job
                if u'description' in jobData and len(jobData[u'description']) > 0:
                    content = u'%s\t%s\n' % (jobTitle, u' _#####_ '.join(jobData[u'description']))
                # if there are one or multiple personal pitches that might give us an idea of what the job is
                elif u'pitch' in jobData and len(jobData[u'pitch']) > 0:
                    content = u'%s\t%s\n' % (jobTitle, u' _#####_ '.join(jobData[u'pitch']))
                # if there is nothing then it's Not Applicable
                else:
                    content = u'%s\t%s\n' % (jobTitle, u'NA')
            # dumping to file
            outputJobtitles.write(content)
    return
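
# Usage sketch (illustrative only; "jobTitlesDataDict" is a hypothetical dict of the form produced by getJobData):
#   dumpJobTitleAndDescription(jobTitlesDataDict, pathOutputFile=u'./job+pitch.tsv', addJobDescription=True)
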
def edgeListTemp(pathInput, pathTempFile, lowercaseItAll=False):
    '''
    Takes the linkedIn data and makes a temporary file that is an edge list of (columns):
        - jobNode (source)
        - skillNode (target)
    It's only a temporary file because we still need to erase doubles, make the weight
    (count the coreference of skills) and count how many times the job titles appeared
    '''
    # we open a temp file
    outputTempFile = utilsOs.createEmptyFile(pathTempFile)  # don't specify that the headerLine is 'Source \t Target'
    with open(pathInput) as jsonFile:
        # we read the original json file line by line
        jsonData = jsonFile.readline()
        while jsonData:
            jsonDict = json.loads(jsonData)
            # if there are experiences
            if u'experiences' in jsonDict:
                # reliable job-skill correspondence if we only have one job title
                if len(jsonDict[u'experiences']) == 1:
                    if u'function' in jsonDict[u'experiences'][0]:
                        jobTitle = jsonDict[u'experiences'][0][u'function']
                        if lowercaseItAll != False:
                            jobTitle = jobTitle.lower()
                        if u'skills' in jsonDict:
                            for skillDict in jsonDict[u'skills']:
                                skill = skillDict[u'name']
                                if lowercaseItAll != False:
                                    skill = skill.lower()
                                outputTempFile.write(u'{0}\t{1}\n'.format(jobTitle, skill))
            jsonData = jsonFile.readline()
    # closing the file
    outputTempFile.close()
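
# Usage sketch (illustrative only; the paths are hypothetical placeholders):
#   edgeListTemp(u'./linkedInProfiles.json', u'./edgeList.temp', lowercaseItAll=True)
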
def nodeListIdType(pathEdgeListFile, pathNodeFileOutput):
    '''
    Opens the temp file containing the extracted linkedIn data and makes a node list of (columns):
        - id (same as label)
        - label (jobTitle / skill node)
        - type (source or target; 2 for source, 1 for target)
    # the job title is always the source, the skill is always the target
    '''
    jobTitleSet = set()
    skillSet = set()
    # open the output file
    outputTxt = utilsOs.createEmptyFile(pathNodeFileOutput, headerLine=u'Id\tLabel\tNodeType')
    with codecs.open(pathEdgeListFile, u'r', encoding=u'utf8') as edgeData:
        dataLine = edgeData.readline()
        while dataLine:
            dataList = dataLine.replace(u'\n', u'').split(u'\t')
            if len(dataList) > 1:
                # add to the jobTitle (source) set
                jobTitleSet.add(dataList[0])
                # add to the skill (target) set
                skillSet.add(dataList[1])
            # get to the next line
            dataLine = edgeData.readline()
    # browse the data sets to dump them
    for jobTitle in jobTitleSet:
        # the id's '__s' suffix means 'source', 2 means 'source'
        outputTxt.write(u'{0}\t{1}\t{2}\n'.format(jobTitle, jobTitle.replace(u'__s', u''), 2))
    for skill in skillSet:
        # the id's '__t' suffix means 'target', 1 means 'target'
        outputTxt.write(u'{0}\t{1}\t{2}\n'.format(skill, skill.replace(u'__t', u''), 1))
    # closing the file
    outputTxt.close()
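
# Usage sketch (illustrative only; assumes an edge list already dumped by edgeListDump, paths are hypothetical):
#   nodeListIdType(u'./edgeList.tsv', u'./nodeList.tsv')
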
def makeJobSetFromLinkedIn(pathInput, lowercaseItAll=False, pathOutput=None, n=float('inf')):
    '''
    Makes a set of jobs taken from linkedIn profiles
    IF a "pathOutput" is given we save the set as a txt file of one job per line
    IF an "n" argument is given it reads only as many profiles as necessary to gather N functions (job titles)
    '''
    jobSet = set()
    # open the output file
    if pathOutput != None:
        outputTxt = utilsOs.createEmptyFile(u'%slistOfJobs.linkedIn' % (pathOutput))
    # we read the original json file line by line
    with open(pathInput) as jsonFile:
        # if we want to make a sample, n must be an int, otherwise it will keep going until eof
        jsonData = jsonFile.readline()
        while jsonData and len(jobSet) < n:
            jobDict = getJobData(json.loads(jsonData))
            # we add the job titles
            jobsToAdd = set(jobDict.keys())
            # if we want them lowercased
            if lowercaseItAll != False:
                jobsToAdd = set([e.lower() for e in jobsToAdd])
            # we dump each job title into the txt file
            if pathOutput != None:
                for jobTitle in jobsToAdd:
                    outputTxt.write(u'{0}\n'.format(jobTitle))
            # adding to the set
            jobSet = jobSet.union(jobsToAdd)
            # next line
            jsonData = jsonFile.readline()
    # closing the file
    if pathOutput != None:
        outputTxt.close()
    return jobSet
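
# Usage sketch (illustrative only; the paths are hypothetical placeholders):
#   jobSet = makeJobSetFromLinkedIn(u'./linkedInProfiles.json', lowercaseItAll=True, pathOutput=u'./output/', n=1000)
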
def makeSampleFileHavingNJobTitles(pathInput, pathOutput, n=1000000, addJobDescription=False):
    '''
    Takes the real linkedIn data and makes a sample containing as many profiles as
    necessary to gather N functions (job titles)
    '''
    dictJobTitlesData = {}
    # sample of all candidates data
    outputJson = utilsOs.createEmptyFile(u'%ssample.json' % (pathOutput))
    # we read the original json file line by line
    with codecs.open(pathInput, u'r', encoding=u'utf8') as jsonFile:
        while len(dictJobTitlesData) < n:
            jsonData = jsonFile.readline()
            # stop if we reach the end of the file before gathering n job titles
            if not jsonData:
                break
            # we dump each line into the sample file
            outputJson.write(jsonData.replace(u'\r', ''))
            # we make a dict out of the string line
            jsonLine = utilsOs.convertJsonLineToDict(jsonData)
            if jsonLine != None:
                # we add each job title to the job title data dict
                dictJobTitlesData = getJobData(jsonLine, dictJobTitlesData)
    # dumping dict content in json
    utilsOs.dumpDictToJsonFile(dictJobTitlesData, pathOutputFile=u'%sjobTitlesDataDict.json' % (pathOutput))
    # SIMPLIFIED DATA: dumping job title (and optional description) to a file
    dumpJobTitleAndDescription(dictJobTitlesData, u'%sjob+pitch.tsv' % (pathOutput), addJobDescription)
    # closing the files
    outputJson.close()
    return None
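
# Usage sketch (illustrative only; the paths are hypothetical placeholders):
#   makeSampleFileHavingNJobTitles(u'./linkedInProfiles.json', u'./sampleOutput/', n=10000, addJobDescription=True)
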
def getANewSpWhereWeLeftOff(refPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    # check if the ref file already exists
    if utilsOs.theFileExists(refPath) is False:
        utilsOs.createEmptyFile(refPath)
    # open the reference file
    lastSeenIndex, lastSeenPath = None, None
    with open(refPath) as ref:
        # read all the reference lines
        refLns = ref.readlines()
        refIndex = 0
        for refLn in refLns:
            refList = refLn.replace(u"\n", u"").split(u"\t")
            # test if we have an index for the path
            try:
                lastSeenIndex = int(refList[1])
                lastSeenPath = refList[0]
                break
            # if there is no integer, then all the lines of that path have already been seen
            except ValueError:
                pass
            # next ref index
            refIndex += 1
    # open the last seen file at the (last seen index + 1) and return the sp in the en and fr files
    if lastSeenIndex is None:
        return None
    with open(u"{0}.en".format(lastSeenPath)) as enFile:
        with open(u"{0}.fr".format(lastSeenPath)) as frFile:
            enLn = enFile.readline()
            frLn = frFile.readline()
            indexLn = 0
            while enLn:
                if indexLn == lastSeenIndex + 1:
                    # replace the line with its next index and dump the ref file
                    refLns[refIndex] = u"{0}\t{1}\n".format(lastSeenPath, indexLn)
                    # return the sentence pair
                    return [enLn.replace(u"\n", u""), frLn.replace(u"\n", u"")], lastSeenPath, indexLn, refLns
                # next line
                enLn = enFile.readline()
                frLn = frFile.readline()
                indexLn += 1
    # if we went over the whole document and it ended, change the ref line, dump it and start over
    refLns[refIndex] = u"{0}\tdone\n".format(lastSeenPath)
    utilsOs.dumpRawLines(refLns, refPath, addNewline=False, rewrite=True)
    return getANewSpWhereWeLeftOff(refPath)
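
# Usage sketch (illustrative only; relies on the default reference file path above):
#   result = getANewSpWhereWeLeftOff()
#   if result is not None:
#       (enSent, frSent), filePath, lineIndex, refLns = result
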
def makeSimpleTokenDatasetFromTsv(tsvInputFilePath, originalStringColumnName, correctStringColumnName, outputFilePath,
                                  outputOriginalColumnName=u'input', outputCorrectColumnName=u'output', caseSensitive=True):
    '''
    Takes a tsv file, naively space-char tokenizes the content of the original and correct columns,
    matches the original tokens with the correct tokens and dumps each token pair in a row of an output tsv file
    '''
    total = 0
    nonErrors = 0
    # create empty output file
    outputFile = utilsOs.createEmptyFile(outputFilePath, u'{0}\t{1}\n'.format(outputOriginalColumnName, outputCorrectColumnName))
    # browse the input file line by line
    with open(tsvInputFilePath, u'r', encoding=u'utf8') as inputFile:
        # header line
        headerList = (inputFile.readline().replace(u'\n', u'')).split(u'\t')
        # find the column indexes corresponding to the column names
        originalIndex = headerList.index(originalStringColumnName) if originalStringColumnName in headerList else 0
        correctIndex = headerList.index(correctStringColumnName) if correctStringColumnName in headerList else 1
        # first line
        line = inputFile.readline()
        while line:
            # case sensitivity
            if caseSensitive != True:
                line = line.lower()
            # get the list of elements in the line
            lineList = (line.replace(u'\n', u'')).split(u'\t')
            # get the tokens in the original string (in this case: tokens = space char separated elements)
            originalTokens = lineList[originalIndex].split(u' ')
            # get the erratic correspondences between the original string and the correct string
            errorTokens = utilsString.getcorrespondingTokensAndEditDist(lineList[originalIndex], lineList[correctIndex], caseSensitive)
            # write the non problematic tokens
            for origToken in originalTokens:
                if origToken not in [tupl[0] for tupl in errorTokens]:
                    outputFile.write(u'{0}\t{1}\n'.format(origToken, origToken))
                    nonErrors += 1
            # write the problematic ones
            for tupl in errorTokens:
                outputFile.write(u'{0}\t{1}\n'.format(tupl[0], tupl[1]))
            total += 1
            line = inputFile.readline()
    print(nonErrors, total, nonErrors / total)
    return nonErrors, total
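
# Usage sketch (illustrative only; the column names and paths are hypothetical placeholders):
#   makeSimpleTokenDatasetFromTsv(u'./normDataset.tsv', u'Original', u'GoldStandard', u'./tokenDataset.tsv', caseSensitive=False)
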
def makeGoldStandardOrora(inputPath, outputPath, goldStandardPath):
    '''
    Opens the input and output files and makes one gold standard file pairing each input with its corresponding output
    '''
    # INCLUDING the information block columns
    ###headerLine = u'Id\tCommentIn\tCommentOut\tInformation blocks\tCoded information block\tBlock type'
    # EXCLUDING the information block columns
    headerLine = u'Id\tCommentIn\tCommentOut'
    # create empty gold standard file
    gsFile = utilsOs.createEmptyFile(goldStandardPath, headerLine)
    # browse the output file line by line
    with open(outputPath, u'r', encoding=u'utf8') as outFile:
        # header line
        headerList = (outFile.readline().replace(u'\n', u'')).split(u'\t')
        idCodeColName, commentColName = headerList[1], headerList[2]
        # dataframe
        inputDf = utilsGraph.getDataFrameFromArgs(inputPath)
        line = outFile.readline()
        # populate the gold standard
        while line:
            # get data
            lineList = (line.replace(u'\n', u'')).split(u'\t')
            # we replace any tabulation in the comments with ' ___ ' so there are no inconsistencies with the tsv (tab based)
            idCode, commentOutLabel, theRest = lineList[1], lineList[2].replace(u'\t', u' ___ '), lineList[3:]
            # select
            selectInputDf = inputDf.loc[inputDf[u'Id'] == int(idCode)]
            # get the input comment
            commentInLabel = (selectInputDf.loc[selectInputDf[idCodeColName] == int(idCode)])[u'Comment'].tolist()
            if len(commentInLabel) == 1:
                commentInLabel = commentInLabel[0].replace(u'\t', u' ___ ')
            else:
                print('there are multiple rows with the same ID code', idCode, commentInLabel)
            # write the line INCLUDING the information block columns
            ###gsFile.write(u'{0}\t{1}\t{2}\t{3}\n'.format(idCode, commentInLabel, commentOutLabel, u'\t'.join(theRest)))
            # write the line EXCLUDING the information block columns
            gsFile.write(u'{0}\t{1}\t{2}\n'.format(idCode, commentInLabel, commentOutLabel))
            # next line
            line = outFile.readline()
    # close the file
    gsFile.close()
    # remove the duplicate rows
    gsDf = utilsGraph.getDataFrameFromArgs(goldStandardPath)
    gsDf = gsDf.drop_duplicates()
    gsDf.to_csv(goldStandardPath, sep='\t', index=False)
    return gsDf
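
# Usage sketch (illustrative only; the paths are hypothetical placeholders):
#   gsDf = makeGoldStandardOrora(u'./ororaInput.tsv', u'./ororaOutput.tsv', u'./goldStandard.tsv')
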
def edgeListDump(pathTempFile, pathOutput):
    '''
    Opens the temp file containing the extracted linkedIn data and makes an edge list of (columns):
        - jobNode (source)
        - skillNode (target)
        - weight (coreference)
        - nbOfTimesJobTitleAppeared
    [in a further function we might want to add keywords (non stop-word most common tokens for each job title)]
    '''
    skillCorefDict = {}
    jobTitleCorefDict = {}
    lastJobTitle = None
    lineSet = set()
    # open the output file
    outputTxt = utilsOs.createEmptyFile(pathOutput, headerLine=u'Source\tTarget\tWeight\tWeight1')
    # we browse the data once to get the weight and nbOfTimesJobTitleAppeared data
    with codecs.open(pathTempFile, u'r', encoding=u'utf8') as tempData:
        dataLine = tempData.readline()
        while dataLine:
            dataList = dataLine.replace(u'\n', u'').split(u'\t')
            if len(dataList) > 1:
                # count the skill corefs
                skillCorefDict[dataList[1]] = skillCorefDict.get(dataList[1], 0) + 1
                # count the repetitions of job titles
                if dataList[0] != lastJobTitle:
                    jobTitleCorefDict[dataList[0]] = jobTitleCorefDict.get(dataList[0], 0) + 1
                    lastJobTitle = dataList[0]
                # we add the line to the set
                lineSet.add(dataLine)
            # get to the next line
            dataLine = tempData.readline()
    # we browse the data a second time to dump it
    for dataLine in lineSet:
        dataList = dataLine.replace(u'\n', u'').split(u'\t')
        # we write 2 possible edge weights: skill coreference & skill coreference * job title coreference
        outputTxt.write(u'{0}__s\t{1}__t\t{2}\t{3}\n'.format(dataList[0], dataList[1], skillCorefDict[dataList[1]],
                                                             skillCorefDict[dataList[1]] * jobTitleCorefDict[dataList[0]]))
    # closing the file
    outputTxt.close()
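
# Usage sketch (illustrative only; chains the temp edge list into the final weighted edge list, paths are hypothetical):
#   edgeListTemp(u'./linkedInProfiles.json', u'./edgeList.temp')
#   edgeListDump(u'./edgeList.temp', u'./edgeList.tsv')
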
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """
    Makes 2 vocabulary files (occurrence dicts) in the format needed by giza++ or mgiza++,
    then reformats the corpus into the format needed by giza++ or mgiza++
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :return: the paths to the vocabulary files, the giza-format corpus and the frequency dict files
    """
    # prepare the output paths
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")
    # if there is no file there yet, open the corpus files and count the frequency of each token
    if overwrite is True or os.path.isfile(outputEnDictPath) is False:
        # make the frequency dicts
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        # open the corpus files and count the frequency of the sentence pairs
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        # sort the dicts by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        # make the id dicts
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # dump the dicts
        utilsOs.dumpDictToJsonFile(enTokFreqDict, outputEnDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frTokFreqDict, outputFrDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(spFreqDict, outputSpDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(enIdDict, outputEnIdDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frIdDict, outputFrIdDictPath, overwrite)
    # if the files already exist or if overwrite is False
    else:
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        # sort the dicts by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    # dump the empty tok voc files
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)
    # dump the dicts in the tok voc files
    for indKv, kv in enumerate(orderedKeysValuesEn):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputEnPath, addNewLine=True)
    for indKv, kv in enumerate(orderedKeysValuesFr):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputFrPath, addNewLine=True)
    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile, outputPathGizaFormatCorpus, enIdDict, frIdDict, spFreqDict)
    return outputEnPath, outputFrPath, outputPathGizaFormatCorpus, outputEnDictPath, outputFrDictPath, outputSpDictPath
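
# Usage sketch (illustrative only; the corpus paths are hypothetical placeholders):
#   enVcb, frVcb, gizaCorpus, enJson, frJson, spJson = reformatFilesPreGiza(u'./corpus/train.en', u'./corpus/train.fr', overwrite=True)
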
def applyNormalisationGetResult(testFilePath, normOutPath=None, ororazeOutput=(True, True), useAbbrDict=False,
                                normalizationFunction=None, *args):
    '''
    If normalizationFunction is None, then it will create the baseline,
    otherwise it will apply the normalization function, ororaze the output and evaluate it
    '''
    positiveEvalCounter = 0
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        # get total number of comments
        totalComments = utilsOs.countLines(gsFile) - 1
    # create an empty file for the norm
    if normOutPath != None:
        normFile = utilsOs.createEmptyFile(normOutPath,
                                           headerLine=u'Id\tEvaluation\tErrorTokens\tOriginal\tOutput\tGoldStandard')
        # create a separate file for each column
        origFile = utilsOs.createEmptyFile(normOutPath.replace(u'.tsv', u'1Orig.tsv'), headerLine=u'Id\tOriginal')
        outFile = utilsOs.createEmptyFile(normOutPath.replace(u'.tsv', u'2Out.tsv'), headerLine=u'Id\tEvaluation\tOutput')
        goldFile = utilsOs.createEmptyFile(normOutPath.replace(u'.tsv', u'3Gold.tsv'), headerLine=u'Id\tGoldStandard')
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        # dispose of the header line
        header = gsFile.readline()
        # get first line
        line = gsFile.readline()
        # start an empty dejavuDict
        dejavuDict = {}
        # count and populate the norm
        while line:
            # get data
            lineList = (line.replace(u'\n', u'')).split(u'\t')
            commentId, originalComment, goldStandard = lineList
            normOutput = str(originalComment)
            # detect the french feminine agreement and fossilize the word by modifying its structure
            # to something unchanged by the normalization function
            normOutput = frenchFemininAccordsCodification(originalComment, isInput=True)
            # apply the orora solution to abbreviations
            if useAbbrDict != False:
                if useAbbrDict != True:
                    normOutput = ororaZeAbbreviations(normOutput, useAbbrDict)
                else:
                    normOutput = ororaZeAbbreviations(normOutput)
            # apply the normalization function
            if normalizationFunction != None:
                normOutput, dejavuDict = normalizationFunction(normOutput.lower(), dejavuDict, *args)
            # reverse back the code for the feminine agreement into its original form
            normOutput = frenchFemininAccordsCodification(normOutput, isInput=False)
            # get normalized output
            if ororazeOutput == True:
                normOutput = ororaZe(normOutput, advanced=True)
            elif type(ororazeOutput) is tuple or type(ororazeOutput) is list:
                if ororazeOutput[0] == True:
                    normOutput = ororaZe(normOutput, advanced=ororazeOutput[1])
            # evaluate if the normalized output corresponds to the gold standard
            positiveEvalCounter, evaluation = normalizationEvaluator(normOutput, goldStandard, positiveEvalCounter)
            # get the tokens that do not correspond exactly and their edit distance
            errorTokList = utilsString.getcorrespondingTokensAndEditDist(normOutput, goldStandard) if evaluation == 0 else u'na'
            # dump
            if normOutPath != None:
                normFile.write(u'{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n'.format(commentId, evaluation, errorTokList,
                                                                        originalComment, normOutput, goldStandard))
                # dump to the column separate files
                origFile.write(u'{0}\t{1}\n'.format(commentId, originalComment))
                outFile.write(u'{0}\t{1}\t{2}\t{3}\n'.format(commentId, evaluation, errorTokList, normOutput))
                goldFile.write(u'{0}\t{1}\n'.format(commentId, goldStandard))
            # next line
            line = gsFile.readline()
    # close the norm file
    if normOutPath != None:
        normFile.close()
        # close the other files
        origFile.close()
        outFile.close()
        goldFile.close()
        # dump the results
        resultsPath = u'{0}.results'.format(normOutPath.replace(u'.tsv', u''))
        utilsOs.dumpRawLines([u'NORMALIZATION RESULTS',
                              u'exact positives: {0}/{1}'.format(positiveEvalCounter, totalComments),
                              u'ratio: {0}'.format(float(positiveEvalCounter) / float(totalComments))], resultsPath)
    return {u'exact positives': positiveEvalCounter,
            u'total comments': totalComments,
            u'ratio': (float(positiveEvalCounter) / float(totalComments))}
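
# Usage sketch (illustrative only; the test file path is a hypothetical placeholder and passing no
# normalization function produces the baseline):
#   results = applyNormalisationGetResult(u'./goldStandard.tsv', normOutPath=u'./norm.tsv', ororazeOutput=(True, True))
#   print(results[u'ratio'])
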