Example #1
def makeARefFile(rootFolder=u"/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/",
                 refFilePath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    # make sure the file does not exist yet
    if utilsOs.theFileExists(refFilePath) is True:
        return None
    utilsOs.createEmptyFile(refFilePath)
    listOfFiles = utilsOs.goDeepGetFiles(rootFolder, format=u".tmx.en")
    with open(refFilePath, u"a") as refFile:
        for filePath in listOfFiles:
            refFile.write(u"{0}\t-1\n".format(filePath.replace(u".tmx.en", u".tmx")))
Example #2
def dumpJobTitleAndDescription(jsonDict,
                               pathOutputFile='./job+pitch.tsv',
                               addJobDescription=False):
    '''
	Saves the basic LinkedIn job data in a tsv file
	each line shows:
	- one job title (or variant)
	- its description(s) / personal pitch(es) / NA (not applicable)
	'''
    #file of job title names (and optional descriptions)
    with utilsOs.createEmptyFile(pathOutputFile) as outputJobtitles:
        for jobTitle, jobData in jsonDict.items():
            #only the job title
            if not addJobDescription:
                content = u'%s\n' % (jobTitle)
            #job title + description
            else:
                #if there are one or more specific descriptions of the job
                if u'description' in jobData and len(jobData[u'description']) > 0:
                    content = u'%s\t%s\n' % (jobTitle, u' _#####_ '.join(jobData[u'description']))
                #if there are one or more personal pitches that might give us an idea of what the job is
                elif u'pitch' in jobData and len(jobData[u'pitch']) > 0:
                    content = u'%s\t%s\n' % (jobTitle, u' _#####_ '.join(jobData[u'pitch']))
                #if there is nothing, it is not applicable (NA)
                else:
                    content = u'%s\t%s\n' % (jobTitle, u'NA')
            #dumping to file
            outputJobtitles.write(content)
    return
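A hypothetical call of the function above with a tiny in-memory dict shaped like the data it expects (keys u'description' and u'pitch'); the output path is a placeholder.

jsonDict = {
    u'data analyst': {u'description': [u'builds dashboards', u'writes SQL'], u'pitch': []},
    u'translator': {u'description': [], u'pitch': [u'I translate EN-FR technical texts']},
}
dumpJobTitleAndDescription(jsonDict, pathOutputFile=u'./job+pitch.tsv', addJobDescription=True)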
Example #3
def edgeListTemp(pathInput, pathTempFile, lowercaseItAll=False):
	'''
	takes the linkedin data and makes a temporary file that is an edge list of (columns):
		- jobNode(source)
		- skillNode(target)
	It is only a temporary file because we still need to remove duplicates,
	compute the weight (skill co-reference count) and count how many
	times each job title appeared
	'''
	#we open a temp file
	outputTempFile = utilsOs.createEmptyFile(pathTempFile) #we deliberately omit the 'Source\tTarget' headerLine here

	with open(pathInput) as jsonFile:
		#we read the original json file line by line
		jsonData = jsonFile.readline()
		while jsonData:
			jsonDict = json.loads(jsonData)
			#if there are experiences
			if u'experiences' in jsonDict:
				#reliable job-skill correspondence if we only have one job title
				if len(jsonDict[u'experiences']) == 1:
					if u'function' in jsonDict[u'experiences'][0]:
						jobTitle = jsonDict[u'experiences'][0][u'function']
						if lowercaseItAll:
							jobTitle = jobTitle.lower()
						if u'skills' in jsonDict:
							for skillDict in jsonDict[u'skills']:
								skill = skillDict[u'name']
								if lowercaseItAll:
									skill = skill.lower()
								outputTempFile.write(u'{0}\t{1}\n'.format(jobTitle, skill))
								#outputTxt.write(u'{0}\t{1}\n'.format(jobTitle, skill))
			jsonData = jsonFile.readline()
	#closing the file	
	outputTempFile.close()
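A possible call of the function above; it assumes the input is a JSON-lines file with one LinkedIn profile per line, each optionally carrying u'experiences' (with a u'function') and u'skills' (with a u'name'). Paths are placeholders.

edgeListTemp(pathInput=u'./linkedInProfiles.jsonl',
             pathTempFile=u'./edgeList.temp.tsv',
             lowercaseItAll=True)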
Example #4
def nodeListIdType(pathEdgeListFile, pathNodeFileOutput):
	'''
	opens the temp file containing the extracted linkedin data and makes a node list of (columns):
		- id(same as label)
		- label(jobTitle / skill node)	
		- type(source or target; 2 for source 1 for target) #the job title is always the source, the skill is always the target
	'''
	jobTitleSet = set()
	skillSet = set()

	#open the output file
	outputTxt = utilsOs.createEmptyFile(pathNodeFileOutput, headerLine=u'Id\tLabel\tNodeType')

	with codecs.open(pathEdgeListFile, u'r', encoding=u'utf8') as edgeData:
		dataLine = edgeData.readline()
		while dataLine:
			dataList = dataLine.replace(u'\n', u'').split(u'\t')
			if len(dataList) > 1:
				#add to the jobTitle (source) set
				jobTitleSet.add(dataList[0])
				#add to the skill (target) set
				skillSet.add(dataList[1])
			#get to the next line
			dataLine = edgeData.readline()
	#browse the data sets to dump them
	for jobTitle in jobTitleSet:
		outputTxt.write(u'{0}\t{1}\t{2}\n'.format(jobTitle, jobTitle.replace(u'__s', u''), 2)) #the id's '__s' suffix means 'source', type 2 means 'source'
	for skill in skillSet:
		outputTxt.write(u'{0}\t{1}\t{2}\n'.format(skill, skill.replace(u'__t', u''), 1)) #the id's '__t' suffix means 'target', type 1 means 'target'
	#closing the file
	outputTxt.close()
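A usage sketch; it assumes the edge list file already carries the '__s'/'__t' id suffixes written by edgeListDump below. Paths are placeholders.

nodeListIdType(pathEdgeListFile=u'./edgeList.tsv',
               pathNodeFileOutput=u'./nodeList.tsv')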
Example #5
def makeJobSetFromLinkedIn(pathInput, lowercaseItAll=False, pathOutput=None, n=float('inf')):
	'''
	makes a set of jobs taken from linkedIn profiles 
		IF a "pathOutput" is given we save the set as a txt file
		of one job per line 
		IF an "n" argument is given it makes a samble containing how
		many profiles necesary to achieve N functions (jobtitles)
	'''
	jobSet = set()
	#open the file
	if pathOutput is not None:
		outputTxt = utilsOs.createEmptyFile(u'%slistOfJobs.linkedIn' %(pathOutput))
	#we read the original json file line by line
	with open(pathInput) as jsonFile:
		#if we want to make a sample n must be an int otherwise it will keep going until eof
		jsonData = jsonFile.readline()
		while jsonData and len(jobSet) < n:
			jobDict = getJobData(json.loads(jsonData))
			#we add the job titles
			jobsToAdd = set(jobDict.keys())
			#if we want them lowercased
			if lowercaseItAll:
				jobsToAdd = set([e.lower() for e in jobsToAdd])
			#we dump each job title into the txt file
			if pathOutput is not None:
				for jobTitle in jobsToAdd:
					outputTxt.write(u'{0}\n'.format(jobTitle))
			#adding to the set
			jobSet = jobSet.union(jobsToAdd)
			#nextLine
			jsonData = jsonFile.readline()
	#closing the file	
	if pathOutput is not None:
		outputTxt.close()
	return jobSet
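A hedged example of collecting a capped job-title set; paths are placeholders, and getJobData is assumed to be defined elsewhere in the same module.

jobSet = makeJobSetFromLinkedIn(pathInput=u'./linkedInProfiles.jsonl',
                                lowercaseItAll=True,
                                pathOutput=u'./output/',
                                n=500)
print(len(jobSet))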
Example #6
def makeSampleFileHavingNJobTitles(pathInput, pathOutput, n=1000000, addJobDescription=False):
	'''
	takes the real linkedIn data and makes a sample containing as
	many profiles as necessary to achieve N functions (job titles)
	'''
	dictJobTitlesData = {}
	#sample of all candidates data
	outputJson = utilsOs.createEmptyFile(u'%ssample.json' %(pathOutput))
	
	#we read the original json file line by line
	with codecs.open(pathInput, u'r', encoding=u'utf8') as jsonFile:
		while len(dictJobTitlesData) < n:
			jsonData = jsonFile.readline()
			#stop if we reach the end of the file before getting n job titles
			if not jsonData:
				break
			#we dump each line into the sample file
			outputJson.write(jsonData.replace(u'\r', ''))
			#we make a dict out of the string line
			jsonLine = utilsOs.convertJsonLineToDict(jsonData)
			if jsonLine is not None:
				#we dump each job title into the jobtitle file
				dictJobTitlesData = getJobData(jsonLine, dictJobTitlesData)

	#dumping dict content in json
	utilsOs.dumpDictToJsonFile(dictJobTitlesData, pathOutputFile=u'%sjobTitlesDataDict.json'%(pathOutput))

	#SIMPLIFIED DATA dumping job title (and optional description) to a file
	dumpJobTitleAndDescription(dictJobTitlesData, u'%sjob+pitch.tsv'%(pathOutput), addJobDescription)
	
	#closing the files
	outputJson.close()
	return None
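A possible call; it assumes pathOutput is an existing folder prefix (the function only prepends it to the output file names) and that the input JSON-lines file holds at least n distinct job titles.

makeSampleFileHavingNJobTitles(pathInput=u'./linkedInProfiles.jsonl',
                               pathOutput=u'./sample/',
                               n=1000,
                               addJobDescription=True)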
Example #7
def getANewSpWhereWeLeftOff(refPath=u"/data/rali5/Tmp/alfonsda/workRali/004tradBureau/017deepLTranslatedCorpus/trRef"):
    # check if the ref file already exists
    if utilsOs.theFileExists(refPath) is False:
        utilsOs.createEmptyFile(refPath)
    # open the reference file
    lastSeenIndex, lastSeenPath = None, None
    with open(refPath) as ref:
        # first line
        refLns = ref.readlines()
        refIndex = 0
        for refLn in refLns:
            refList = refLn.replace(u"\n", u"").split(u"\t")
            # test if we have an index for the path
            try:
                lastSeenIndex = int(refList[1])
                lastSeenPath = refList[0]
                break
            # if there is no integer, then all lines for that path have already been seen
            except ValueError:
                pass
            # next ref index
            refIndex += 1
    # open the last seen file at the (last seen index + 1) and return the sp in the en and fr files
    if lastSeenIndex is None:
        return None
    with open(u"{0}.en".format(lastSeenPath)) as enFile:
        with open(u"{0}.fr".format(lastSeenPath)) as frFile:
            enLn = enFile.readline()
            frLn = frFile.readline()
            indexLn = 0
            while enLn:
                if indexLn == lastSeenIndex+1:
                    # replace the line with its next index so the updated ref lines can be dumped later
                    refLns[refIndex] = u"{0}\t{1}\n".format(lastSeenPath, indexLn)
                    # return the sentence pair
                    return [enLn.replace(u"\n", u""), frLn.replace(u"\n", u"")], lastSeenPath, indexLn, refLns
                # next line
                enLn = enFile.readline()
                frLn = frFile.readline()
                indexLn += 1
    # if we went over the whole document and it ended, change the ref line, dump it and start over
    refLns[refIndex] = u"{0}\tdone\n".format(lastSeenPath)
    utilsOs.dumpRawLines(refLns, refPath, addNewline=False, rewrite=True)
    return getANewSpWhereWeLeftOff(refPath)
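A sketch of how the function above might be consumed; it returns None when the reference file is empty or every path is marked done, otherwise the 4-tuple shown. The refPath is a placeholder.

result = getANewSpWhereWeLeftOff(refPath=u'./trRef')
if result is not None:
    sentencePair, lastSeenPath, indexLn, refLns = result
    print(sentencePair[0], u'|||', sentencePair[1])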
Example #8
def makeSimpleTokenDatasetFromTsv(tsvInputFilePath,
                                  originalStringColumnName,
                                  correctStringColumnName,
                                  outputFilePath,
                                  outputOriginalColumnName=u'input',
                                  outputCorrectColumnName=u'output',
                                  caseSensitive=True):
    ''' takes a tsv file, naively space-char tokenizes the content in the original
	and correct columns, matches the original tokens with the correct tokens and dumps
	each token pair in a row of an output tsv file '''
    total = 0
    nonErrors = 0
    #create empty output file
    outputFile = utilsOs.createEmptyFile(
        outputFilePath, u'{0}\t{1}\n'.format(outputOriginalColumnName,
                                             outputCorrectColumnName))
    #browse the input file line by line
    with open(tsvInputFilePath, u'r', encoding=u'utf8') as inputFile:
        #header line
        headerList = (inputFile.readline().replace(u'\n', u'')).split(u'\t')
        #find the column indexes corresponding to the column names
        originalIndex = headerList.index(
            originalStringColumnName
        ) if originalStringColumnName in headerList else 0
        correctIndex = headerList.index(
            correctStringColumnName
        ) if correctStringColumnName in headerList else 1
        #first line
        line = inputFile.readline()
        while line:
            #case sensitivity
            if not caseSensitive:
                line = line.lower()
            #get the list of elements in the line
            lineList = (line.replace(u'\n', u'')).split(u'\t')
            #get the tokens in the original string (in this case: tokens = space char separated elements)
            originalTokens = lineList[originalIndex].split(u' ')
            #get the erratic correspondences between the original string and the correct string
            errorTokens = utilsString.getcorrespondingTokensAndEditDist(
                lineList[originalIndex], lineList[correctIndex], caseSensitive)
            for origToken in originalTokens:
                #write the non-problematic tokens
                if origToken not in [tupl[0] for tupl in errorTokens]:
                    outputFile.write(u'{0}\t{1}\n'.format(origToken, origToken))
                    nonErrors += 1
                total += 1
            #write the problematic ones (once per line, not once per original token)
            for tupl in errorTokens:
                outputFile.write(u'{0}\t{1}\n'.format(tupl[0], tupl[1]))
            line = inputFile.readline()
    print(nonErrors, total, nonErrors / total)
    return nonErrors, total
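A hypothetical call; the column names below are placeholders that must match the header of the input TSV, and utilsString.getcorrespondingTokensAndEditDist is assumed to be available in the same project.

makeSimpleTokenDatasetFromTsv(tsvInputFilePath=u'./comments.tsv',
                              originalStringColumnName=u'Original',
                              correctStringColumnName=u'GoldStandard',
                              outputFilePath=u'./tokenDataset.tsv',
                              caseSensitive=False)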
Example #9
def makeGoldStandardOrora(inputPath, outputPath, goldStandardPath):
	'''
	opens the input and output file and makes one input with corresponding output file
	'''
	#INCLUDING the information block columns
	###headerLine = u'Id\tCommentIn\tCommentOut\tInformation blocks\tCoded information block\tBlock type'
	#EXCLUDING the information block columns
	headerLine = u'Id\tCommentIn\tCommentOut'
	#create empty gold standard file
	gsFile = utilsOs.createEmptyFile(goldStandardPath, headerLine)
	#browse the output file line by line
	with open(outputPath, u'r', encoding=u'utf8') as outFile:
		#header line
		headerList = (outFile.readline().replace(u'\n', u'')).split(u'\t')
		idCodeColName, commentColName = headerList[1], headerList[2]
		#dataframe
		inputDf = utilsGraph.getDataFrameFromArgs(inputPath)
		line = outFile.readline()
		#populate edge list
		while line:
			#get data
			lineList = (line.replace(u'\n', u'')).split(u'\t')
			#we replace any tabulation in the comments with ' ___ ' so there are no inconsistencies with the tsv (tab based)
			idCode, commentOutLabel, theRest = lineList[1], lineList[2].replace(u'\t', u' ___ '), lineList[3:]
			#select
			selectInputDf = inputDf.loc[ inputDf[u'Id'] == int(idCode) ]
			#get the input comment
			commentInLabel = ( selectInputDf.loc[ selectInputDf[idCodeColName] == int(idCode) ] )[u'Comment'].tolist()
			if len(commentInLabel) == 1:
				commentInLabel = commentInLabel[0].replace(u'\t', u' ___ ')
			else:
				print('there are multiple rows with the same ID code', idCode, commentInLabel)
			#write the line INCLUDING the information block columns
			###gsFile.write( u'{0}\t{1}\t{2}\t{3}\n'.format(idCode, commentInLabel, commentOutLabel, u'\t'.join(theRest)) )
			#write the line EXCLUDING the information block columns
			gsFile.write( u'{0}\t{1}\t{2}\n'.format(idCode, commentInLabel, commentOutLabel) )
			#next line
			line = outFile.readline()
	#close the file
	gsFile.close()
	#remove the row doubles
	gsDf = utilsGraph.getDataFrameFromArgs(goldStandardPath)
	gsDf = gsDf.drop_duplicates()
	gsDf.to_csv(goldStandardPath, sep='\t', index=False)
	return gsDf
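A possible invocation; both the input and output files are assumed to be TSVs whose second and third columns hold the id code and the comment, and the returned value is the deduplicated gold-standard DataFrame.

gsDf = makeGoldStandardOrora(inputPath=u'./ororaInput.tsv',
                             outputPath=u'./ororaOutput.tsv',
                             goldStandardPath=u'./goldStandard.tsv')
print(len(gsDf))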
Example #10
def edgeListDump(pathTempFile, pathOutput):
	'''
	opens the temp file containing the extracted linkedin data and makes an edge list of (columns):
		- jobNode(source)
		- skillNode(target)	
		- weight(coreference) 
		- nbOfTimesJobTitleAppeared
	
	[in a further function we might want to add keywords (non stop-words most common tokens for each jobtitle)]
	'''
	skillCorefDict = {}
	jobTitleCorefDict = {}
	lastJobTitle = None
	lineSet = set()

	#open the output file
	outputTxt = utilsOs.createEmptyFile(pathOutput, headerLine=u'Source\tTarget\tWeight\tWeight1')
	#we browse the data once to get the weight and nbOfTimesJobTitleAppeared data

	with codecs.open(pathTempFile, u'r', encoding=u'utf8') as tempData:
		dataLine = tempData.readline()
		while dataLine:
			dataList = dataLine.replace(u'\n', u'').split(u'\t')
			if len(dataList) > 1:
				#count the skills coref
				skillCorefDict[dataList[1]] = skillCorefDict.get(dataList[1], 0) + 1
				#count the repetitions of job titles
				if dataList[0] != lastJobTitle:
					jobTitleCorefDict[dataList[0]] = jobTitleCorefDict.get(dataList[0], 0) + 1
					lastJobTitle = dataList[0]
				#we add the line to the set
				lineSet.add(dataLine)
			#get to the next line
			dataLine = tempData.readline()
	#we browse the data a second time to dump it
	for dataLine in lineSet:
		dataList = dataLine.replace(u'\n', u'').split(u'\t')
		#we write 2 possible edge weights: skill coreference & skill coreference*jobtitle coreference
		outputTxt.write(u'{0}__s\t{1}__t\t{2}\t{3}\n'.format(dataList[0], dataList[1], skillCorefDict[dataList[1]], skillCorefDict[dataList[1]]*jobTitleCorefDict[dataList[0]]))

	#closing the file	
	outputTxt.close()
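A sketch chaining the LinkedIn graph snippets from this page: first build the raw edge list, then weight it, then derive the node list. All paths are placeholders.

edgeListTemp(u'./linkedInProfiles.jsonl', u'./edgeList.temp.tsv')
edgeListDump(u'./edgeList.temp.tsv', u'./edgeList.tsv')
nodeListIdType(u'./edgeList.tsv', u'./nodeList.tsv')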
Example #11
def reformatFilesPreGiza(pathToEnFile, pathToFrFile, overwrite=True):
    """
    make 2 vocabulary files (occurrence dict) in the format needed by giza++ or mgiza++
    then reformats the corpus into the format needed by giza++ or mgiza++
    :param pathToEnFile: path to the english sentences file
    :param pathToFrFile: path to the french sentences file
    :return: the paths to the vocabulary files, the giza-format corpus and the frequency/id dictionaries
    """
    # prepare the output paths
    outputEnPath = prepareOutPutFile(pathToEnFile, fileName=u"sourceEn.vcb")
    outputFrPath = prepareOutPutFile(pathToFrFile, fileName=u"targetFr.vcb")
    outputPathGizaFormatCorpus = prepareOutPutFile(
        pathToEnFile, fileName=u"sentenceFile.giza")
    outputEnDictPath = prepareOutPutFile(pathToEnFile, fileName=u"en.json")
    outputFrDictPath = prepareOutPutFile(pathToEnFile, fileName=u"fr.json")
    outputSpDictPath = prepareOutPutFile(pathToEnFile, fileName=u"sp.json")
    outputEnIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"enId.json")
    outputFrIdDictPath = prepareOutPutFile(pathToEnFile, fileName=u"frId.json")
    # if there is not a file there yet, open the corpus Files, count the frequency of each token
    if overwrite is True or os.path.isfile(outputEnDictPath) is False:
        # make the frequency dict
        enTokFreqDict = makeFreqDict(pathToEnFile, lang=u"en")
        frTokFreqDict = makeFreqDict(pathToFrFile, lang=u"fr")
        # open the corpus files count the frequency of the sentence pairs
        spFreqDict = makeSPfreqDict(pathToEnFile, pathToFrFile)
        # sort the dict by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        # make the id dict
        enIdDict = makeIdDict(orderedKeysValuesEn)
        frIdDict = makeIdDict(orderedKeysValuesFr)
        # dump dicts
        utilsOs.dumpDictToJsonFile(enTokFreqDict, outputEnDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frTokFreqDict, outputFrDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(spFreqDict, outputSpDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(enIdDict, outputEnIdDictPath, overwrite)
        utilsOs.dumpDictToJsonFile(frIdDict, outputFrIdDictPath, overwrite)
    # if the file already exists or if overwrite is false
    else:
        enTokFreqDict = utilsOs.openJsonFileAsDict(outputEnDictPath)
        frTokFreqDict = utilsOs.openJsonFileAsDict(outputFrDictPath)
        spFreqDict = utilsOs.openJsonFileAsDict(outputSpDictPath)
        enIdDict = utilsOs.openJsonFileAsDict(outputEnIdDictPath)
        frIdDict = utilsOs.openJsonFileAsDict(outputFrIdDictPath)
        # sort the dict by freq
        orderedKeysValuesEn = sorted(enTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
        orderedKeysValuesFr = sorted(frTokFreqDict.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True)
    # dump the empty tok voc file
    if overwrite is True:
        firstLine = u"1\tUNK\t0"
        utilsOs.createEmptyFile(outputEnPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputFrPath, headerLine=firstLine)
        utilsOs.createEmptyFile(outputPathGizaFormatCorpus)
    # dump the dict in the tok voc file
    for indKv, kv in enumerate(orderedKeysValuesEn):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputEnPath, addNewLine=True)
    for indKv, kv in enumerate(orderedKeysValuesFr):
        stringLine = u"{0}\t{1}\t{2}".format(indKv + 2, kv[0], kv[1])
        utilsOs.appendLineToFile(stringLine, outputFrPath, addNewLine=True)
    # transform and dump the corpus into the GIZA format
    appendToDumpInGizaFormat(pathToEnFile, pathToFrFile,
                             outputPathGizaFormatCorpus, enIdDict, frIdDict,
                             spFreqDict)
    return outputEnPath, outputFrPath, outputPathGizaFormatCorpus, outputEnDictPath, outputFrDictPath, outputSpDictPath
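A hedged example call; the .en and .fr files are assumed to be line-aligned sentence files, and the output files are created by prepareOutPutFile next to the input sentence files.

outPaths = reformatFilesPreGiza(pathToEnFile=u'./corpus/sentences.en',
                                pathToFrFile=u'./corpus/sentences.fr',
                                overwrite=True)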
Example #12
def applyNormalisationGetResult(testFilePath,
                                normOutPath=None,
                                ororazeOutput=(True, True),
                                useAbbrDict=False,
                                normalizationFunction=None,
                                *args):
    '''
	if normalizationFunction is None, then it will create the baseline, otherwise
	it will apply the normalization function, ororaZe it and evaluate the output
	'''
    positiveEvalCounter = 0
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        #get total number of comments
        totalComments = utilsOs.countLines(gsFile) - 1
    #create an empty file for the norm
    if normOutPath is not None:
        normFile = utilsOs.createEmptyFile(
            normOutPath,
            headerLine=
            u'Id\tEvaluation\tErrorTokens\tOriginal\tOutput\tGoldStandard')
        #create a separate file for each column
        origFile = utilsOs.createEmptyFile(normOutPath.replace(
            u'.tsv', u'1Orig.tsv'),
                                           headerLine=u'Id\tOriginal')
        outFile = utilsOs.createEmptyFile(normOutPath.replace(
            u'.tsv', u'2Out.tsv'),
                                          headerLine=u'Id\tEvaluation\tOutput')
        goldFile = utilsOs.createEmptyFile(normOutPath.replace(
            u'.tsv', u'3Gold.tsv'),
                                           headerLine=u'Id\tGoldStandard')
    with open(testFilePath, u'r', encoding=u'utf8') as gsFile:
        #dispose of the header line
        header = gsFile.readline()
        #get first line
        line = gsFile.readline()
        #start an empty dejavuDict
        dejavuDict = {}
        #count and populate the norm
        while line:
            #get data
            lineList = (line.replace(u'\n', u'')).split(u'\t')
            commentId, originalComment, goldStandard = lineList
            normOutput = str(originalComment)
            #detect the French feminine agreement and fossilize the word by modifying its structure into something the normalization function leaves unchanged
            normOutput = frenchFemininAccordsCodification(originalComment,
                                                          isInput=True)
            #apply orora solution to abbreviations
            if useAbbrDict != False:
                if useAbbrDict != True:
                    normOutput = ororaZeAbbreviations(normOutput, useAbbrDict)
                else:
                    normOutput = ororaZeAbbreviations(normOutput)
            #apply the normalization function
            if normalizationFunction is not None:
                normOutput, dejavuDict = normalizationFunction(
                    normOutput.lower(), dejavuDict, *args)
            #reverse the feminine agreement code back into its original form
            normOutput = frenchFemininAccordsCodification(normOutput,
                                                          isInput=False)
            #get normalized output
            if ororazeOutput == True:
                normOutput = ororaZe(normOutput, advanced=True)
            elif isinstance(ororazeOutput, (tuple, list)):
                if ororazeOutput[0] == True:
                    normOutput = ororaZe(normOutput, advanced=ororazeOutput[1])
            #evaluation: check whether the normalized output corresponds to the gold standard
            positiveEvalCounter, evaluation = normalizationEvaluator(
                normOutput, goldStandard, positiveEvalCounter)
            #get the tokens that do not correspond exactly and their edit distance
            errorTokList = utilsString.getcorrespondingTokensAndEditDist(
                normOutput, goldStandard) if evaluation == 0 else u'na'
            #dump
            if normOutPath is not None:
                normFile.write(u'{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n'.format(
                    commentId, evaluation, errorTokList, originalComment,
                    normOutput, goldStandard))
                #dump to column separate files
                origFile.write(u'{0}\t{1}\n'.format(commentId,
                                                    originalComment))
                outFile.write(u'{0}\t{1}\t{2}\t{3}\n'.format(
                    commentId, evaluation, errorTokList, normOutput))
                goldFile.write(u'{0}\t{1}\n'.format(commentId, goldStandard))
            #next line
            line = gsFile.readline()
    #close the norm file
    if normOutPath is not None:
        normFile.close()
        #close the other files
        origFile.close()
        outFile.close()
        goldFile.close()
        #dump the results
        resultsPath = u'{0}.results'.format(normOutPath.replace(u'.tsv', u''))
        utilsOs.dumpRawLines([
            u'NORMALIZATION RESULTS', u'exact positives: {0}/{1}'.format(
                positiveEvalCounter, totalComments), u'ratio: {0}'.format(
                    float(positiveEvalCounter) / float(totalComments))
        ], resultsPath)
    return {
        u'exact positives': positiveEvalCounter,
        u'total comments': totalComments,
        u'ratio': (float(positiveEvalCounter) / float(totalComments))
    }
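A baseline run sketch (no normalization function); the test file path is a placeholder for a TSV with three columns (id, original comment, gold standard) plus a header line.

results = applyNormalisationGetResult(testFilePath=u'./goldStandard.tsv',
                                      normOutPath=u'./baseline.tsv',
                                      ororazeOutput=(True, True),
                                      useAbbrDict=False,
                                      normalizationFunction=None)
print(results[u'ratio'])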