Exemple #1
0
def applyLearnedDictPlusHumanDict(
    testOrigPath,
    normOutPath=None,
    learnedDictPath=u'./005learnedDict/ororaAbbreviationDict.json',
    humanDictPath=u'./005learnedDict/humanMadeDict/humanMadeOroraAbbreviationDict.json'
):
    '''Normalize every comment of a test set with the learned dict, then the human dict.

    Each comment goes through ororaZeAbbreviations twice (first-choice
    replacement only): once with the learned dict and then, on that result,
    with the human-made dict.

    NOTE(review): the previous comments said the human dict was applied first
    (to give it priority), but the code has always applied the learned dict
    first. The code order is kept unchanged here -- confirm which is intended.

    Args:
        testOrigPath: passed to myUtils.getDataFrameFromArgs to load the comments.
        normOutPath: when not None, the normalized comments are dumped there
            as a tab-separated file without the index.
        learnedDictPath: path of the JSON learned abbreviation dict.
        humanDictPath: path of the JSON human-made abbreviation dict.

    Returns:
        The loaded object (presumably a pandas Series -- TODO confirm) with
        every comment replaced by its normalized form.
    '''
    #open the dicts
    learnedDict = myUtils.openJsonFileAsDict(learnedDictPath)
    humanDict = myUtils.openJsonFileAsDict(humanDictPath)
    #open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    #items() replaces iteritems(), which no longer exists in pandas >= 2.0
    for index, testComment in testOrigDf.items():
        #ororaZeAbbreviations returns (normalizedString, counter): only the
        #string must be fed to the next pass and stored
        normOutput, _ = ororaZeAbbreviations(testComment,
                                             learnedDict,
                                             listTheVariations=False)
        normOutput, _ = ororaZeAbbreviations(normOutput,
                                             humanDict,
                                             listTheVariations=False)
        #save into pandas series
        testOrigDf[index] = normOutput
    #dump normalized output
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    return testOrigDf
def dumpDictIntersectAutoAndHuman(autoDict, humanDict=None, outputPath=None):
    '''Split the automatic dict by whether its keys also exist in the human dict.

    Both dicts may be given as dicts or as paths to JSON files; when humanDict
    is omitted the default human-made dict file is loaded.

    Two JSON files are written (overwriting any previous ones):
      - outputPath: the entries of autoDict whose key is also in humanDict,
      - a sibling "nonIntersect" path derived from outputPath: the remaining
        entries of autoDict.

    Returns:
        (intersectionDict, nonIntersectionDict), both holding autoDict values.
    '''
    defaultHumanPath = u'./005learnedDict/humanMadeDict/humanMadeOroraAbbreviationDict.json'
    defaultOutputPath = u'./005learnedDict/intersectionHumanAutoDict/humanAutoDict.json'

    #load whichever dict was not handed over as an actual dict
    if humanDict is None:
        humanDict = myUtils.openJsonFileAsDict(defaultHumanPath)
    elif type(humanDict) is str:
        humanDict = myUtils.openJsonFileAsDict(humanDict)
    if type(autoDict) is str:
        autoDict = myUtils.openJsonFileAsDict(autoDict)

    #keys present in both dicts
    sharedKeys = set(autoDict.keys()) & set(humanDict.keys())
    #automatic entries for the shared keys ...
    sharedEntries = {key: autoDict[key] for key in sharedKeys}
    #... and automatic entries for all the other keys
    autoOnlyEntries = {
        key: val
        for key, val in autoDict.items() if key not in sharedKeys
    }

    #work out where to dump both dicts
    if outputPath is None:
        outputPath = defaultOutputPath
    nonIntersectPath = outputPath.replace(u'HumanAutoDict/',
                                          u'HumanAutoDict/nonIntersect/')
    nonIntersectPath = nonIntersectPath.replace(u'.json',
                                                u'NonIntersect.json')

    myUtils.dumpDictToJsonFile(sharedEntries,
                               pathOutputFile=outputPath,
                               overwrite=True)
    myUtils.dumpDictToJsonFile(autoOnlyEntries,
                               pathOutputFile=nonIntersectPath,
                               overwrite=True)
    return sharedEntries, autoOnlyEntries
def applyNormalisation(testOrigPath,
                       normOutPath=None,
                       normalization=None,
                       *args):
    '''Normalize every comment of a test set with the given normalization.

    Args:
        testOrigPath: passed to myUtils.getDataFrameFromArgs to load the comments.
        normOutPath: when not None, the normalized comments are dumped there
            as a tab-separated file without the index.
        normalization: selects what is applied to each comment:
            - None: the comment is kept unchanged;
            - a dict, or a path to a JSON dict: abbreviation replacement via
              ororaZeAbbreviations, listing all the reliable variations;
            - anything else: it is called as
              normalization(loweredComment, dejavuDict, *args) and must return
              (normalizedComment, dejavuDict). The comment is wrapped in
              frenchFemininAccordsCodification before and after the call.
        *args: extra positional arguments forwarded to a callable normalization.

    Returns:
        The loaded object (presumably a pandas Series -- TODO confirm) with
        every comment replaced by its normalized form.

    NOTE(review): the callable branch used to reference the undefined names
    `originalComment` and `normalizationFunction`; they are assumed to mean
    the current comment and the `normalization` argument.
    '''
    #if we are given a path to the place where the dict is
    if isinstance(normalization, str):
        normalization = myUtils.openJsonFileAsDict(normalization)
    #start an empty dejavuDict, handed back and forth with the callable
    dejavuDict = {}
    #number of comments for which at least one reliable variation was listed
    c = 0
    #open the test dataframe from the path
    testOrigDf = myUtils.getDataFrameFromArgs(testOrigPath, header=False)[0]
    #items() replaces iteritems(), which no longer exists in pandas >= 2.0
    for index, testComment in testOrigDf.items():
        if normalization is None:
            normOutput = testComment
        #use the dict as a normalization
        elif isinstance(normalization, dict):
            normOutput, c = ororaZeAbbreviations(testComment,
                                                 normalization,
                                                 listTheVariations=True,
                                                 c=c)
        else:
            #detect french feminin accord and fossilize the word by modifying its structure to something unchanged by the normalization function
            normOutput = frenchFemininAccordsCodification(testComment,
                                                          isInput=True)
            #apply the spell corrector or other normalization function
            normOutput, dejavuDict = normalization(normOutput.lower(),
                                                   dejavuDict, *args)
            #reverse back the code for the feminin accord into its original form
            normOutput = frenchFemininAccordsCodification(normOutput,
                                                          isInput=False)
        #save into pandas series
        testOrigDf[index] = normOutput
    #dump normalized output
    if normOutPath is not None:
        testOrigDf.to_csv(normOutPath, sep=u'\t', index=False)
    #debug trace; the ratio is reported as 0 for an empty test set instead of
    #raising ZeroDivisionError
    nbComments = len(testOrigDf)
    print(22222, c, nbComments, c / nbComments if nbComments else 0)
    return testOrigDf
def ororaZeAbbreviations(string,
                         abbrDict=None,
                         listTheVariations=False,
                         c=0,
                         e=0):
    '''Replace the abbreviations of a string using a dict (ABBR --> ABBREVIATION).

    The string is split on single spaces. Each token is looked up in abbrDict
    under its upper-cased form with the accented letters listed below replaced
    by their plain counterparts.

    Two dict layouts are supported, detected from the first value of the dict:
      - scored: key -> list of (replacement, score) pairs. Only replacements
        whose score is >= 0.55 are used.
      - flat: key -> replacement string, always applied.

    Args:
        string: the text to normalize.
        abbrDict: the dict, a path to a JSON dict, or None to load the default
            learned dict file.
        listTheVariations: scored layout only. If False, a token is replaced
            by its first candidate when that candidate is reliable. If True, a
            token is replaced by all its reliable candidates joined with
            u'¤', or by its lookup key when none is reliable.
        c: running counter; incremented by one (scored layout only) when e is 1
            at the end of the call.
        e: flag set to 1 as soon as a token gets at least one reliable
            variation with listTheVariations=True.

    Returns:
        (normalizedString, c)
    '''
    #map every supported accented letter (both cases) to its plain counterpart
    accentTable = {}
    for plainChar, accentedChars in ((u'A', u'ÀÂ'), (u'E', u'ÉÈÊ'),
                                     (u'I', u'ÎÏ'), (u'O', u'Ô'),
                                     (u'U', u'ÙÛÜ'), (u'C', u'Ç')):
        for accentedChar in accentedChars:
            accentTable[ord(accentedChar)] = plainChar
            accentTable[ord(accentedChar.lower())] = plainChar.lower()

    def makeKey(token):
        #dict keys are upper-cased and free of the diacritics handled above
        return token.translate(accentTable).upper()

    #open the abbreviation dict
    if abbrDict is None:
        abbrDict = myUtils.openJsonFileAsDict(
            u'./005learnedDict/ororaAbbreviationDict.json')
    #open the abbr dict file if it's a path
    elif isinstance(abbrDict, str):
        abbrDict = myUtils.openJsonFileAsDict(abbrDict)
    #abbreviation replacement
    minScore = 0.55
    stringList = string.split(u' ')
    #an empty dict has no first value to inspect: it falls through to the flat
    #branch, which then leaves every token untouched
    hasScoredValues = bool(abbrDict) and isinstance(
        next(iter(abbrDict.values())), list)
    if hasScoredValues:
        for index, token in enumerate(stringList):
            key = makeKey(token)
            if key not in abbrDict:
                continue
            candidates = abbrDict[key]
            #if we search only for the first and most common option
            if not listTheVariations:
                #replace only if there is a candidate with a reliable score
                if candidates and candidates[0][1] >= minScore:
                    stringList[index] = candidates[0][0]
            #if we want all the reliable possibilities, in the dict order
            else:
                variations = [
                    var[0] for var in candidates if var[1] >= minScore
                ]
                if variations:
                    e = 1
                    stringList[index] = u'¤'.join(variations)
                else:
                    stringList[index] = key
        if e == 1:
            c += 1
    else:
        stringList = [
            abbrDict.get(makeKey(token), token) for token in stringList
        ]
    return u' '.join(stringList), c
def makeDictFromTsvTrain(pathNonMatch, trainedDict=None, outputDictFilePath=False):
    '''Build a scored normalization dict from non-matching (original, gold) alignments.

    NOTE(review): another function with the same name is defined further down
    in this source; if both live in the same module the later one shadows
    this one.

    Args:
        pathNonMatch: path to a text file holding one Python literal per line
            (a list of (originalToken, goldToken) pairs, optionally followed
            by a trailing comma), or an already loaded list of such lists.
        trainedDict: dict (or path to a JSON dict) mapping an original token
            to a list of gold tokens; new observations are appended to it.
            A dict passed in is updated in place. Defaults to a new empty dict.
        outputDictFilePath: when truthy, the resulting dict is dumped there.

    Returns:
        The dict mapping each kept original token to a list of
        (goldToken, relativeFrequency) pairs sorted by decreasing frequency.
        Entries are dropped when the key contains a number character
        (myUtils.detectNbChar) or when no non-empty gold token is left, since
        an empty candidate list cannot be used for replacement.
    '''
    from collections import Counter

    #a fresh dict per call: a shared default dict would leak between calls
    if trainedDict is None:
        trainedDict = {}
    #open the dict
    elif isinstance(trainedDict, str):
        trainedDict = myUtils.openJsonFileAsDict(trainedDict)
    #open as a list the non identical elements between the original and the gold
    if isinstance(pathNonMatch, str):
        nonMatchList = []
        with open(pathNonMatch) as nonMatchFile:
            for line in nonMatchFile:
                #drop the end of line and the trailing comma, if any
                cleanedLine = line.replace(u', \n', u'').replace(
                    u',\n', u'').replace(u'\n', u'')
                #a blank line holds no literal to evaluate
                if cleanedLine.strip():
                    nonMatchList.append(ast.literal_eval(cleanedLine))
    else:
        #anything else is taken to be the already loaded list
        nonMatchList = pathNonMatch
    #collect every gold token observed for each original token
    for nonMatchingAlignment in nonMatchList:
        #if the list is not empty
        if nonMatchingAlignment:
            for nonMatchTupl in nonMatchingAlignment:
                #use the original token as a key; a new list is built each
                #time so lists owned by the caller are never modified
                trainedDict[nonMatchTupl[0]] = trainedDict.get(
                    nonMatchTupl[0], list()) + [nonMatchTupl[1]]
    #clean the dict (iterate over a snapshot since entries get deleted)
    for origKey, goldValList in list(trainedDict.items()):
        #eliminate the entries having only the empty symbol as a value and
        #the entries whose key contains a number character
        if set(goldValList) == {u'∅'} or myUtils.detectNbChar(origKey) == True:
            del trainedDict[origKey]
            continue
        #eliminate the empty symbol from the list
        keptVals = [elem for elem in goldValList if elem not in (u'∅', u'')]
        if not keptVals:
            del trainedDict[origKey]
            continue
        #score each distinct gold token by its relative frequency
        nbVals = float(len(keptVals))
        goldValSortedList = [(goldVal, counter / nbVals)
                             for goldVal, counter in Counter(keptVals).items()]
        #most frequent first; ties keep their order of first appearance
        goldValSortedList.sort(reverse=True, key=lambda x: x[1])
        trainedDict[origKey] = goldValSortedList
    print(len(trainedDict))
    #dump the dict
    if outputDictFilePath:
        myUtils.dumpDictToJsonFile(trainedDict, outputDictFilePath, overwrite=True)
    return trainedDict
def makeDictFromTsvTrain(pathNonMatchList,
                         existingDict=None,
                         pathMatch=None,
                         pathNonMatch=None,
                         language=u'fr',
                         outputDictFilePath=False):
    '''Build a scored and cleaned normalization dict from non-matching alignments.

    Args:
        pathNonMatchList: path to a text file holding one Python literal per
            line (a list of (originalToken, goldToken) pairs, optionally
            followed by a trailing comma), or an already loaded list of such
            lists.
        existingDict: dict (or path to a JSON dict) mapping an original token
            to a list of gold tokens, used as a starting point. A dict passed
            in is copied, not modified. Defaults to an empty dict.
        pathMatch: path to a JSON dict read as matchCounterDict[originalToken]
            (presumably how often the token was left unchanged -- TODO
            confirm). When None the counter-based cleaning is skipped.
        pathNonMatch: path to a JSON dict read as
            nonMatchCounterDict[originalToken][goldToken]; loaded only when
            pathMatch is given.
        language: passed to myUtils.isTokenStopWord.
        outputDictFilePath: when truthy, the resulting dict is dumped there.

    Returns:
        The dict mapping each kept original token to a list of
        (goldToken, relativeFrequency) pairs sorted by decreasing frequency.

    Cleaning steps, in order:
        1. (only with pathMatch) drop a gold token when the unchanged form
           represents at least 55% of the occurrences, or when the replacement
           was counted fewer than 10 times; tokens without counts are kept.
        2. drop the empty key.
        3. drop the keys rejected by undesireableKey, and the gold tokens that
           are at most 40% of the key length or rejected by undesireableVal.
        4. drop the keys that are the empty symbol, that only map to it, that
           contain a number character or that are stop-words; score the rest.
    '''
    from collections import Counter

    #open the trained dict (always work on our own copy)
    if existingDict is None:
        trainedDict = {}
    elif isinstance(existingDict, str):
        trainedDict = myUtils.openJsonFileAsDict(existingDict)
    else:
        trainedDict = dict(existingDict)
    #open the matching dicts
    if pathMatch is not None:
        matchCounterDict = myUtils.openJsonFileAsDict(pathMatch)
        nonMatchCounterDict = myUtils.openJsonFileAsDict(pathNonMatch)
    #open as a list the non identical elements between the original and the gold
    if isinstance(pathNonMatchList, str):
        nonMatchList = []
        with open(pathNonMatchList) as nonMatchFile:
            for line in nonMatchFile:
                #drop the end of line and the trailing comma, if any
                cleanedLine = line.replace(u', \n', u'').replace(
                    u',\n', u'').replace(u'\n', u'')
                #a blank line holds no literal to evaluate
                if cleanedLine.strip():
                    nonMatchList.append(ast.literal_eval(cleanedLine))
    else:
        #anything else is taken to be the already loaded list
        nonMatchList = pathNonMatchList
    #collect every gold token observed for each original token
    for nonMatchingAlignment in nonMatchList:
        #if the list is not empty
        if nonMatchingAlignment:
            for nonMatchTupl in nonMatchingAlignment:
                #use the original token as a key; a new list is built each
                #time so lists owned by the caller are never modified
                trainedDict[nonMatchTupl[0]] = trainedDict.get(
                    nonMatchTupl[0], list()) + [nonMatchTupl[1]]
    #first cleaning: eliminate the elements that appear a lot less than the unchanged variant
    if pathMatch is not None:
        for origKey, goldValList in list(trainedDict.items()):
            #keys never seen unchanged are left as they are
            if origKey not in matchCounterDict:
                continue
            nbMatch = matchCounterDict[origKey]
            #build the filtered list instead of deleting while iterating,
            #which used to skip the element following each deletion
            keptVals = []
            for goldVal in goldValList:
                try:
                    nbNonMatch = nonMatchCounterDict[origKey][goldVal]
                except KeyError:
                    #no count available for this replacement: keep it
                    keptVals.append(goldVal)
                    continue
                nbTotal = float(nbMatch + nbNonMatch)
                unchangedIsDominant = (nbTotal > 0
                                       and float(nbMatch) / nbTotal >= 0.55)
                #keep only the common replacements of rarely unchanged tokens
                if not unchangedIsDominant and nbNonMatch >= 10:
                    keptVals.append(goldVal)
            if keptVals:
                trainedDict[origKey] = keptVals
            else:
                del trainedDict[origKey]
    #second cleaning, get rid of the empty element
    trainedDict.pop(u'', None)
    #third cleaning, eliminate the elements whose value is way smaller than its key and
    #of the entries whose key have empirically shown to induce errors and delete them
    for origKey, goldValList in list(trainedDict.items()):
        # if the key is undesireable
        if undesireableKey(origKey) == True:
            del trainedDict[origKey]
            continue
        #keep the values that are neither much smaller than the key nor undesireable
        keptVals = [
            goldVal for goldVal in goldValList
            if float(len(goldVal)) / float(len(origKey)) > 0.4
            and undesireableVal(origKey, goldVal) != True
        ]
        if keptVals:
            trainedDict[origKey] = keptVals
        else:
            del trainedDict[origKey]
    #fourth cleaning, ruled based cleaning
    for origKey, goldValList in list(trainedDict.items()):
        #eliminate all the elements in the dict that have an empty symbol as a value
        if set(goldValList) == {u'∅'} or origKey == u'∅':
            del trainedDict[origKey]
        #eliminate the elements containing a number character
        elif myUtils.detectNbChar(origKey) == True:
            del trainedDict[origKey]
        #eliminate the elements whose key is a stop-word
        elif myUtils.isTokenStopWord(origKey, language) == True:
            del trainedDict[origKey]
        else:
            #eliminate the empty symbol from the list
            keptVals = [
                elem for elem in goldValList if elem not in (u'∅', u'')
            ]
            if not keptVals:
                #nothing usable is left: an unscored entry would break the
                #(token, score) layout expected from this dict
                del trainedDict[origKey]
                continue
            #score each distinct gold token by its relative frequency
            nbVals = float(len(keptVals))
            goldValSortedList = [
                (goldVal, counter / nbVals)
                for goldVal, counter in Counter(keptVals).items()
            ]
            #most frequent first; ties keep their order of first appearance
            goldValSortedList.sort(reverse=True, key=lambda x: x[1])
            trainedDict[origKey] = goldValSortedList
    #dump the dict
    if outputDictFilePath:
        myUtils.dumpDictToJsonFile(trainedDict,
                                   outputDictFilePath,
                                   overwrite=True)
    return trainedDict