def tableOfContents(stringSrc, stringTrgt, nTokens=4, contextScores=None, placeInDocument=None):
    """ given a string sentence pair, returns a score based on the ratio of small
    (table-of-contents-like) sentence pairs in the context of the current sp """
    # transform the place in the doc so the metric is low at the beginning and end of the doc and high in the middle
    if placeInDocument is not None:
        placeInDocument = math.sqrt(placeInDocument - (placeInDocument ** 2)) * 2
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower(), stringTrgt.lower()
        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    scores = [tooFewTokens(stringSrc, stringTrgt, nTokens)]
    # remake the token list into a string so we can check the first characters
    origSrcString = u' '.join(stringSrc)
    if len(origSrcString) > 4:
        # if there is a number or a symbol indicating a table of contents at the start of the string
        extractedNmbrs = utilsString.extractNumbersFromString(origSrcString[:3])
        if len(extractedNmbrs) != 0 or u'-' in origSrcString[:3] or u'.' in origSrcString[:3]:
            scores.append(0)
        else:
            scores.append(1)
    # add the context to the current scores
    if contextScores is not None:
        scores = scores + contextScores
    # add the location of the sentence in the document to the current scores
    if placeInDocument is not None:
        scores = scores + [placeInDocument]
    return sum(scores) / len(scores)
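# Illustrative usage sketch (not part of the original module): the example strings,
# context scores and document position below are made up; they only show how
# tableOfContents averages the short-sentence score, the leading-symbol check,
# the context scores and the position in the document into a single value.
def _demoTableOfContents():
    srcLn = u'1. introduction - page 3'
    trgtLn = u'1. introduction - page 3'
    # a line near the start of the document, surrounded by two other suspect lines
    score = tableOfContents(srcLn, trgtLn, nTokens=4,
                            contextScores=[0, 0], placeInDocument=0.05)
    print(u'table of contents score: ', score)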
def tooFewTokens(stringSrc, stringTrgt, nTokens=4):
    """ given a string sentence pair, returns 0 if there are N or fewer tokens
    on either the src or the trgt side and returns 1 otherwise """
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower(), stringTrgt.lower()
        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    # count the tokens
    if len(stringSrc) <= nTokens or len(stringTrgt) <= nTokens:
        return 0
    return 1
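# Illustrative usage sketch (not part of the original module): the sentence pairs
# are invented and only show the binary behaviour of tooFewTokens.
def _demoTooFewTokens():
    # only 3 tokens on the source side -> 0
    print(tooFewTokens(u'Table of contents', u'Table des matières'))
    # more than 4 tokens on both sides -> 1
    print(tooFewTokens(u'The meeting was adjourned at noon .',
                       u'La séance est levée à midi .'))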
def transformStringToGizaFormat(string, tokDict, lang, pathToFile):
    """ given a sentence string and a token-to-id dictionary, returns the sentence
    rewritten as space-separated token ids (GIZA++ format) """
    # get the language in the right format
    lang = getLang(lang, pathToFile)
    # tokenize
    # tokList = utilsString.naiveRegexTokenizer(string, language=lang,
    #                                           capturePunctuation=True, captureSymbols=True)
    tokList = utilsString.nltkTokenizer(string)
    # remake a string using the token ids instead of the actual tokens
    idString = []
    for tok in tokList:
        try:
            idString.append(tokDict[tok])
        except KeyError:
            # fall back on apostrophe variants of the token before giving up
            try:
                idString.append(tokDict[u"{0}'".format(tok)])
            except KeyError:
                try:
                    idString.append(tokDict[u"'{0}".format(tok)])
                except KeyError:
                    try:
                        idString.append(tokDict[tok.replace(u"'", u"")])
                    except KeyError:
                        print(tok, u"ERROR with token: ", repr(tok), type(tok))
    idString = [str(id) for id in idString]
    return u" ".join(idString)
def cognateCoincidence(stringSrc, stringTrgt, cognateSize=4):
    """ given a string sentence pair, returns the ratio of coincidence between
    the cognates (word-initial character ngrams) of the source and the target """
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower(), stringTrgt.lower()
        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    # sort by decreasing length of the original word
    stringSrc.sort(key=lambda tok: len(tok), reverse=True)
    stringTrgt.sort(key=lambda tok: len(tok), reverse=True)
    # compile the cognates of each token for the source and target
    srcCognates = getCognates(stringSrc, cognateSize)
    trgtCognates = set(getCognates(stringTrgt, cognateSize))
    # get the intersection of the cognates
    intersection = [cog for cog in srcCognates if cog in trgtCognates]
    smallerLength = min(len(srcCognates), len(trgtCognates))
    if smallerLength == 0:
        return 0
    return len(intersection) / smallerLength
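# Illustrative usage sketch (not part of the original module): the en/fr pair below
# is invented; with cognateSize=4 the cognates are the first 4 characters of each
# token, so shared prefixes such as 'asso' and 'nati' raise the score.
def _demoCognateCoincidence():
    srcLn = u'the international association of nations'
    trgtLn = u"l'association internationale des nations"
    print(cognateCoincidence(srcLn, trgtLn, cognateSize=4))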
def makeFreqDict(pathToFile, lang=None):
    """ given a path to a text file, returns a dictionary mapping each token to
    its frequency in the file """
    freqTokDict = {}
    # get the language in the right format
    lang = getLang(lang, pathToFile)
    with open(pathToFile) as outFile:
        # first line
        outLn = outFile.readline()
        while outLn:
            outLn = outLn.replace(u"\n", u"")
            # tokenize
            # outToks = utilsString.naiveRegexTokenizer(outLn, language=lang,
            #                                           capturePunctuation=True, captureSymbols=True)
            outToks = utilsString.nltkTokenizer(outLn)
            # add to the token freq dict
            for tok in outToks:
                if tok not in freqTokDict:
                    freqTokDict[tok] = 0
                freqTokDict[tok] += 1
            # next line
            outLn = outFile.readline()
    return freqTokDict
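# Illustrative usage sketch (not part of the original module): the file path below
# is hypothetical; makeFreqDict simply counts how often each token appears in the file.
def _demoMakeFreqDict():
    freqDict = makeFreqDict(u'./sampleCorpus.en', lang=u'en')
    # show the 10 most frequent tokens
    for tok in sorted(freqDict, key=freqDict.get, reverse=True)[:10]:
        print(tok, freqDict[tok])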
if len(utilsString.extractNumbersFromString(srcLn[:3])) != 0 or u'-' in srcLn[:3] or u'.' in srcLn[:3]:
    if docLoc < 0.2:
        tablMat['dashOrNb']['0.0-0.19'] += 1
    elif docLoc < 0.4:
        tablMat['dashOrNb']['0.2-0.39'] += 1
    elif docLoc < 0.6:
        tablMat['dashOrNb']['0.4-0.59'] += 1
    elif docLoc < 0.8:
        tablMat['dashOrNb']['0.6-0.79'] += 1
    else:
        tablMat['dashOrNb']['0.8-1.0'] += 1
if refLines[srcLnIndex] != u'1.0\n':
    count[u'small'] += 1
srcLn = utilsString.nltkTokenizer(srcLn, addSeparators)
trgtLn = utilsString.nltkTokenizer(trgtLn, addSeparators)
# # compile the cognates of each token for the source and target
# srcCognates = b003heuristics.getCognates(srcLn, 4)
# trgtCognates = set(b003heuristics.getCognates(trgtLn, 4))
# # get intersection of cognates
# intersection = [cog for cog in srcCognates if cog in trgtCognates]
# lenin = len(intersection)
# if lenin not in cogn:
#     cogn[lenin] = 0
# cogn[lenin] += 1
# sizeSrc = len(srcLn)
# sizeTrgt = len(trgtLn)
# if abs(sizeSrc-sizeTrgt) not in lenT:
def nbMismatch(stringSrc, stringTrgt, includeNumberNames=True, useEditDistance=True):
    """ given a string sentence pair, returns a score indicating how many of the
    numbers in the source also appear in the target """
    # if it's not already tokenized
    if type(stringSrc) is str and type(stringTrgt) is str:
        stringSrc, stringTrgt = stringSrc.lower().replace(u' pm', u'pm'), stringTrgt.lower().replace(u' pm', u'pm')
        addSeparators = [u'.', u',', u':', u'/', u'-', u'h', u"''", u"'"]
        stringSrc = utilsString.nltkTokenizer(stringSrc, addSeparators)
        stringTrgt = utilsString.nltkTokenizer(stringTrgt, addSeparators)
    # transform all number names into actual numbers
    if includeNumberNames is True:
        stringSrc = utilsString.transformNbNameToNb(stringSrc)
        stringTrgt = utilsString.transformNbNameToNb(stringTrgt)
    # get the tokens containing a digit
    nbrs = re.compile(r'[0-9]')
    stringSrcList = [tok for tok in stringSrc if len(re.findall(nbrs, tok)) != 0]
    stringTrgtList = [tok for tok in stringTrgt if len(re.findall(nbrs, tok)) != 0]
    # if there were no numbers, return the max score (we can't use this heuristic to evaluate)
    if len(stringSrcList) + len(stringTrgtList) == 0:
        return 1.0
    # if we want to search for the exact same numbers
    if useEditDistance is False:
        # extract the figures from the tokens
        numbersInSrc = set(getNbsAlone(stringSrcList))
        numbersInTrgt = set(getNbsAlone(stringTrgtList))
        # if there were no numbers, return the max score (we can't use this heuristic to evaluate)
        if len(numbersInSrc) + len(numbersInTrgt) == 0:
            return 1.0
        # calculate the score of src-trgt coincidence
        nbIntersection = numbersInSrc.intersection(numbersInTrgt)
        return len(nbIntersection) / ((len(numbersInSrc) + len(numbersInTrgt)) / 2)
    # if we want to use the edit distance to match the source digit tokens with the target ones
    else:
        nbIntersection = []
        # sort the digit-bearing src token list by decreasing length
        stringSrcList.sort(key=lambda tok: len(tok), reverse=True)
        # make a copy of the target list
        trgtList = stringTrgtList.copy()
        for srcTok in stringSrcList:
            # find the most similar trgt token
            mostSimil = [None, None, None, 1]
            for trgtInd, trgtTok in enumerate(trgtList):
                editDistScore = utilsString.getNormalizedEditDist(srcTok, trgtTok)
                # keep the least distant of the trgt tokens
                if editDistScore < 0.5 and editDistScore < mostSimil[-1]:
                    mostSimil = [srcTok, trgtTok, trgtInd, editDistScore]
            # remove the most similar from the trgt list and add the pair to the intersection
            if mostSimil[0] is not None:
                del trgtList[mostSimil[-2]]
                nbIntersection.append(tuple(mostSimil[:2]))
        return len(nbIntersection) / ((len(stringSrcList) + len(stringTrgtList)) / 2)
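# Illustrative usage sketch (not part of the original module): the sentence pairs
# are invented; numbers that appear on both sides raise the score, while missing or
# differing numbers lower it.
def _demoNbMismatch():
    print(nbMismatch(u'the meeting starts at 10 pm on june 23',
                     u'la réunion commence à 22 h le 23 juin'))
    print(nbMismatch(u'chapter 7 , page 132', u'chapitre 7 , page 321'))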
def extractMisalignedSP(pathToSrcTrgtFiles, extractionSize=100, typeOfExtractors=[0, 1, 2]):
    """ given a path to the original source and target files, and the types of
    extractors to be used, returns the SP (sentence pairs) extracted as misaligned
    extractor types:
        - 0 : same number presence in src and trgt
        - 1 : 4 or fewer tokens
        - 2 : cognate coincidence between src and trgt """
    extractedSp = {0: {}, 1: {}, 2: {}}
    totalLines = 0
    # get the name of the subset
    for subset in [u'/ALIGNMENT-QUALITY', u'/MISALIGNED', u'/NOT-FLAGGED', u'/QUALITY']:
        if subset in pathToSrcTrgtFiles:
            subsetName = subset
    # extractor 0 output folder
    output0Path = u'./003negativeNaiveExtractors/numberCoincidence/'
    utilsOs.createEmptyFolder(output0Path)
    # extractor 1 output folder
    output1Path = u'./003negativeNaiveExtractors/fewTokens/'
    utilsOs.createEmptyFolder(output1Path)
    # extractor 2 output folder
    output2Path = u'./003negativeNaiveExtractors/cognates/'
    utilsOs.createEmptyFolder(output2Path)
    # get the path to the src and trgt files
    srcTrgtFiles = utilsOs.goDeepGetFiles(pathToSrcTrgtFiles, format=u'.tmx')
    print(u'TOTAL FILES : ', len(srcTrgtFiles))
    for filePath in srcTrgtFiles:
        srcFilePath = u'{0}.en'.format(filePath) if u'en-fr' in filePath else u'{0}.fr'.format(filePath)
        trgtFilePath = u'{0}.fr'.format(filePath) if u'en-fr' in filePath else u'{0}.en'.format(filePath)
        # open line by line and apply the extractors
        try:
            with open(srcFilePath) as srcFile:
                with open(trgtFilePath) as trgtFile:
                    srcLines = srcFile.readlines()
                    trgtLines = trgtFile.readlines()
                    for srcLnIndex, srcLn in enumerate(srcLines):
                        trgtLn = trgtLines[srcLnIndex]
                        # tokenize
                        srcLn = srcLn.lower().replace(u' pm', u'pm')
                        trgtLn = trgtLn.lower().replace(u' pm', u'pm')
                        addSeparators = [u'.', u',', u':', u'/', u'-', u"''", u"'"]
                        srcTokens = utilsString.nltkTokenizer(srcLn, addSeparators)
                        trgtTokens = utilsString.nltkTokenizer(trgtLn, addSeparators)
                        # apply the extractors
                        if 0 in typeOfExtractors:
                            extractedSp, score = applyExtractor(nbMismatch, 0.75, srcTokens, trgtTokens,
                                                                extractedSp, filePath, 0, int(srcLnIndex))
                        if 1 in typeOfExtractors:
                            # get the context scores and the location in the doc
                            cntxtScores = getContextScores(srcLnIndex, srcLines, trgtLines)
                            docLoc = srcLnIndex / len(srcLines)
                            extractedSp, score = applyExtractor(tableOfContents, 0.32, srcTokens, trgtTokens,
                                                                extractedSp, filePath, 1, int(srcLnIndex),
                                                                contextScores=cntxtScores, placeInDocument=docLoc)
                        if 2 in typeOfExtractors:
                            extractedSp, score = applyExtractor(cognateCoincidence, 0.1, srcTokens, trgtTokens,
                                                                extractedSp, filePath, 2, int(srcLnIndex))
                    totalLines += len(srcLines)
        # some folders have no .en and .fr file for each .tmx file
        # (e.g.: '/data/rali8/Tmp/rali/bt/burtrad/corpus_renamed/MISALIGNED/241-CAN_CENT_OCC_HEALTH/SAFE/en-fr/')
        except FileNotFoundError:
            pass
    print(u'TOTAL LINES : ', totalLines)
    # dump the extracted sp dict into a json file
    utilsOs.dumpDictToJsonFile(extractedSp,
                               pathOutputFile=u'./003negativeNaiveExtractors/000extractedSp.json',
                               overwrite=True)
    # randomly extract and dump the file path and the line index of the extracted SP
    randomlyExtractAndDump(extractedSp, extractionSize, subsetName)
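# Illustrative usage sketch (not part of the original module): the corpus path below
# is hypothetical; it shows how the three naive extractors would be run over a folder
# of .tmx files that have .en/.fr counterparts.
def _demoExtractMisalignedSP():
    extractMisalignedSP(u'/path/to/corpus/MISALIGNED/', extractionSize=100,
                        typeOfExtractors=[0, 1, 2])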