def main():
    """Count compressed lexicalized-tag windows over "damaged" sentences.

    For each training sentence, one random interior tag is removed and the
    resulting tag sequence is split into windows of WIN_SIZE, which are
    accumulated into a Counter and pickled. This builds the "modified"
    (word-removed) window distribution used by the corrector.
    """
    WIN_SIZE = 5
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)
    tagWindows = Counter()
    for part in range(0, 4):
        hasDisplayed = False
        # with-statements guarantee both handles are closed even if an
        # exception is raised mid-file (original leaked them).
        with open("C:/MissingWord/train/tagsPart" + str(part) + ".txt", "r", encoding="utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart" + str(part) + ".txt", "r", encoding="utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()
                if len(tagLine) == 0:
                    # end of this part
                    break
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                tags = lexicalizedTagWindows.lexicalizeTags(tags, tokens, lexicalizedTags)
                if len(tags) >= 3:
                    # remove one random interior tag (never first or last)
                    tags.pop(random.randint(1, len(tags) - 2))
                    windows = compressor.compressList(
                        generateTagWindows.makeWindows(tags, size=WIN_SIZE))
                    tagWindows.update(windows)
                    if not hasDisplayed:
                        # show one example per part as a sanity check
                        print(windows)
                        hasDisplayed = True
    with open("C:/MissingWord/modLexComp" + str(WIN_SIZE) + ".pickle", "wb") as f:
        pickle.dump(tagWindows, f)
def makeFeatures(synCor, synCorLex, synRepl, cutTokens, removedIndex):
    """Build the classifier feature vector for one candidate removal position.

    Combines probability windows from the unlexicalized corrector (synCor),
    the lexicalized corrector (synCorLex), and replacement scores from
    synRepl into a single flat feature list.

    Parameters:
        synCor, synCorLex: objects exposing .correct(tokens) -> (probs, confidence)
            lists aligned with token positions (presumably one entry per gap;
            verify against the corrector implementation).
        synRepl: object exposing .fix(tokens, index) -> probability list.
        cutTokens: token list of the sentence with one word removed.
        removedIndex: position being scored as the removal site.

    Returns:
        list of float features: 9 probs around removedIndex, the gap between
        the local and global max prob, the same pair for the lexicalized
        corrector, log sentence length, and up to 5 replacement probs.
        NOTE(review): if synRepl.fix returns fewer than 5 entries the vector
        is shorter — confirm downstream code tolerates variable length.
    """
    corrProbs, corrConfidence = synCor.correct(cutTokens)
    # 9-wide window of probabilities centered on each position; 0.0 pads edges
    probWindows = generateTagWindows.makeWindows(corrProbs, size = 9, filler = 0.0)
    #corrConfWindows = generateTagWindows.makeWindows(corrConfidence, size = 3, filler = 0.0)
    window = list(probWindows[removedIndex])
    window.append(max(probWindows[removedIndex]) - max(corrProbs)) # difference between top probability in window and top probability in all possible locations
    #window.extend(corrConfWindows[removedIndex])
    # same features again, from the lexicalized corrector
    lexCorrProbs, lexCorrConfidence = synCorLex.correct(cutTokens)
    lexCorrProbWindows = generateTagWindows.makeWindows(lexCorrProbs, size = 9, filler = 0.0)
    #lexCorrConfWindows = generateTagWindows.makeWindows(lexCorrConfidence, size = 3, filler = 0.0)
    window.extend(lexCorrProbWindows[removedIndex])
    #window.extend(lexCorrConfWindows[removedIndex])
    window.append(max(lexCorrProbWindows[removedIndex]) - max(lexCorrProbs))
    # sentence length on a log scale
    window.append(math.log(len(cutTokens)))
    # top replacement-word probabilities at the candidate position
    replProbs = synRepl.fix(cutTokens, removedIndex)
    window.extend(replProbs[:5])
    return window
def main():
    """Count tag windows near and away from simulated word removals.

    For each sentence, up to NUM_SAMPLES interior tags are removed (one at a
    time). Windows overlapping the removal site go into tagModWindows; all
    other windows from the same damaged sentence go into tagWindows. Both
    Counters are pickled at the end.
    """
    WIN_SIZE = 4
    LEX = True
    # will bias the confidence, shifting it up as NUM_SAMPLES goes up
    NUM_SAMPLES = 5
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)
    tagModWindows = Counter()
    tagWindows = Counter()
    for part in range(0, 4):
        hasDisplayed = False
        # with-statements guarantee both handles are closed (original leaked them)
        with open("C:/MissingWord/train/tagsPart" + str(part) + ".txt", "r", encoding="utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart" + str(part) + ".txt", "r", encoding="utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()
                if len(tagLine) == 0:
                    break
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                # skip misaligned or too-short sentences
                if len(tags) != len(tokens) or len(tags) < 3:
                    continue
                if LEX:
                    origTags = lexicalizedTagWindows.lexicalizeTags(tags, tokens, lexicalizedTags)
                else:
                    origTags = tags
                # Valid pop positions are 1 .. len-2 (keep first and last tag),
                # matching randint(1, len - 2) used elsewhere in this file.
                # The original range(1, len(tags) - 2) skipped index len-2 and
                # produced zero samples for 3-tag sentences.
                candidateIndices = range(1, len(origTags) - 1)
                sampleCount = min(len(candidateIndices), NUM_SAMPLES)
                for poppedTagIndex in random.sample(candidateIndices, sampleCount):
                    tags = origTags.copy()
                    tags.pop(poppedTagIndex)
                    # windows within 2 positions of the removal are "affected"
                    affectedRange = range(poppedTagIndex - 2, poppedTagIndex + 3)
                    windows = compressor.compressList(
                        generateTagWindows.makeWindows(tags, size=WIN_SIZE))
                    tagModWindows.update(
                        [w for i, w in enumerate(windows) if i in affectedRange])
                    tagWindows.update(
                        [w for i, w in enumerate(windows) if i not in affectedRange])
                    if not hasDisplayed:
                        # show one example per part as a sanity check
                        print(windows)
                        hasDisplayed = True
    lexFilename = "Lex" if LEX else ""
    with open("C:/MissingWord/post" + lexFilename + "Comp" + str(WIN_SIZE) + ".pickle", "wb") as f:
        pickle.dump(tagWindows, f)
    with open("C:/MissingWord/post" + lexFilename + "ModComp" + str(WIN_SIZE) + ".pickle", "wb") as f:
        pickle.dump(tagModWindows, f)
def correct(self, tokens):
    """Score each tag-window position of *tokens* for a likely missing word.

    Tags the sentence, optionally lexicalizes the tags, and for every
    window computes an add-one-smoothed probability that the window came
    from the "modified" (word-removed) distribution rather than the
    original one, plus a log-count confidence.

    Parameters:
        tokens: list of word strings forming one sentence.

    Returns:
        (probs, confidence): two parallel lists, one entry per window.
    """
    blob = TextBlob(' '.join(tokens), pos_tagger = self.aptagger)
    completeTags = generateTagWindows.getCompleteTags(blob)
    if self.lex:
        completeTags = lexicalizedTagWindows.lexicalizeTags(completeTags, tokens, self.toLexicalize)
    windows = generateTagWindows.makeWindows(completeTags, size = self.winSize)
    probs = []
    confidence = []
    for window in windows:
        # Hoist the count lookups: the original called winMod.count/
        # winOrig.count up to four times per window for the same value.
        modCount = self.winMod.count(window)
        origCount = self.winOrig.count(window)
        totalCount = modCount + origCount
        # add-one smoothing so unseen windows get a defined probability
        probs.append((modCount + 1) / (totalCount + 1))
        confidence.append(math.log(totalCount + 1))
    return probs, confidence
def main():
    """Count compressed lexicalized-tag windows over "damaged" sentences.

    For each training sentence, one random interior tag is removed and the
    resulting tag sequence is split into windows of WIN_SIZE, which are
    accumulated into a Counter and pickled. This builds the "modified"
    (word-removed) window distribution used by the corrector.
    """
    WIN_SIZE = 5
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)
    tagWindows = Counter()
    for part in range(0, 4):
        hasDisplayed = False
        # with-statements guarantee both handles are closed even if an
        # exception is raised mid-file (original leaked them).
        with open("C:/MissingWord/train/tagsPart" + str(part) + ".txt", "r", encoding="utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart" + str(part) + ".txt", "r", encoding="utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()
                if len(tagLine) == 0:
                    # end of this part
                    break
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                tags = lexicalizedTagWindows.lexicalizeTags(
                    tags, tokens, lexicalizedTags)
                if len(tags) >= 3:
                    # remove one random interior tag (never first or last)
                    tags.pop(random.randint(1, len(tags) - 2))
                    windows = compressor.compressList(
                        generateTagWindows.makeWindows(tags, size=WIN_SIZE))
                    tagWindows.update(windows)
                    if not hasDisplayed:
                        # show one example per part as a sanity check
                        print(windows)
                        hasDisplayed = True
    with open("C:/MissingWord/modLexComp" + str(WIN_SIZE) + ".pickle", "wb") as f:
        pickle.dump(tagWindows, f)
def main():
    """Count tag windows near and away from simulated word removals.

    For each sentence, up to NUM_SAMPLES interior tags are removed (one at a
    time). Windows overlapping the removal site go into tagModWindows; all
    other windows from the same damaged sentence go into tagWindows. Both
    Counters are pickled at the end.
    """
    WIN_SIZE = 4
    LEX = True
    # will bias the confidence, shifting it up as NUM_SAMPLES goes up
    NUM_SAMPLES = 5
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)
    tagModWindows = Counter()
    tagWindows = Counter()
    for part in range(0, 4):
        hasDisplayed = False
        # with-statements guarantee both handles are closed (original leaked them)
        with open("C:/MissingWord/train/tagsPart" + str(part) + ".txt", "r", encoding="utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart" + str(part) + ".txt", "r", encoding="utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()
                if len(tagLine) == 0:
                    break
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                # skip misaligned or too-short sentences
                if len(tags) != len(tokens) or len(tags) < 3:
                    continue
                if LEX:
                    origTags = lexicalizedTagWindows.lexicalizeTags(
                        tags, tokens, lexicalizedTags)
                else:
                    origTags = tags
                # Valid pop positions are 1 .. len-2 (keep first and last tag),
                # matching randint(1, len - 2) used elsewhere in this file.
                # The original range(1, len(tags) - 2) skipped index len-2 and
                # produced zero samples for 3-tag sentences.
                candidateIndices = range(1, len(origTags) - 1)
                sampleCount = min(len(candidateIndices), NUM_SAMPLES)
                for poppedTagIndex in random.sample(candidateIndices, sampleCount):
                    tags = origTags.copy()
                    tags.pop(poppedTagIndex)
                    # windows within 2 positions of the removal are "affected"
                    affectedRange = range(poppedTagIndex - 2, poppedTagIndex + 3)
                    windows = compressor.compressList(
                        generateTagWindows.makeWindows(tags, size=WIN_SIZE))
                    tagModWindows.update(
                        [w for i, w in enumerate(windows) if i in affectedRange])
                    tagWindows.update(
                        [w for i, w in enumerate(windows) if i not in affectedRange])
                    if not hasDisplayed:
                        # show one example per part as a sanity check
                        print(windows)
                        hasDisplayed = True
    lexFilename = "Lex" if LEX else ""
    with open("C:/MissingWord/post" + lexFilename + "Comp" + str(WIN_SIZE) + ".pickle", "wb") as f:
        pickle.dump(tagWindows, f)
    with open("C:/MissingWord/post" + lexFilename + "ModComp" + str(WIN_SIZE) + ".pickle", "wb") as f:
        pickle.dump(tagModWindows, f)
__author__ = 'SEOKHO'

import generateTagWindows
from collections import Counter
import pickle

# Flat script: count token bigrams (windows of size 2) across all four
# cleaned-corpus parts and pickle the resulting Counter.
counter = Counter()
for i in range(0, 4):
    with open("C:/MissingWord/train/cleanTokensPart" + str(i) + ".txt", "r", encoding='utf-8') as f:
        for index, line in enumerate(f):
            if index % 100000 == 0:
                print(index, len(counter))
            # Strip the trailing newline before splitting — the other readers
            # in this project do line.strip().split(" "); without it the last
            # token of every line carries "\n" and corrupts the bigram counts.
            counter.update(generateTagWindows.makeWindows(line.strip().split(" "), size = 2))
with open("C:/MissingWord/bigramCounts.pickle", "wb") as f:
    pickle.dump(counter, f)