# Example no. 1
# 0
def main():
    """Count compressed lexicalized tag windows over the training corpus.

    For each sentence, one random interior tag is popped to simulate a
    missing word; the resulting window counts are pickled as a Counter.
    """
    WIN_SIZE = 5
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)

    tagWindows = Counter()
    for part in range(0, 4):
        hasDisplayed = False
        # Context managers guarantee both corpus files are closed
        # (the original opened them and never closed them).
        with open("C:/MissingWord/train/tagsPart"+str(part)+".txt", "r", encoding = "utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart"+str(part)+".txt", "r", encoding = "utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()

                if len(tagLine) == 0:
                    break  # end of file
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                tags = lexicalizedTagWindows.lexicalizeTags(tags, tokens, lexicalizedTags)
                if len(tags) >= 3:
                    # Pop a random interior tag (never the first or last).
                    tags.pop(random.randint(1, len(tags) - 2))
                windows = compressor.compressList(generateTagWindows.makeWindows(tags, size = WIN_SIZE))
                tagWindows.update(windows)
                if not hasDisplayed:
                    # Print one sample per part as a sanity check.
                    print(windows)
                    hasDisplayed = True
    with open("C:/MissingWord/modLexComp"+str(WIN_SIZE)+".pickle", "wb") as f:
        pickle.dump(tagWindows, f)
def makeFeatures(synCor, synCorLex, synRepl, cutTokens, removedIndex):
    """Build the feature vector for the candidate removal position.

    Concatenates: the size-9 probability window from the unlexicalized
    corrector, its local-vs-global max gap, the same pair from the
    lexicalized corrector, a log-length feature, and the top five
    replacement probabilities from the repair model.
    """
    features = []

    # Unlexicalized corrector: probabilities around the removal point,
    # plus the gap between the best local and the best global probability.
    probs, _conf = synCor.correct(cutTokens)
    probWins = generateTagWindows.makeWindows(probs, size = 9, filler = 0.0)
    localWin = probWins[removedIndex]
    features.extend(localWin)
    features.append(max(localWin) - max(probs))

    # Lexicalized corrector: same window + gap features.
    lexProbs, _lexConf = synCorLex.correct(cutTokens)
    lexWins = generateTagWindows.makeWindows(lexProbs, size = 9, filler = 0.0)
    lexLocalWin = lexWins[removedIndex]
    features.extend(lexLocalWin)
    features.append(max(lexLocalWin) - max(lexProbs))

    # Sentence length on a log scale.
    features.append(math.log(len(cutTokens)))

    # Top five replacement probabilities from the repair model.
    features.extend(synRepl.fix(cutTokens, removedIndex)[:5])
    return features
# Example no. 3
# 0
def main():
    """Split compressed tag-window counts into 'modified' and 'clean' Counters.

    For each sentence, up to NUM_SAMPLES interior tags are popped one at a
    time; windows within two positions of the removal go to tagModWindows,
    all others to tagWindows. Both Counters are pickled.
    """
    WIN_SIZE = 4
    LEX = True
    # The original assigned 'lex' here and then unconditionally overwrote it
    # with "Lex" before any filename was built, so only "Lex" is ever used.
    lexFilename = "Lex" if LEX else ''
    NUM_SAMPLES = 5 #will bias the confidence, shifting it up as NUM_SAMPLES goes up
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)

    tagModWindows = Counter()  # windows overlapping the removal site
    tagWindows = Counter()     # windows untouched by the removal
    for part in range(0, 4):
        hasDisplayed = False
        # Context managers guarantee both corpus files are closed
        # (the original opened them and never closed them).
        with open("C:/MissingWord/train/tagsPart"+str(part)+".txt", "r", encoding = "utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart"+str(part)+".txt", "r", encoding = "utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()

                if len(tagLine) == 0:
                    break  # end of file
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                # Skip misaligned or too-short sentences.
                if len(tags) != len(tokens) or len(tags) < 3:
                    continue
                if LEX:
                    origTags = lexicalizedTagWindows.lexicalizeTags(tags, tokens, lexicalizedTags)
                else:
                    origTags = tags
                # Sample up to NUM_SAMPLES distinct interior positions to pop.
                candidates = range(1, len(tags) - 2)
                for poppedTagIndex in random.sample(candidates, min(len(candidates), NUM_SAMPLES)):
                    tags = origTags.copy()
                    tags.pop(poppedTagIndex)
                    # Windows within two positions of the removal count as modified.
                    affectedRange = range(poppedTagIndex - 2, poppedTagIndex + 3)
                    windows = compressor.compressList(generateTagWindows.makeWindows(tags, size = WIN_SIZE))
                    tagModWindows.update([window for index, window in enumerate(windows) if index in affectedRange])
                    tagWindows.update([window for index, window in enumerate(windows) if index not in affectedRange])
                    if not hasDisplayed:
                        # Print one sample per part as a sanity check.
                        print(windows)
                        hasDisplayed = True

    with open("C:/MissingWord/post"+lexFilename+"Comp"+str(WIN_SIZE)+".pickle", "wb") as f:
        pickle.dump(tagWindows, f)

    with open("C:/MissingWord/post"+lexFilename+"ModComp"+str(WIN_SIZE)+".pickle", "wb") as f:
        pickle.dump(tagModWindows, f)
 def correct(self, tokens):
     """POS-tag *tokens*, window the tag sequence, and score every window.

     Returns ``(probs, confidence)``: for each window, ``probs`` holds the
     add-one-smoothed probability that the window came from the modified
     (word-removed) corpus, and ``confidence`` the log of the total
     evidence count for that window.
     """
     blob = TextBlob(' '.join(tokens), pos_tagger = self.aptagger)
     completeTags = generateTagWindows.getCompleteTags(blob)
     if self.lex == True:
         completeTags = lexicalizedTagWindows.lexicalizeTags(completeTags, tokens, self.toLexicalize)
     windows = generateTagWindows.makeWindows(completeTags, size = self.winSize)
     probs = []
     confidence = []
     for window in windows:
         # Hoist the count lookups: the original re-evaluated
         # winMod.count/winOrig.count up to three times per window.
         modCount = self.winMod.count(window)
         origCount = self.winOrig.count(window)
         total = modCount + origCount
         # Add-one smoothing so unseen windows still get a defined probability.
         probs.append((modCount + 1) / (total + 1))
         confidence.append(math.log(total + 1))
     return probs, confidence
# Example no. 5
# 0
def main():
    """Count compressed lexicalized tag windows over the training corpus.

    For each sentence, one random interior tag is popped to simulate a
    missing word; the resulting window counts are pickled as a Counter.
    """
    WIN_SIZE = 5
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)

    tagWindows = Counter()
    for part in range(0, 4):
        hasDisplayed = False
        # Context managers guarantee both corpus files are closed
        # (the original opened them and never closed them).
        with open("C:/MissingWord/train/tagsPart" + str(part) + ".txt",
                  "r",
                  encoding="utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart" + str(part) + ".txt",
                  "r",
                  encoding="utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()

                if len(tagLine) == 0:
                    break  # end of file
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                tags = lexicalizedTagWindows.lexicalizeTags(
                    tags, tokens, lexicalizedTags)
                if len(tags) >= 3:
                    # Pop a random interior tag (never the first or last).
                    tags.pop(random.randint(1, len(tags) - 2))
                windows = compressor.compressList(
                    generateTagWindows.makeWindows(tags, size=WIN_SIZE))
                tagWindows.update(windows)
                if not hasDisplayed:
                    # Print one sample per part as a sanity check.
                    print(windows)
                    hasDisplayed = True
    with open("C:/MissingWord/modLexComp" + str(WIN_SIZE) + ".pickle",
              "wb") as f:
        pickle.dump(tagWindows, f)
# Example no. 6
# 0
def main():
    """Split compressed tag-window counts into 'modified' and 'clean' Counters.

    For each sentence, up to NUM_SAMPLES interior tags are popped one at a
    time; windows within two positions of the removal go to tagModWindows,
    all others to tagWindows. Both Counters are pickled.
    """
    WIN_SIZE = 4
    LEX = True
    # The original assigned 'lex' here and then unconditionally overwrote it
    # with "Lex" before any filename was built, so only "Lex" is ever used.
    lexFilename = "Lex" if LEX else ''
    NUM_SAMPLES = 5  #will bias the confidence, shifting it up as NUM_SAMPLES goes up
    compressor = WindowCompressor()
    with open("toLexicalize.pickle", "rb") as f:
        lexicalizedTags = pickle.load(f)

    tagModWindows = Counter()  # windows overlapping the removal site
    tagWindows = Counter()  # windows untouched by the removal
    for part in range(0, 4):
        hasDisplayed = False
        # Context managers guarantee both corpus files are closed
        # (the original opened them and never closed them).
        with open("C:/MissingWord/train/tagsPart" + str(part) + ".txt",
                  "r",
                  encoding="utf-8") as tagsFile, \
             open("C:/MissingWord/train/corpusPart" + str(part) + ".txt",
                  "r",
                  encoding="utf-8") as tokensFile:
            numLines = 0
            while True:
                numLines += 1
                if numLines % 10000 == 0:
                    print(numLines, len(tagWindows))
                tagLine = tagsFile.readline()
                tokenLine = tokensFile.readline()

                if len(tagLine) == 0:
                    break  # end of file
                tags = tagLine.strip().split("|")
                tokens = tokenLine.strip().split(" ")
                # Skip misaligned or too-short sentences.
                if len(tags) != len(tokens) or len(tags) < 3:
                    continue
                if LEX:
                    origTags = lexicalizedTagWindows.lexicalizeTags(
                        tags, tokens, lexicalizedTags)
                else:
                    origTags = tags
                # Sample up to NUM_SAMPLES distinct interior positions to pop.
                candidates = range(1, len(tags) - 2)
                for poppedTagIndex in random.sample(
                        candidates, min(len(candidates), NUM_SAMPLES)):
                    tags = origTags.copy()
                    tags.pop(poppedTagIndex)
                    # Windows within two positions of the removal count as modified.
                    affectedRange = range(poppedTagIndex - 2,
                                          poppedTagIndex + 3)
                    windows = compressor.compressList(
                        generateTagWindows.makeWindows(tags, size=WIN_SIZE))
                    tagModWindows.update([
                        window for index, window in enumerate(windows)
                        if index in affectedRange
                    ])
                    tagWindows.update([
                        window for index, window in enumerate(windows)
                        if index not in affectedRange
                    ])
                    if not hasDisplayed:
                        # Print one sample per part as a sanity check.
                        print(windows)
                        hasDisplayed = True

    with open(
            "C:/MissingWord/post" + lexFilename + "Comp" + str(WIN_SIZE) +
            ".pickle", "wb") as f:
        pickle.dump(tagWindows, f)

    with open(
            "C:/MissingWord/post" + lexFilename + "ModComp" + str(WIN_SIZE) +
            ".pickle", "wb") as f:
        pickle.dump(tagModWindows, f)
__author__ = 'SEOKHO'

import generateTagWindows
from collections import Counter
import pickle

counter = Counter()

# Count bigram (size-2) token windows across all four cleaned corpus parts
# and pickle the resulting Counter.
for i in range(0, 4):
    with open("C:/MissingWord/train/cleanTokensPart"+str(i)+".txt", "r", encoding = 'utf-8') as f:
        for index, line in enumerate(f):
            if index % 100000 == 0:
                print(index, len(counter))
            # strip() removes the trailing newline, which the original left
            # glued onto the last token of every line; this matches the
            # .strip().split(" ") tokenization used elsewhere in this project.
            counter.update(generateTagWindows.makeWindows(line.strip().split(" "), size = 2))

with open("C:/MissingWord/bigramCounts.pickle", "wb") as f:
    pickle.dump(counter, f)