Example #1
0
 def __init__(self):
     """Initialise the parser with empty tables and fresh counters.

     Nothing is read or trained here; every container starts empty and is
     expected to be filled by other methods of the class.
     """
     # Parameter tables for binary and unary rules
     # (presumably rule -> probability; confirm against the training code).
     self.binary_q = {}
     self.unary_q = {}
     # Two independent Counts instances.
     self.counter = Counts()
     self.counter_n = Counts()
     # Both start out as empty lists.
     self.pi = []
     self.bp = []
     self.parser = {}
     self.use_vert = False
     # Original author's note: 0.95 was the best value found so far;
     # the default here is 0.
     self.beta = 0
     # Raw string so the backslashes reach the regex engine unchanged and the
     # literal contains no invalid escape sequences. The pattern matches "^<",
     # one or more capital letters, any run of '+'/capital letters, then ">".
     self.vm = r"\^\<[A-Z]+[+A-Z]*\>"
     self.vocab = {}
Example #2
0
    def __init__(self, parse_file):
        """Count grammar rules from a treebank file and set up empty parse tables.

        Args:
            parse_file: Path of a text file containing one JSON-encoded tree
                per line.
        """
        counter = Counts()
        # The context manager guarantees the file is closed once counting
        # finishes, including when a line fails to decode.
        with open(parse_file) as tree_file:
            for line in tree_file:
                counter.count(json.loads(line))

        # N is 'Non-terminal': N[symbol] = count
        self.N = counter.nonterm
        # binary_R is 'binary Rule': binary_R[symbol, y1, y2] = count
        self.binary_R = counter.binary
        # unary_R has the form: unary_R[symbol, word] = count
        self.unary_R = counter.unary
        # V is the 'vocabulary' (the original author notes 245 words for
        # their training corpus).
        self.V = counter.vocabulary
        # pi[i, i, x] = probability
        # 'pi' is a dictionary, 'i' is the index of the word in the sentence
        # and 'x' is the non-terminal symbol; e.g. pi[i, i, x] = 0.03 means the
        # word at 'i' is assigned to 'x' with probability 0.03.
        self.pi = {}
        # 'bp' stands for 'back pointer': a dictionary that maps a table cell
        # to its best rule and best split, e.g. bp[i, i, x] = [(x, word), i]
        self.bp = {}
        # binary_table collects the binary rules derived from one symbol,
        # e.g. binary_table['S'] = [('NP', 'VP'), ('NP', 'VP+VERB'), ...]
        self.binary_table = defaultdict(set)
        self.initialize_binary_table()
Example #3
0
 def __init__(self):
     """Create a parser whose tables are all empty and whose counter is fresh."""
     # Binary and unary rule tables start out as separate empty dicts.
     self.binary_q, self.unary_q = {}, {}
     self.counter = Counts()
     # pi and bp both begin as empty lists.
     self.pi, self.bp = [], []
     self.parser = {}
     self.use_vert = False
Example #4
0
import json
from count_cfg_freq import Counts

# Module-level rule counter; loadData passes every parsed tree to counter.count().
counter = Counts()
# Maps each word to the number of times loadData has seen it.
vocab = {}


def loadData(corpus_file):
    """Feed every tree in *corpus_file* to the counter and tally its words.

    *corpus_file* is an open file-like object read with ``readline`` until it
    returns an empty value. Each line is decoded with ``json.loads``, handed
    to the module-level ``counter``, and the words found by ``extract_tag``
    are added to the module-level ``vocab`` frequency table.

    Returns None; the results live in ``counter`` and ``vocab``.
    """
    while True:
        line = corpus_file.readline()
        if not line:
            break
        tree = json.loads(line)
        counter.count(tree)
        # Keep only the word half of each (tag, word) pair.
        words = [word for _tag, word in extract_tag(tree)]
        for word in words:
            vocab[word] = vocab.get(word, 0) + 1


def extract_tag(t):
    """Return the (tag, word) pairs at the leaves of tree *t*, left to right.

    A node of length 2 is treated as a leaf ``[tag, word]``; a node of length 3
    is treated as ``[symbol, left, right]`` and both children are visited.
    Any other length yields None.
    """
    size = len(t)
    if size == 2:
        return [(t[0], t[1])]
    if size == 3:
        left_pairs = extract_tag(t[1])
        right_pairs = extract_tag(t[2])
        return left_pairs + right_pairs
    return None


def tagRareWord(corpus_file, new_corpus_file):
    l = corpus_file.readline()
    while (l):