def __init__(self):
    """Initialise every attribute of the instance to its empty/default value.

    Takes no arguments besides ``self``. Two separate ``Counts`` objects
    are created; all other attributes start as empty containers, ``False``,
    ``0`` or a fixed pattern string.
    """
    # Tables keyed per rule; presumably hold rule parameters for binary and
    # unary rules (only their empty initial state is visible here).
    self.binary_q = {}
    self.unary_q = {}

    # Two independent Counts accumulators.
    self.counter = Counts()
    self.counter_n = Counts()

    # Chart and back-pointer containers start out as empty lists.
    self.pi = []
    self.bp = []
    self.parser = {}

    self.use_vert = False
    # Original author's note: "0.95 curr best" -- presumably the best value
    # found so far for beta; TODO confirm against the tuning code.
    self.beta = 0

    # Pattern text matching a literal "^<", one or more upper-case letters,
    # any further run of "+"/upper-case letters, then ">" (e.g. "^<NP+VP>").
    # Written as a raw string: the value is unchanged, but the backslashes
    # no longer form invalid string escape sequences.
    self.vm = r"\^\<[A-Z]+[+A-Z]*\>"
    self.vocab = {}
def __init__(self, parse_file):
    """Count the trees in ``parse_file`` and set up the parsing tables.

    Args:
        parse_file: Path of a text file with one JSON-encoded tree per line.

    Raises:
        OSError: If ``parse_file`` cannot be opened.
        ValueError: If a line is not valid JSON (``json.loads`` raises
            ``json.JSONDecodeError``, a ``ValueError`` subclass).
    """
    counter = Counts()
    # The context manager guarantees the file is closed, even when a line
    # fails to decode part-way through the file.
    with open(parse_file) as tree_lines:
        for line in tree_lines:
            counter.count(json.loads(line))

    # N is 'Non-terminal' => N[symbol] = count
    self.N = counter.nonterm
    # binary_R is 'binary Rule' => binary_R[symbol, y1, y2] = count
    self.binary_R = counter.binary
    # unary_R has the form => unary_R[symbol, word] = count
    self.unary_R = counter.unary
    # V is the 'vocabulary'. Original author's note: 245 words (presumably
    # for the author's training file -- not a fixed property of the code).
    self.V = counter.vocabulary

    # pi[i, i, x] = probability
    # 'pi' is a dictionary; 'i' is the index of the word in the sentence and
    # 'x' is the non-terminal symbol. E.g. pi[i, i, x] = 0.03 means the word
    # at 'i' is assigned to 'x' with probability 0.03.
    self.pi = {}

    # 'bp' stands for 'back pointer': a dictionary that maps the best rule
    # and best split to the binary table, e.g. bp[i, i, x] = [(x, word), i]
    self.bp = {}

    # binary_table collects the binary rules derived from one symbol, e.g.
    # binary_table['S'] = [('NP', 'VP'), ('NP', 'VP+VERB'), ...]
    self.binary_table = defaultdict(set)
    # Defined elsewhere in the class; presumably fills self.binary_table.
    self.initialize_binary_table()
def __init__(self):
    """Give every attribute of a new instance its empty/default value."""
    # Each name gets its own fresh dict; the two are never shared.
    self.binary_q, self.unary_q = {}, {}

    # Single Counts accumulator for this instance.
    self.counter = Counts()

    # Chart and back-pointer containers start as separate empty lists.
    self.pi, self.bp = [], []

    self.parser = {}
    self.use_vert = False
import json from count_cfg_freq import Counts counter = Counts() vocab = {} def loadData(corpus_file): l = corpus_file.readline() while (l): t = json.loads(l) counter.count(t) wordList = [y for x, y in extract_tag(t)] for word in wordList: if word not in vocab: vocab[word] = 0 vocab[word] += 1 #for tag in tagList l = corpus_file.readline() def extract_tag(t): if len(t) == 3: return extract_tag(t[1]) + extract_tag(t[2]) if len(t) == 2: return [(t[0], t[1])] def tagRareWord(corpus_file, new_corpus_file): l = corpus_file.readline() while (l):