def sqrt_PCFG(G: PCFG):
    """Return a PCFG whose rule probabilities are the renormalized square roots of G's.

    Taking sqrt of each probability yields a weighted CFG (weights no longer
    sum to 1), so the weights are renormalized with the grammar's partition
    function to recover a proper PCFG.
    """
    # Step 1: square-root every rule weight. The result is a WCFG, not yet
    # a probability distribution over derivations.
    sqrt_rules = {
        lhs: {fn: (body[0], body[1] ** 0.5) for fn, body in productions.items()}
        for lhs, productions in G.rules.items()
    }
    wcfg = PCFG(start=G.start, rules=sqrt_rules)

    # Step 2: renormalize each rule by the partition function of its
    # nonterminals: P(A -> f(B1..Bk)) = w * prod(Z[Bi]) / Z[A].
    Z = compute_partition_function(wcfg)
    normalized = {}
    for lhs, productions in wcfg.rules.items():
        renorm = {}
        for fn, (args, weight) in productions.items():
            scale = prod(Z[arg] for arg in args)
            renorm[fn] = (args, weight * scale / Z[lhs])
        normalized[lhs] = renorm
    return PCFG(G.start, normalized)
def __init__(self, corpus_train):
    """Build the parser's lookup tables from a training corpus.

    Sets up:
      - self.PCFG: grammar and lexicon learned from corpus_train
      - self.OOV: out-of-vocabulary word handler
      - self.tag_to_id: tag -> integer id (for fast array-style indexing)
      - self.lexicon_inverted: word -> {tag: P(tag -> word)}
      - self.grammar_dicts: grammar_dicts[X][Y][Z] = P(rule X -> Y Z),
        with every symbol replaced by its integer id.
    """
    self.PCFG = PCFG(corpus_train)
    self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags,
                   self.PCFG.freq_tokens)
    self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}

    # Invert the lexicon: for each word, which tags can emit it and with
    # what probability.
    self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
    for tag in self.PCFG.lexicon:
        for word, proba in self.PCFG.lexicon[tag].items():
            self.lexicon_inverted[word][tag] = proba

    # self.grammar_dicts[X][Y][Z] stores P(rule X->YZ)
    self.grammar_dicts = {}
    for (root_tag, rules) in self.PCFG.grammar.items():
        # root_tag is the left hand tag of the grammar rule
        idx_root_tag = self.tag_to_id[root_tag]
        dico = {}
        for (split, proba) in rules.items():
            # split is the right hand term, and proba the probability of the rule
            idx_left_tag = self.tag_to_id[split[0]]
            idx_right_tag = self.tag_to_id[split[1]]
            # setdefault replaces the original membership-test-then-insert
            # (`in dico.keys()`) with a single lookup.
            dico.setdefault(idx_left_tag, {})[idx_right_tag] = proba
        self.grammar_dicts[idx_root_tag] = dico
def multi_f(sentence):
    """Load the grammar named on the command line, parse one sentence, and print the JSON tree."""
    model_path = sys.argv[1]
    grammar = PCFG()
    grammar.load_model(model_path)
    tree = Parser(grammar).parse(sentence)
    print(dumps(tree))
def pcfg(sentence):
    """Parse *sentence* with the PCFG rules from rules.txt and print the parse trees.

    Fixes: local variable renamed (the original `pcfg = PCFG()` shadowed this
    function's own name), dead trailing `pass` and commented-out debug call removed.
    """
    grammar = PCFG()
    grammar.readCFGRules(FilePath.ROOT + "rules.txt")
    grammar.parse(sentence)
    grammar.showTrees()
def __init__(self, corpus):
    """Set up the learned grammar, the OOV handler, and empty CYK state."""
    # Learn grammar rules and lexicon from the training corpus.
    self.pcfg = PCFG(corpus)
    # The OOV module resolves words absent from the learned lexicon.
    self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags, self.pcfg.tokens)
    # CYK probability/backpointer tables are allocated lazily at parse time.
    self.proba_matrix = None
    self.cyk_matrix = None
def main():
    """Two-pass PCFG training: count on raw trees, rewrite rare words, re-count, write model.

    Fixes: every `open(...)` is now wrapped in a `with` block so file handles
    are closed deterministically (the originals were never closed).
    """
    train_data_filename = 'parse_train.dat'
    train_rare_filename = 'p1.train.rare.dat'
    pcfg_model_filename = 'parser_train.counts.out'

    # First pass: count rules and word frequencies to identify rare words.
    pcfg = PCFG()
    with open(train_data_filename) as f:
        for l in f:
            t = json.loads(l)
            pcfg.count(t)
    pcfg.count_word()

    # Rewrite the training data, replacing rare words per rare_words_rule_p1.
    with open(train_data_filename) as src, open(train_rare_filename, 'w') as dst:
        process_rare_words(src, dst, pcfg.rare_words, rare_words_rule_p1)

    # Second pass: re-count on the rewritten trees and fit rule parameters.
    new_pcfg = PCFG()
    with open(train_rare_filename) as f:
        for l in f:
            t = json.loads(l)
            new_pcfg.count(t)
    new_pcfg.cal_rule_params()
    with open(pcfg_model_filename, 'w') as out:
        new_pcfg.write(out)
def train(train_data_filename, train_rare_filename, pcfg_model_filename, rare_words_rule):
    """Train a PCFG in two passes and return the final model.

    Pass 1 counts rules/words to find rare words; the corpus is then rewritten
    with rare_words_rule applied; pass 2 re-counts and fits rule parameters,
    writing them to pcfg_model_filename.

    Fixes: file handles are closed via `with` (originals leaked); single-arg
    prints parenthesized — output is identical on Python 2 and also valid on
    Python 3.
    """
    print('train PCFG model')
    pcfg = PCFG()
    with open(train_data_filename) as f:
        for l in f:
            t = json.loads(l)
            pcfg.count(t)
    pcfg.count_word()

    print('process rare word')
    with open(train_data_filename) as src, open(train_rare_filename, 'w') as dst:
        process_rare_words(src, dst, pcfg.rare_words, rare_words_rule)

    print('train PCFG model again')
    new_pcfg = PCFG()
    with open(train_rare_filename) as f:
        for l in f:
            t = json.loads(l)
            new_pcfg.count(t)
    new_pcfg.cal_rule_params()
    with open(pcfg_model_filename, 'w') as out:
        new_pcfg.write(out)
    return new_pcfg
def build_model():
    """Load the cached PCFG model if present; otherwise learn one from the
    treebanks, hold out a sample of sentences for testing, and cache the model.

    Python 2 code (print statements). Returns the PCFG instance either way.
    """
    pcfg = PCFG()
    if exists(MODEL):
        # Fast path: reuse the previously saved model.
        pcfg.load_model(MODEL)
    else:
        print "Building the Grammar Model"
        start = time()
        if not exists(TEMP_DIR):
            makedirs(TEMP_DIR)
        # Normalise the treebanks (done once; the normalised files are cached on disk).
        if not exists(QUESTIONBANK_NORM):
            normalize_questionbank(QUESTIONBANK_DATA, QUESTIONBANK_PENN_DATA)
            gen_norm(QUESTIONBANK_NORM, [QUESTIONBANK_PENN_DATA])
        if not exists(PENNTREEBANK_NORM):
            gen_norm(PENNTREEBANK_NORM, glob(PENNTREEBANK_GLOB))
        # Keep a part of the treebanks for testing: roughly every 100th tree
        # whose sentence length is in (7, 20) goes to TEST_DAT/TEST_KEY; the
        # `i -= 1` retries the next tree when a candidate fails the length filter.
        # NOTE(review): as reconstructed, every tree (including held-out ones)
        # is also written to MODEL_TREEBANK — confirm this overlap is intended.
        # NOTE(review): `open(treebank)` is never closed — consider `with`.
        i = 0
        with open(MODEL_TREEBANK, 'w') as model, open(TEST_DAT, 'w') as dat, open(TEST_KEY, 'w') as key:
            for treebank in [QUESTIONBANK_NORM, PENNTREEBANK_NORM]:
                for tree in open(treebank):
                    i += 1
                    if (i % 100) == 0:
                        sentence, n = get_sentence(loads(tree))
                        if n > 7 and n < 20:
                            dat.write(sentence+'\n')
                            key.write(tree)
                        else:
                            i -= 1
                    model.write(tree)
        # Learn PCFG from the model treebank and cache it for next time.
        pcfg.learn_from_treebanks([MODEL_TREEBANK])
        pcfg.save_model(MODEL)
        print "Time: (%.2f)s\n" % (time() - start)
    return pcfg
def __init__(self, corpus_train):
    """Build the parser's id-indexed lookup tables from a training corpus.

    Sets up:
      - self.PCFG: grammar and lexicon learned from corpus_train
      - self.OOV: out-of-vocabulary word handler
      - self.symbol_to_id: symbol -> integer id
      - self.grammar_ids: grammar_ids[X][Y][Z] = P(rule X -> Y Z)
      - self.lexicon_inverted: word -> {tag: P(tag -> word)}
    """
    self.PCFG = PCFG(corpus_train)
    self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols,
                   self.PCFG.freq_tokens)

    # note : if the id of a symbol is above self.PCFG.nb_tags,
    # it's an artificial symbol introduced with Chomsky normalization
    self.symbol_to_id = {
        symbol: i
        for (i, symbol) in enumerate(self.PCFG.list_all_symbols)
    }

    # Instead of storing tags, we store grammar rules with their corresponding
    # indices, with an additional hierarchical level for speed:
    # self.grammar_ids[X][Y][Z] stores P(rule X->YZ), where self.grammar_ids,
    # self.grammar_ids[X] and self.grammar_ids[X][Y] are all dictionaries.
    self.grammar_ids = {}
    for (root_tag, rules) in self.PCFG.grammar.items():
        # root_tag is the left hand symbol of the grammar rule
        idx_root_tag = self.symbol_to_id[root_tag]
        dico = {}
        for (split, proba) in rules.items():
            # split is the right hand term, and proba the probability of the rule
            idx_left_tag = self.symbol_to_id[split[0]]
            idx_right_tag = self.symbol_to_id[split[1]]
            # setdefault replaces the original membership-test-then-insert
            # (`in dico.keys()`) with a single lookup.
            dico.setdefault(idx_left_tag, {})[idx_right_tag] = proba
        self.grammar_ids[idx_root_tag] = dico

    # For a given word, which are its tags with the corresponding
    # probabilities P(tag -> word)? This is what self.lexicon_inverted stores.
    self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
    for tag in self.PCFG.lexicon:
        for word, proba in self.PCFG.lexicon[tag].items():
            self.lexicon_inverted[word][tag] = proba
def run(args):
    """Train a PCFG on the treebank and run the actions selected by the CLI flags."""
    # Load data: treebank splits plus pretrained word embeddings for OOV handling.
    data = loader.load_treebanks(TREEBANK_PATH)
    train_data, dev_data, test_data = loader.train_test_split(data, 0.8, 0.1, 0.1)
    words, embeddings = loader.load_word_embeddings(EMBEDDING_PATH)

    # Fit the grammar and attach the OOV module.
    pcfg = PCFG(train_data)
    pcfg.train(train_data)
    pcfg.set_oov(OOV, words, embeddings)

    # Optional actions, each gated by its command-line flag.
    if args.generate_output:
        output = pcfg.generate_output(test_data)
    if args.evaluation:
        accs, nb_no_parse = pcfg.predict(test_data[:2])
    if args.parse:
        with open(args.txt_path, 'r') as f:
            corpus = f.read().split('\n')
        pcfg.parse_from_txt(corpus)
def run(args):
    """Drive the PCFG parser pipeline according to the CLI flags in *args*."""
    has_effect = False  # never set to True; preserved to mirror the original flow

    # Guard clause: nothing to do without arguments.
    if not args:
        if not has_effect:
            logger.error(
                "Script halted without any effect. To run code, use command:\npython3 main.py <args>"
            )
        return

    try:
        # Build the parser from the training corpus and attach OOV handling.
        train_corpus, val_corpus, test_corpus = data.get_train_val_test()
        words, embeddings = data.get_polyglot_words_embeddings()
        parser = PCFG()
        parser.learn_probabilities_and_rules(train_corpus)
        parser.set_oov_module(OovModule, words, embeddings)

        # Optional actions, each gated by its flag.
        if args.inference:
            get_gold(parser, test_corpus, filename='evaluation_data.gold')
            get_predictions(parser, test_corpus, filename='evaluation_data.parser_output')
        if args.evaluation:
            evaluation('evaluation_data.gold', 'evaluation_data.parser_output')
        if args.parse:
            parser.parse_from_txt(args.txt_path)
    except Exception as e:
        # Top-level boundary: log the full traceback and a friendly message.
        logger.exception(e)
        logger.error("Uhoh, the script halted with an error.")
def multi_f(sentence):
    """Worker: load the grammar named on the command line and parse one sentence.

    Each pool worker re-loads the model itself, so the PCFG object never has
    to be pickled across process boundaries.
    """
    model_path = sys.argv[1]
    grammar = PCFG()
    grammar.load_model(model_path)
    return Parser(grammar).parse(sentence)


def _main():
    # Usage check: exactly one argument, the grammar file.
    if len(sys.argv) != 2:
        print("usage: python3 parser.py GRAMMAR")
        exit()
    start = time()
    grammar_file = sys.argv[1]
    print("Loading grammar from " + grammar_file + " ...", file=stderr)
    grammar = PCFG()
    grammar.load_model(grammar_file)
    parser = Parser(grammar)
    print("Parsing sentences ...", file=stderr)
    # Parse stdin sentences in parallel, one worker per CPU.
    with Pool(processes=os.cpu_count()) as pool:
        trees = pool.map(multi_f, stdin.readlines())
    for t in trees:
        print(dumps(t))
    print("Time: (%.2f)s\n" % (time() - start), file=stderr)


if __name__ == "__main__":
    _main()
from pcfg import PCFG
import argparse

# Command-line interface: train a grammar on a treebank, then parse raw
# sentences and write the predictions to the output file.
parser = argparse.ArgumentParser()
for flag, description in (
    ("--corpus", "training treebank corpus"),
    ("--sentences", "raw token sentences"),
    ("--outfile", "name of the output file"),
):
    parser.add_argument(flag, help=description, type=str)
args = parser.parse_args()

grammar = PCFG(args.corpus)
grammar.parse_corpus()
grammar.predict(args.sentences, args.outfile)
sys.exit(1) # load the train file to trees trees = [] f = open(trainfilename, 'r') for line in f: trees.append(nltk.Tree.fromstring(line)) # preprocss the tree forms: ignore functional labels and binarize to CNF for tree in trees: # ignore_func_labels(tree) tree.chomsky_normal_form(horzMarkov=2) # tree.chomsky_normal_form() # learn PCFG lexicon, grammar, vocabulary, symbols = PCFG(trees) # print(grammar) # for OOV oovwords = OOV(embedfilename, vocabulary) # parse new sentences using CYK based on learned PCFG # parser = CYKSolver(lexicon, grammar, vocabulary, symbols, oovwords) # i = 0 for line in sys.stdin: # print('start parse') # print(line) # start = time.time() # if line == '\n': continue # cyksolver = CYK(line.split(), lexicon, grammar, vocabulary, symbols, embedfilename)
def multi_f(sentence):
    """Worker function: build a parser from the grammar named on the command line and parse one sentence."""
    model_path = sys.argv[1]
    grammar = PCFG()
    grammar.load_model(model_path)
    return Parser(grammar).parse(sentence)
# Python 2 fragment: either generate null lexicons from a language model
# ("generate") or fall through to the evaluation output path.
if args.fnc == "generate":
    print "### GENERATING WORDS ###"
    print "model:", args.model, args.n
    print "language:", args.lang
    # Pick the language model implementation from --model.
    if args.model == "nphone":
        lm = NgramModel(args.n, corpus, 1)
    elif args.model == "nsyll":
        # The nsyll model requires a syllabified lexicon file.
        if args.lex.startswith("celexes/syll"):
            lm = NsyllModel(args.n, corpus, 1)
        else:
            print "Use syll__ file for this model"
            sys.exit()
    elif args.model == "pcfg":
        if args.lex.startswith("celexes/pcfg"):
            # Run the external PCFG inducer; its output grammar is then loaded.
            # NOTE(review): the stdout file handle is never closed — consider `with`.
            print call(["./pcfg/io","-d","1","-g", args.grammar, args.lex],stdout = open('grammars/gram_pcfg.wlt', 'w'))
            lm = PCFG('grammars/gram_pcfg.wlt')
            # PCFG input is space-free: strip spaces from every corpus entry.
            corpus = [re.sub(" ","",x) for x in corpus]
        else:
            print "Use pcfg__ file for this model"
            sys.exit()
    lm.create_model(corpus, args.smoothing)
    # Output filename encodes every generation parameter.
    o = "Lexicons/lex_" + args.lex.split("/")[-1][:-4] + "_cv" + str(args.cv) + "_iter" + str(args.iter) + "_m" + args.model + "_n" + str(args.n) + "_smoothing" + str(args.smoothing) + ".txt"
    # NOTE(review): `args.h**o` looks like a mangled attribute name (possibly
    # `args.homo`); as written it parses as `args.h ** o` and would raise a
    # TypeError at runtime — confirm against the argparse setup.
    lexfile = write_lex_file(o, corpus, args.cv, args.iter, lm, args.h**o)
    print "null lexicons wrote on", lexfile
    print "### WRITING RESULTS ###"
    write_all(lexfile, args.graph, args.lang)
else:
    # Evaluation path: output filename mirrors the generation naming scheme.
    # NOTE(review): this branch is truncated in the visible source.
    o = "evaluation/eval_" + args.lex.split("/")[-1][:-4] + "_cv" + str(args.cv) + "_iter" + str(args.iter) + "_m" + args.model + "_n" + str(args.n) + "_smoothing" + str(args.smoothing)+ ".txt"
    out = open(o, 'w')
# !/usr/bin/env python3 # -*- coding: utf-8 -*- # -------------------------------------------# # main.py # # author: sean lee # # qq: 929325776 # # email: [email protected] # #--------------------------------------------# from pcfg import PCFG parser = PCFG() parser.fit('./corpus/toy/train.txt') parser.parse("the man saw the dog") ''' print(parser.N_dict) print(parser.NR_dict) print(parser.TR_dict) '''