def sqrt_PCFG(G: PCFG):
    """Return a PCFG whose rule probabilities are the renormalized square roots of G's.

    Taking sqrt of each probability yields a weighted CFG (weights no longer
    sum to 1), so the weights are renormalized with the grammar's partition
    function to recover a proper PCFG.
    """
    # Step 1: square-root every rule weight. The result is a WCFG, not yet
    # a probability distribution over derivations.
    sqrt_rules = {
        lhs: {fn: (body[0], body[1] ** 0.5) for fn, body in productions.items()}
        for lhs, productions in G.rules.items()
    }
    wcfg = PCFG(start=G.start, rules=sqrt_rules)

    # Step 2: renormalize each rule by the partition function of its
    # nonterminals: P(A -> f(B1..Bk)) = w * prod(Z[Bi]) / Z[A].
    Z = compute_partition_function(wcfg)
    normalized = {}
    for lhs, productions in wcfg.rules.items():
        renorm = {}
        for fn, (args, weight) in productions.items():
            scale = prod(Z[arg] for arg in args)
            renorm[fn] = (args, weight * scale / Z[lhs])
        normalized[lhs] = renorm
    return PCFG(G.start, normalized)
def __init__(self, corpus_train):
    """Build the parser's lookup tables from a training corpus.

    Sets up:
      - self.PCFG: grammar and lexicon learned from corpus_train
      - self.OOV: out-of-vocabulary word handler
      - self.tag_to_id: tag -> integer id (for fast array-style indexing)
      - self.lexicon_inverted: word -> {tag: P(tag -> word)}
      - self.grammar_dicts: grammar_dicts[X][Y][Z] = P(rule X -> Y Z),
        with every symbol replaced by its integer id.
    """
    self.PCFG = PCFG(corpus_train)
    self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags,
                   self.PCFG.freq_tokens)
    self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}

    # Invert the lexicon: for each word, which tags can emit it and with
    # what probability.
    self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
    for tag in self.PCFG.lexicon:
        for word, proba in self.PCFG.lexicon[tag].items():
            self.lexicon_inverted[word][tag] = proba

    # self.grammar_dicts[X][Y][Z] stores P(rule X->YZ)
    self.grammar_dicts = {}
    for (root_tag, rules) in self.PCFG.grammar.items():
        # root_tag is the left hand tag of the grammar rule
        idx_root_tag = self.tag_to_id[root_tag]
        dico = {}
        for (split, proba) in rules.items():
            # split is the right hand term, and proba the probability of the rule
            idx_left_tag = self.tag_to_id[split[0]]
            idx_right_tag = self.tag_to_id[split[1]]
            # setdefault replaces the original membership-test-then-insert
            # (`in dico.keys()`) with a single lookup.
            dico.setdefault(idx_left_tag, {})[idx_right_tag] = proba
        self.grammar_dicts[idx_root_tag] = dico
def multi_f(sentence):
    """Load the grammar named on the command line, parse one sentence, and print the JSON tree."""
    model_path = sys.argv[1]
    grammar = PCFG()
    grammar.load_model(model_path)
    tree = Parser(grammar).parse(sentence)
    print(dumps(tree))
def pcfg(sentence):
    """Parse *sentence* with the PCFG rules from rules.txt and print the parse trees.

    Fixes: local variable renamed (the original `pcfg = PCFG()` shadowed this
    function's own name), dead trailing `pass` and commented-out debug call removed.
    """
    grammar = PCFG()
    grammar.readCFGRules(FilePath.ROOT + "rules.txt")
    grammar.parse(sentence)
    grammar.showTrees()
def __init__(self, corpus):
    """Set up the learned grammar, the OOV handler, and empty CYK state."""
    # Learn grammar rules and lexicon from the training corpus.
    self.pcfg = PCFG(corpus)
    # The OOV module resolves words absent from the learned lexicon.
    self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags, self.pcfg.tokens)
    # CYK probability/backpointer tables are allocated lazily at parse time.
    self.proba_matrix = None
    self.cyk_matrix = None
def main():
    """Two-pass PCFG training: count on raw trees, rewrite rare words, re-count, write model.

    Fixes: every `open(...)` is now wrapped in a `with` block so file handles
    are closed deterministically (the originals were never closed).
    """
    train_data_filename = 'parse_train.dat'
    train_rare_filename = 'p1.train.rare.dat'
    pcfg_model_filename = 'parser_train.counts.out'

    # First pass: count rules and word frequencies to identify rare words.
    pcfg = PCFG()
    with open(train_data_filename) as f:
        for l in f:
            t = json.loads(l)
            pcfg.count(t)
    pcfg.count_word()

    # Rewrite the training data, replacing rare words per rare_words_rule_p1.
    with open(train_data_filename) as src, open(train_rare_filename, 'w') as dst:
        process_rare_words(src, dst, pcfg.rare_words, rare_words_rule_p1)

    # Second pass: re-count on the rewritten trees and fit rule parameters.
    new_pcfg = PCFG()
    with open(train_rare_filename) as f:
        for l in f:
            t = json.loads(l)
            new_pcfg.count(t)
    new_pcfg.cal_rule_params()
    with open(pcfg_model_filename, 'w') as out:
        new_pcfg.write(out)
def train(train_data_filename, train_rare_filename, pcfg_model_filename, rare_words_rule):
    """Train a PCFG in two passes and return the final model.

    Pass 1 counts rules/words to find rare words; the corpus is then rewritten
    with rare_words_rule applied; pass 2 re-counts and fits rule parameters,
    writing them to pcfg_model_filename.

    Fixes: file handles are closed via `with` (originals leaked); single-arg
    prints parenthesized — output is identical on Python 2 and also valid on
    Python 3.
    """
    print('train PCFG model')
    pcfg = PCFG()
    with open(train_data_filename) as f:
        for l in f:
            t = json.loads(l)
            pcfg.count(t)
    pcfg.count_word()

    print('process rare word')
    with open(train_data_filename) as src, open(train_rare_filename, 'w') as dst:
        process_rare_words(src, dst, pcfg.rare_words, rare_words_rule)

    print('train PCFG model again')
    new_pcfg = PCFG()
    with open(train_rare_filename) as f:
        for l in f:
            t = json.loads(l)
            new_pcfg.count(t)
    new_pcfg.cal_rule_params()
    with open(pcfg_model_filename, 'w') as out:
        new_pcfg.write(out)
    return new_pcfg
def build_model():
    """Load the cached PCFG model if present; otherwise learn one from the
    treebanks, hold out a sample of sentences for testing, and cache the model.

    Python 2 code (print statements). Returns the PCFG instance either way.
    """
    pcfg = PCFG()
    if exists(MODEL):
        # Fast path: reuse the previously saved model.
        pcfg.load_model(MODEL)
    else:
        print "Building the Grammar Model"
        start = time()
        if not exists(TEMP_DIR):
            makedirs(TEMP_DIR)
        # Normalise the treebanks (done once; the normalised files are cached on disk).
        if not exists(QUESTIONBANK_NORM):
            normalize_questionbank(QUESTIONBANK_DATA, QUESTIONBANK_PENN_DATA)
            gen_norm(QUESTIONBANK_NORM, [QUESTIONBANK_PENN_DATA])
        if not exists(PENNTREEBANK_NORM):
            gen_norm(PENNTREEBANK_NORM, glob(PENNTREEBANK_GLOB))
        # Keep a part of the treebanks for testing: roughly every 100th tree
        # whose sentence length is in (7, 20) goes to TEST_DAT/TEST_KEY; the
        # `i -= 1` retries the next tree when a candidate fails the length filter.
        # NOTE(review): as reconstructed, every tree (including held-out ones)
        # is also written to MODEL_TREEBANK — confirm this overlap is intended.
        # NOTE(review): `open(treebank)` is never closed — consider `with`.
        i = 0
        with open(MODEL_TREEBANK, 'w') as model, open(TEST_DAT, 'w') as dat, open(TEST_KEY, 'w') as key:
            for treebank in [QUESTIONBANK_NORM, PENNTREEBANK_NORM]:
                for tree in open(treebank):
                    i += 1
                    if (i % 100) == 0:
                        sentence, n = get_sentence(loads(tree))
                        if n > 7 and n < 20:
                            dat.write(sentence+'\n')
                            key.write(tree)
                        else:
                            i -= 1
                    model.write(tree)
        # Learn PCFG from the model treebank and cache it for next time.
        pcfg.learn_from_treebanks([MODEL_TREEBANK])
        pcfg.save_model(MODEL)
        print "Time: (%.2f)s\n" % (time() - start)
    return pcfg
def __init__(self, corpus_train):
    """Build the parser's id-indexed lookup tables from a training corpus.

    Sets up:
      - self.PCFG: grammar and lexicon learned from corpus_train
      - self.OOV: out-of-vocabulary word handler
      - self.symbol_to_id: symbol -> integer id
      - self.grammar_ids: grammar_ids[X][Y][Z] = P(rule X -> Y Z)
      - self.lexicon_inverted: word -> {tag: P(tag -> word)}
    """
    self.PCFG = PCFG(corpus_train)
    self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols,
                   self.PCFG.freq_tokens)

    # note : if the id of a symbol is above self.PCFG.nb_tags,
    # it's an artificial symbol introduced with Chomsky normalization
    self.symbol_to_id = {
        symbol: i
        for (i, symbol) in enumerate(self.PCFG.list_all_symbols)
    }

    # Instead of storing tags, we store grammar rules with their corresponding
    # indices, with an additional hierarchical level for speed:
    # self.grammar_ids[X][Y][Z] stores P(rule X->YZ), where self.grammar_ids,
    # self.grammar_ids[X] and self.grammar_ids[X][Y] are all dictionaries.
    self.grammar_ids = {}
    for (root_tag, rules) in self.PCFG.grammar.items():
        # root_tag is the left hand symbol of the grammar rule
        idx_root_tag = self.symbol_to_id[root_tag]
        dico = {}
        for (split, proba) in rules.items():
            # split is the right hand term, and proba the probability of the rule
            idx_left_tag = self.symbol_to_id[split[0]]
            idx_right_tag = self.symbol_to_id[split[1]]
            # setdefault replaces the original membership-test-then-insert
            # (`in dico.keys()`) with a single lookup.
            dico.setdefault(idx_left_tag, {})[idx_right_tag] = proba
        self.grammar_ids[idx_root_tag] = dico

    # For a given word, which are its tags with the corresponding
    # probabilities P(tag -> word)? This is what self.lexicon_inverted stores.
    self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
    for tag in self.PCFG.lexicon:
        for word, proba in self.PCFG.lexicon[tag].items():
            self.lexicon_inverted[word][tag] = proba
def run(args):
    """Train a PCFG on the treebank and run the actions selected by the CLI flags."""
    # Load data: treebank splits plus pretrained word embeddings for OOV handling.
    data = loader.load_treebanks(TREEBANK_PATH)
    train_data, dev_data, test_data = loader.train_test_split(data, 0.8, 0.1, 0.1)
    words, embeddings = loader.load_word_embeddings(EMBEDDING_PATH)

    # Fit the grammar and attach the OOV module.
    pcfg = PCFG(train_data)
    pcfg.train(train_data)
    pcfg.set_oov(OOV, words, embeddings)

    # Optional actions, each gated by its command-line flag.
    if args.generate_output:
        output = pcfg.generate_output(test_data)
    if args.evaluation:
        accs, nb_no_parse = pcfg.predict(test_data[:2])
    if args.parse:
        with open(args.txt_path, 'r') as f:
            corpus = f.read().split('\n')
        pcfg.parse_from_txt(corpus)
def run(args):
    """Drive the PCFG parser pipeline according to the CLI flags in *args*."""
    has_effect = False  # never set to True; preserved to mirror the original flow

    # Guard clause: nothing to do without arguments.
    if not args:
        if not has_effect:
            logger.error(
                "Script halted without any effect. To run code, use command:\npython3 main.py <args>"
            )
        return

    try:
        # Build the parser from the training corpus and attach OOV handling.
        train_corpus, val_corpus, test_corpus = data.get_train_val_test()
        words, embeddings = data.get_polyglot_words_embeddings()
        parser = PCFG()
        parser.learn_probabilities_and_rules(train_corpus)
        parser.set_oov_module(OovModule, words, embeddings)

        # Optional actions, each gated by its flag.
        if args.inference:
            get_gold(parser, test_corpus, filename='evaluation_data.gold')
            get_predictions(parser, test_corpus, filename='evaluation_data.parser_output')
        if args.evaluation:
            evaluation('evaluation_data.gold', 'evaluation_data.parser_output')
        if args.parse:
            parser.parse_from_txt(args.txt_path)
    except Exception as e:
        # Top-level boundary: log the full traceback and a friendly message.
        logger.exception(e)
        logger.error("Uhoh, the script halted with an error.")
def multi_f(sentence):
    """Worker: load the grammar named on the command line and parse one sentence.

    Each pool worker re-loads the model itself, so the PCFG object never has
    to be pickled across process boundaries.
    """
    model_path = sys.argv[1]
    grammar = PCFG()
    grammar.load_model(model_path)
    return Parser(grammar).parse(sentence)


def _main():
    # Usage check: exactly one argument, the grammar file.
    if len(sys.argv) != 2:
        print("usage: python3 parser.py GRAMMAR")
        exit()
    start = time()
    grammar_file = sys.argv[1]
    print("Loading grammar from " + grammar_file + " ...", file=stderr)
    grammar = PCFG()
    grammar.load_model(grammar_file)
    parser = Parser(grammar)
    print("Parsing sentences ...", file=stderr)
    # Parse stdin sentences in parallel, one worker per CPU.
    with Pool(processes=os.cpu_count()) as pool:
        trees = pool.map(multi_f, stdin.readlines())
    for t in trees:
        print(dumps(t))
    print("Time: (%.2f)s\n" % (time() - start), file=stderr)


if __name__ == "__main__":
    _main()
from pcfg import PCFG
import argparse

# Command-line interface: train a grammar on a treebank, then parse raw
# sentences and write the predictions to the output file.
parser = argparse.ArgumentParser()
for flag, description in (
    ("--corpus", "training treebank corpus"),
    ("--sentences", "raw token sentences"),
    ("--outfile", "name of the output file"),
):
    parser.add_argument(flag, help=description, type=str)
args = parser.parse_args()

grammar = PCFG(args.corpus)
grammar.parse_corpus()
grammar.predict(args.sentences, args.outfile)
sys.exit(1) # load the train file to trees trees = [] f = open(trainfilename, 'r') for line in f: trees.append(nltk.Tree.fromstring(line)) # preprocss the tree forms: ignore functional labels and binarize to CNF for tree in trees: # ignore_func_labels(tree) tree.chomsky_normal_form(horzMarkov=2) # tree.chomsky_normal_form() # learn PCFG lexicon, grammar, vocabulary, symbols = PCFG(trees) # print(grammar) # for OOV oovwords = OOV(embedfilename, vocabulary) # parse new sentences using CYK based on learned PCFG # parser = CYKSolver(lexicon, grammar, vocabulary, symbols, oovwords) # i = 0 for line in sys.stdin: # print('start parse') # print(line) # start = time.time() # if line == '\n': continue # cyksolver = CYK(line.split(), lexicon, grammar, vocabulary, symbols, embedfilename)
def multi_f(sentence):
    """Worker function: build a parser from the grammar named on the command line and parse one sentence."""
    model_path = sys.argv[1]
    grammar = PCFG()
    grammar.load_model(model_path)
    return Parser(grammar).parse(sentence)
# Python 2 fragment: either generate null lexicons from a language model
# ("generate") or fall through to the evaluation output path.
if args.fnc == "generate":
    print "### GENERATING WORDS ###"
    print "model:", args.model, args.n
    print "language:", args.lang
    # Pick the language model implementation from --model.
    if args.model == "nphone":
        lm = NgramModel(args.n, corpus, 1)
    elif args.model == "nsyll":
        # The nsyll model requires a syllabified lexicon file.
        if args.lex.startswith("celexes/syll"):
            lm = NsyllModel(args.n, corpus, 1)
        else:
            print "Use syll__ file for this model"
            sys.exit()
    elif args.model == "pcfg":
        if args.lex.startswith("celexes/pcfg"):
            # Run the external PCFG inducer; its output grammar is then loaded.
            # NOTE(review): the stdout file handle is never closed — consider `with`.
            print call(["./pcfg/io","-d","1","-g", args.grammar, args.lex],stdout = open('grammars/gram_pcfg.wlt', 'w'))
            lm = PCFG('grammars/gram_pcfg.wlt')
            # PCFG input is space-free: strip spaces from every corpus entry.
            corpus = [re.sub(" ","",x) for x in corpus]
        else:
            print "Use pcfg__ file for this model"
            sys.exit()
    lm.create_model(corpus, args.smoothing)
    # Output filename encodes every generation parameter.
    o = "Lexicons/lex_" + args.lex.split("/")[-1][:-4] + "_cv" + str(args.cv) + "_iter" + str(args.iter) + "_m" + args.model + "_n" + str(args.n) + "_smoothing" + str(args.smoothing) + ".txt"
    # NOTE(review): `args.h**o` looks like a mangled attribute name (possibly
    # `args.homo`); as written it parses as `args.h ** o` and would raise a
    # TypeError at runtime — confirm against the argparse setup.
    lexfile = write_lex_file(o, corpus, args.cv, args.iter, lm, args.h**o)
    print "null lexicons wrote on", lexfile
    print "### WRITING RESULTS ###"
    write_all(lexfile, args.graph, args.lang)
else:
    # Evaluation path: output filename mirrors the generation naming scheme.
    # NOTE(review): this branch is truncated in the visible source.
    o = "evaluation/eval_" + args.lex.split("/")[-1][:-4] + "_cv" + str(args.cv) + "_iter" + str(args.iter) + "_m" + args.model + "_n" + str(args.n) + "_smoothing" + str(args.smoothing)+ ".txt"
    out = open(o, 'w')
# !/usr/bin/env python3 # -*- coding: utf-8 -*- # -------------------------------------------# # main.py # # author: sean lee # # qq: 929325776 # # email: [email protected] # #--------------------------------------------# from pcfg import PCFG parser = PCFG() parser.fit('./corpus/toy/train.txt') parser.parse("the man saw the dog") ''' print(parser.N_dict) print(parser.NR_dict) print(parser.TR_dict) '''