Beispiel #1
0
def CKY_parser():
    '''
    Given the PCFG, use the built-in CKY parser (ViterbiParser)
    to print each sentence's most probable parse.

    Sentences containing words outside the grammar's coverage are
    skipped and counted.
    '''
    PCFG_grammar = make_PCFG_grammar()
    # ViterbiParser implements the probabilistic CKY algorithm
    parser = ViterbiParser(PCFG_grammar)

    # Sample file to parse
    sentences = treebank.parsed_sents('wsj_1964.mrg')

    skipped_sentences = 0

    # Print out the full parse for every covered sentence
    for sentence in sentences:
        tokens = sentence.leaves()
        try:
            # check_coverage raises ValueError on out-of-vocabulary tokens;
            # catch only that instead of a bare `except:` that hides bugs.
            PCFG_grammar.check_coverage(tokens)
        except ValueError:
            skipped_sentences += 1
            continue
        for parse in parser.parse(tokens):
            print(parse)

    print("Total skipped sentences:", skipped_sentences)
Beispiel #2
0
def evaluate_sentence(sentence: str, grammar: PCFG):
    """
    POS-tag *sentence* and run the Viterbi parser over every growing
    prefix of the tag sequence, printing each prefix and the
    probabilities of its parses.

    :param sentence: whitespace-separated raw sentence
    :param grammar: induced PCFG over POS tags
    """
    # BUG FIX: the annotation was `string` (the module, if imported at all),
    # not the builtin `str` type.
    tokens = sentence.split()
    print(tokens, flush=True)
    # Keep only the tags; the grammar is defined over POS tags, not words
    tags = [tag for _word, tag in pos_tag(tokens)]
    print(tags, flush=True)
    parser = ViterbiParser(grammar, trace=0)
    # accumulate() yields ever-longer space-joined prefixes of the tags
    for prefix in accumulate(tags, lambda total, token: total + ' ' + token):
        prefix_tokens = prefix.split()
        print(prefix_tokens)
        # parser.parse() is already iterable; no list() needed
        print([tree.prob() for tree in parser.parse(prefix_tokens)], flush=True)
Beispiel #3
0
def main():
    """
    Read a sentence from the user, POS-tag it with a (cached) tagger,
    parse it with a (cached) PCFG, print the result, and append the
    sentence/tokens/tags to the CSV at ``data_file_path``.
    """
    data = pd.read_csv(data_file_path)
    data = data.drop(columns=["Unnamed: 0"])

    (sentence, sentence_tokens) = readsentence()  # take input from user and save text, tokenized text

    if os.path.exists('mytagger.pkl'):
        # reuse a previously saved tagger
        # (use `with` so the handle is closed even on error, and avoid
        # shadowing the builtin `input` as the original did)
        with open('mytagger.pkl', 'rb') as infile:
            mytagger = load(infile)
    else:
        # no such tagger is found so train/save it
        mytagger = traintagger()
        with open('mytagger.pkl', 'wb') as outfile:
            dump(mytagger, outfile, -1)
    tagged_tokens = mytagger.tag(sentence_tokens)
    print(tagged_tokens)

    if os.path.exists('mypcfg.pickle'):
        # reuse a previously saved PCFG
        with open('mypcfg.pickle', 'rb') as infile:
            mypcfg = load(infile)
    else:
        # no such PCFG exists, so induce/save it
        mypcfg = buildpcfg()
        with open('mypcfg.pickle', 'wb') as outfile:
            dump(mypcfg, outfile)
    try:
        tree = sequence_matching(tagged_tokens)
        print("Sequence matching was used")
    except Exception:
        # fall back to full parsing when sequence matching fails
        parser = ViterbiParser(mypcfg)
        tree = parser.parse(tagged_tokens)
        print("Vitberi parser was used")
    finally:
        if not isinstance(tree, Tree):
            # parser.parse() returns an iterator of trees; show the best one
            Tree.pretty_print(next(tree))  # do something to print it out, or print error message if input couldn't be parsed
        else:
            print(tree)

    df2 = {'sentence': sentence, 'sentence tokens': sentence_tokens, 'tagged tokens': tagged_tokens}
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    data = pd.concat([data, pd.DataFrame([df2])], ignore_index=True)
    data.to_csv(data_file_path)

    print("Previous data:")
    print(data)
Beispiel #4
0
    def parse_sentence(self, sent):
        """
        Parse *sent* using the induced grammar and write every parse
        (plus a tree visualization) to pcfg.txt.

        :param sent: list of word tokens to parse
        :return: None. Save parsing results to pcfg.txt
        :raises ValueError: if no PCFG has been induced yet
        """
        from contextlib import redirect_stdout

        if self.grammar is None:
            raise ValueError("PCFG hasn't been induced yet.")
        # other parser option(s): e.g., parser = pchart.InsideChartParser(self.grammar)
        parser = ViterbiParser(self.grammar)
        parser.trace(3)

        # http://www.nltk.org/api/nltk.parse.html
        # The original reassigned sys.stdout to an open file and never
        # restored or closed it; redirect_stdout does both safely.
        with open('pcfg.txt', 'w') as out, redirect_stdout(out):
            for parse in parser.parse(sent):
                print(parse)
                # visualize the tree (draw() opens a window and returns None)
                print(parse.draw())
Beispiel #5
0
def perplexity():
    '''
    Given the PCFG and the parser used, run the parser on
    the rest of the treebank and calculate the perplexity
    of the model over the successfully parsed sentences.

    Sentences with out-of-vocabulary words are skipped and counted.
    '''
    PCFG_grammar = make_PCFG_grammar()
    parser = ViterbiParser(PCFG_grammar)
    all_p = []
    skipped_sentence = 0

    for item in treebank.fileids()[1964:]:
        for tree in treebank.parsed_sents(item):
            tokens = tree.leaves()
            try:
                # check_coverage raises ValueError on unknown tokens
                PCFG_grammar.check_coverage(tokens)
            except ValueError:
                skipped_sentence += 1
                continue
            for parse in parser.parse(tokens):
                # Read the probability directly from the ProbabilisticTree
                # instead of regex-scraping it out of str(parse).
                all_p.append(float(parse.prob()))

    N = len(all_p)
    if N == 0:
        # Avoid ZeroDivisionError when nothing parsed
        print("No sentences parsed; perplexity is undefined.")
        print("Skipped sentences:", skipped_sentence)
        return

    # perplexity = (prod 1/p_i) ** (1/N)
    perplexity = 1.0
    for p in all_p:
        perplexity = perplexity * (1 / p)
    perplexity = pow(perplexity, 1 / float(N))

    print("Perplexity:", perplexity)
    print("All parse probabilities:", all_p)
    print("Skipped sentences:", skipped_sentence)
    print("PCFG grammar:", PCFG_grammar)
Beispiel #6
0
def do_cky(grammar):
    """
    Run the Viterbi (CKY) parser over the global ``test`` treebank
    sentences and report labeled precision, labeled recall, F1 and
    tagging accuracy, averaged over the evaluated sentences.

    Relies on the globals ``test`` (gold trees) and ``posx`` (visit
    counter used by ``make_tree_evaluation``).
    """
    global test
    global posx
    
    viterbi = ViterbiParser(grammar) # initialize the parser with the (PCFG) grammar
    resultados = []
    for t in test[:1]: # for each sentence of the test set
        try:
            sent = t.leaves() # extract the sentence tokens

            if len(sent) <= 18: # keep only sentences with up to 18 tokens (punctuation included)
                ssent = []
                for s in sent: # check grammar coverage for each word
                    try:
                        grammar.check_coverage([s])
                        ssent.append(s)
                    except ValueError: # unknown words are replaced by "UNK"
                        ssent.append("UNK")

                saida = []
                for i in viterbi.parse(ssent): # run the parser on the test sentence
                    saida.append(i)

                # lists for the two evaluation tracks: constituents (non-terminals spanning
                # the terminals) and the non-terminals that directly derive the words (tags)
                list_eval_val = []
                list_eval_test = []
                list_tag_val = []
                list_tag_test = []
                
                posx = 0
                make_tree_evaluation(saida[0][0],list_eval_test,list_tag_test,0) # evaluate the parser output
                posx = 0
                make_tree_evaluation(t,list_eval_val,list_tag_val, 0) # evaluate the gold tree from the test set
                
                # sort by visitation order
                list_eval_test.sort(key=lambda tup: tup[3])
                list_eval_val.sort(key=lambda tup: tup[3])

                # number of correct constituents
                acertos = len(set(list_eval_test).intersection(set(list_eval_val)))
                # labeled precision
                lp = acertos/len(list_eval_test)
                # labeled recall
                lr = acertos/len(list_eval_val)
                # f1
                f1 = 0
                if lp > 0 and lr > 0:
                    f1 = 2*lp*lr/(lp+lr)
                # tagging accuracy
                ta = 0
                ta = len([i for i, j in zip(list_tag_test, list_tag_val) if i == j])
                ta /= len(list_tag_val)
                
                # store the result for this sentence
                r = {'lp':lp, 'lr': lr, 'f1':f1, 'ta':ta}
                resultados.append(r)
            else:
                print("Sentença com mais de 18 palavras.")
        except Exception:
            print("Árvore mal formada.")

    # compute the mean of each metric
    media_lp = sum(item['lp'] for item in resultados)/len(resultados)
    media_lr = sum(item['lr'] for item in resultados)/len(resultados)
    media_f1 = sum(item['f1'] for item in resultados)/len(resultados)
    media_ta = sum(item['ta'] for item in resultados)/len(resultados)
    print("media_lp",media_lp,"media_lr",media_lr,"media_f1",media_f1,"media_ta",media_ta)
Beispiel #7
0
    for production in t.productions():
        tbank_productions.append(production)

print tbank_productions[2]

# Automatically build the grammar (mainly computing the rule probabilities)
# from the list of production rules tbank_productions
tbank_grammar = induce_pcfg(Nonterminal('S'), tbank_productions)

print tbank_grammar

# PARSING
parser = ViterbiParser(tbank_grammar)
s = time.time()
# parse the second raw test sentence
for t in parser.parse(raw_test_set[1]):
    print(t)

# measure the parsing time
s = time.time() - s

# gold standard from the second dataset entry
print test_set[1]
'''Tugas anda adalah membangun fungsi untuk mengukur akurasi dari parser 
yang telah dibangun. Akurasi terdiri dari 2 macam, yaitu exact match, 
dan partial match (rata-rata recall dan precision). Cari sendiri bagaimana
menghitung recall dan precision untuk parsing dari referensi yang valid.

Setelah dibangun fungsi untuk menghitung 2 jenis akurasi tersebut, 
lakukan skenario di bawah ini :
Beispiel #8
0
from nltk import induce_pcfg
from nltk.parse import pchart
from nltk.parse import ViterbiParser
from nltk.treetransforms import *
from nltk import *

productions = []

# Induce the grammar from (roughly) the first 90% of each file's trees.
# BUG FIX: the original computed the split point as int(len(item)*0.9),
# i.e. from the length of the *filename*, not the number of trees.
for item in treebank._fileids:
    trees = treebank.parsed_sents(item)
    split = int(len(trees) * 0.9)
    for tree in trees[:split]:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)
        productions += tree.productions()

S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
parser = pchart.InsideChartParser(grammar)
# BUG FIX: `grammar2` was undefined (NameError); both parsers share `grammar`.
parserv = ViterbiParser(grammar)


# Evaluate both parsers on the remaining ~10% of each file's trees
for item in treebank._fileids:
    trees = treebank.parsed_sents(item)
    start = int(len(trees) * 0.9)
    for tree in trees[start:]:
        sent = tree.leaves()
        print(tree.pos())
        for parse in parser.parse(sent):
            print(parse)
        for parse in parserv.parse(sent):
            print(parse)
Beispiel #9
0
# Earley chart parsing: print every parse of `tokens`
parser = nltk.parse.EarleyChartParser(grammar)
for t in parser.parse(tokens):
    print(t)

# In[ ]:

### CYK (Viterbi) parser gets the most probable parse
from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
parser.trace(3)
# NOTE(review): parse_all() already returns a list, so list() here is redundant
parsed_sent = list(parser.parse_all(tokens))
parsed_sent[0].draw()  # opens a GUI window with the best tree
for t in parsed_sent:
    print(t)

# In[ ]:

### CYK (Viterbi) parser gets the most probable parse
from nltk.parse import ViterbiParser

parser = ViterbiParser(grammar)
parser.trace(3)
parsed_sent = list(parser.parse(tokens))  # parse() yields lazily; materialize so we can index
parsed_sent[0].draw()  # opens a GUI window with the best tree
for t in parsed_sent:
    print(t)

# In[ ]: