Example #1
def pcfg(train_idx=None, smoothing=None):
    """
    productions = []
    item = treebank._fileids[0]
    print("ITEM\n\n",item,"\n\n")
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS = False)
        tree.chomsky_normal_form(horzMarkov = 2)
        productions += tree.productions()
"""
    if train_idx is None:
        train_idx = (len(treebank.fileids()) * 3) // 4
    productions = []
    for item in treebank.fileids()[0:train_idx]:
        for tree in treebank.parsed_sents(item):
            tree.collapse_unary(
                collapsePOS=False)  # Collapse unary chains, e.g. A->B->C into A+B->C
            tree.chomsky_normal_form(
                horzMarkov=2
            )  # Convert to Chomsky normal form, i.e. A->(B,C,D) into A->(B,E), E->(C,D)
            productions += tree.productions()

    S = Nonterminal('S')
    if smoothing is None:
        grammar = learn_pcfg(S, productions)
    elif smoothing == 'L1':
        grammar = smoothing_pcfg(S, productions)
    else:
        raise ValueError("Unsupported smoothing option: %r" % smoothing)

    with open('grammar.pkl', 'wb') as f:
        pickle.dump(grammar, f)

    return grammar
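A brief usage sketch (not part of the original snippet): load the grammar that pcfg() pickled to grammar.pkl above and parse a short sentence with NLTK's Viterbi CKY parser. The sentence is an arbitrary example.

import pickle

from nltk.parse import ViterbiParser

# Load the grammar written by pcfg() above.
with open('grammar.pkl', 'rb') as f:
    grammar = pickle.load(f)

parser = ViterbiParser(grammar)
tokens = "the board will join".split()
try:
    grammar.check_coverage(tokens)  # raises ValueError for words outside the lexicon
    for tree in parser.parse(tokens):
        print(tree.prob())
        print(tree)
except ValueError as err:
    print("Sentence contains uncovered words:", err)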
Example #2
def main():
    # print(nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0])
    # nltk.corpus.treebank.parsed_sents('wsj_0001.mrg')[0].draw()

    # print("Induce PCFG grammar from treebank data:")
    #
    productions = []
    print(len(treebank.fileids()))
    for item in treebank.fileids():  # Goes through all trees
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
    # #
    # # print(type(productions[0]))
    # #
    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # # # print(grammar)    # This is a PCFG
    # pickle.dump(grammar, open("tbank-grammar.p", "wb"))
    # t = time.time()
    # grammar = pickle.load(open("tbank-grammar.p", "rb"))
    # textf = open("lexicon.txt", "w")
    # n = textf.write(str(reduce(lambda a, b: a + "\n" + b, list(filter(lambda x: "'" in x, str(grammar).split("\n"))))))
    # textf.close()
    # print(time.time()-t)
    parser = ViterbiParser(grammar)
    # pickle.dump(parser, open("cky-parser.p", "wb"))
    # parser = pickle.load(open("cky-parser.p", "rb"))
    parser.trace(0)
    sent = "John will join the board"
    tokens = sent.split()

    try:
        grammar.check_coverage(tokens)
        print("All words covered")
        parses = parser.parse_all(tokens)
        if parses:
            lp = len(parses)
            print(lp)
            print(parses[0].label())
            # parses[0].draw()
            p = reduce(lambda a,b:a+b.prob(), list(filter(lambda x: x.label() == 'S', parses)), 0.0)
        else:
            p = 0

        print("Probability:", p)
    except ValueError:
        print("Some words not covered")
Example #3
def sentences():
    for f in treebank.fileids():
        for t in treebank.parsed_sents(f):
            t.chomsky_normal_form(horzMarkov=1)
            t.collapse_unary(collapsePOS=True)

            yield (t, t.leaves())
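A short consumption sketch (added for illustration): the generator yields (CNF tree, token list) pairs, which can be materialized and split for training and evaluation. The 90/10 split is an arbitrary choice.

data = list(sentences())
split = int(len(data) * 0.9)
train, test = data[:split], data[split:]

for tree, leaves in train[:3]:
    print(len(leaves), "tokens:", " ".join(leaves))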
Example #4
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
Example #5
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk import stem
    from nltk.corpus import treebank

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = " ".join(stemmed)
    results = re.sub(r"(.{,70})\s", r"\1\n", results + " ").rstrip()

    # Convert the original to a string, and word wrap it.
    original = " ".join(orig)
    original = re.sub(r"(.{,70})\s", r"\1\n", original + " ").rstrip()

    # Print the results.
    print("-Original-".center(70).replace(" ", "*").replace("-", " "))
    print(original)
    print("-Results-".center(70).replace(" ", "*").replace("-", " "))
    print(results)
    print("*" * 70)
Example #6
def test_GrammarParser():
    import nltk
    from nltk.corpus import treebank
    grammar = r"""NP:
    {<DT>*(<NN>|<NNP>|<NNS>)+}          # Chunk everything
    }<VBD|IN>+{      # Chink sequences of VBD and IN
    """
    #     tree=treebank.parsed_sents('wsj_0001.mrg')[0]
    #     print tree
    grammar_VP = r"""VP:
    {<VBZ><VP>}
    """
    #     tree=nltk.RegexpParser(grammar).parse(treebank.parsed_sents('wsj_0001.mrg')[0].pos())
    #     print tree
    fileids = treebank.fileids()

    #     for fileld in fileids:
    for i in range(len(fileids)):
        if i > 10:
            break
        #         trees=treebank.parsed_sents(fileld)
        trees = treebank.parsed_sents(fileids[i])
        for tree in trees:
            tree_Gram = nltk.RegexpParser(grammar).parse(tree)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "VP":
                    print(subtree)
Example #7
def gen_corpus(path, threshold):
    """
    src: http://www.nltk.org/_modules/nltk/tree.html
    corpora from wsj_0001.mrg to wsj_0199.mrg
    e.g.: t = treebank.parsed_sents('wsj_0001.mrg')[0]
    to visualize a tree: t.draw()
    :param path: save to path
    :param threshold: minimum length of a sentence to keep
    :return: none
    """
    boundaries = []
    sentences = []
    for t in treebank.parsed_sents(treebank.fileids()):
        flat = _flatten_tree(t, threshold)
        if flat:
            boundaries.append(flat)
            sentence = ' '.join(t.leaves()).translate(PUNC_TRANS).lower()
            sentence = re.sub(r' +', ' ', sentence)
            # replace digit(s) as 'x'(s)
            sentences.append(re.sub(r'\d', 'x', sentence).strip())
    _check_length_match(boundaries, sentences)
    with open(path + "/boundaries.txt", 'w') as f:
        f.write('1'.join(boundaries))
    with open(path + "/sentences.txt", 'w') as f:
        f.write(' '.join(sentences))
Example #8
def trial_run_with_treebank():
    files = treebank.fileids()
    files = files[:100] # make shorter for setup
    gram = makeGrammarFromTreebank(files)
    myparser = cky_parser.ckyparser(gram,Nonterminal('S'))
    chart,mytrees = myparser.probabilistic_parse_from_sent("I saw John with my telescope")
    print(mytrees)
Example #9
    def load_data(self, percentage):
        print("Started Loading the Data")
        # Get the complete data
        data_set = treebank.fileids()
        # Partition the data into train and test data sets
        training_data_fileIds = [file for file in data_set if "wsj_00" in str(file)]
        testing_data_fileIds = [file for file in data_set if "wsj_01" in str(file)]

        # How much percentage of files consider for training?
        index = int(percentage*len(training_data_fileIds))
        training_data_fileIds = training_data_fileIds[:index]

        tagged_training_data = treebank.tagged_sents(fileids=training_data_fileIds)
        tagged_testing_data = treebank.tagged_sents(fileids=testing_data_fileIds)

        tagged_training_words = treebank.tagged_words(fileids=training_data_fileIds)
        tagged_testing_words = treebank.tagged_words(fileids=testing_data_fileIds)

        # print(len(tagged_training_data1), len(tagged_testing_data1))

        # UnTag the data for other uses
        untagged_training_data = [untag(item) for item in tagged_training_data]
        untagged_testing_data = [untag(item) for item in tagged_testing_data]

        print("Data Loaded Successfully. Stats are")
        print("Training Data Sentences: ", len(tagged_training_data))
        print("Testing Data  Sentences: ", len(tagged_testing_data))

        return tagged_training_data, tagged_testing_data, tagged_training_words, tagged_testing_words, untagged_training_data, untagged_testing_data
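A hedged usage sketch of the split returned above: train a unigram/bigram backoff tagger on the tagged training sentences. The enclosing class is not shown in the snippet, so `loader` below is a hypothetical instance of it.

from nltk.tag import UnigramTagger, BigramTagger

# 'loader' is a hypothetical instance of the (unshown) class defining load_data().
(train_sents, test_sents,
 train_words, test_words,
 untagged_train, untagged_test) = loader.load_data(percentage=1.0)

unigram = UnigramTagger(train_sents)
bigram = BigramTagger(train_sents, backoff=unigram)
# Newer NLTK versions expose .accuracy() instead of .evaluate().
print("Tagging accuracy:", bigram.evaluate(test_sents))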
Example #10
def sentences():
    for f in treebank.fileids():
        for t in treebank.parsed_sents(f):
            t.chomsky_normal_form(horzMarkov=1)
            t.collapse_unary(collapsePOS=True)

            yield (t, t.leaves())
Example #11
def demo():
    """
    A demonstration of the porter stemmer on a sample from
    the Penn Treebank corpus.
    """

    from nltk.corpus import treebank
    from nltk import stem

    stemmer = stem.PorterStemmer()

    orig = []
    stemmed = []
    for item in treebank.fileids()[:3]:
        for (word, tag) in treebank.tagged_words(item):
            orig.append(word)
            stemmed.append(stemmer.stem(word))

    # Convert the results to a string, and word-wrap them.
    results = ' '.join(stemmed)
    results = re.sub(r"(.{,70})\s", r'\1\n', results + ' ').rstrip()

    # Convert the original to a string, and word wrap it.
    original = ' '.join(orig)
    original = re.sub(r"(.{,70})\s", r'\1\n', original + ' ').rstrip()

    # Print the results.
    print('-Original-'.center(70).replace(' ', '*').replace('-', ' '))
    print(original)
    print('-Results-'.center(70).replace(' ', '*').replace('-', ' '))
    print(results)
    print('*' * 70)
Example #12
def read_split_treebank():
    fileids = treebank.fileids()
    seed.shuffle(fileids)
    split = int(len(fileids) * 0.8)
    x_train = fileids[:split]
    x_test = fileids[split:]
    print('Train - Test: ', len(x_train), len(x_test))
    return x_train, x_test
Example #13
def read_treebank(input_vocab_size=10000, output_vocab_size=10000, seq_len=10):
    all_sents = []
    for fname in treebank.fileids():
        sents = treebank.sents(fname)
        if sents:
            all_sents.extend(sents)

    return read_dataset(all_sents, input_vocab_size, output_vocab_size,
                        seq_len)
Example #14
    def _induce_grammar(self):
        self.productions = []
        for tree in treebank.parsed_sents(treebank.fileids()):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(
                collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(
                horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            self.productions += tree.productions()
Example #15
def read_data():
    treebank_tagged_sents = list(
        chain(*[[tree.pos() for tree in treebank.parsed_sents(pf)] for pf in treebank.fileids()]))

    words_list = [[tag[0] for tag in sent] for sent in treebank_tagged_sents]
    labels = [[tag[1] for tag in sent] for sent in treebank_tagged_sents]

    words = []
    max_words = 0
    for sent in words_list:
        words.extend(sent)
        max_words = max(max_words, len(sent))

    print("Max. Words:", max_words)

    seq_length = 100

    print("Seq. Length:", seq_length)

    words = list(set(words))

    print("Number of Words:", len(words))

    unique_labels = []
    for sent in labels:
        unique_labels.extend(sent)

    unique_labels = list(set(unique_labels))

    print("Number of Unique Labels:", len(unique_labels))

    word2id = {word: i + 1 for i, word in enumerate(words)}
    id2word = {i + 1: word for i, word in enumerate(words)}

    X_data = []
    Y_data = []

    for i in range(len(treebank_tagged_sents)):
        for j in range(len(words_list[i])):
            _x = [0] * max_words

            for k in range(j + 1):
                _x[j - k] = word2id[words_list[i][k]]

            _x = _x[:seq_length]
            _x.reverse()

            X_data.append(_x)
            Y_data.append(one_hot(labels[i][j], unique_labels))

    X_data = np.array(X_data, dtype=np.int32)
    Y_data = np.array(Y_data, dtype=np.float32)

    print(X_data.shape)
    print(Y_data.shape)

    return X_data, Y_data, unique_labels, words, word2id, id2word
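read_data() calls a one_hot helper that is not shown in the snippet; a plausible definition, assuming it returns a one-hot vector over unique_labels, would be:

import numpy as np

def one_hot(label, unique_labels):
    # One-hot vector with a 1 at the position of `label` in `unique_labels`.
    vec = np.zeros(len(unique_labels), dtype=np.float32)
    vec[unique_labels.index(label)] = 1.0
    return vec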
Example #16
def get_training_and_test_split(config):
    files = treebank.fileids()
    random.shuffle(files)
    training = files[:int(0.9 * len(files))]
    test = files[int(0.9 * len(files)):]

    with open(config.train_set, 'w') as file:
        file.write(",".join(training))

    with open(config.test_set, 'w') as file:
        file.write(",".join(test))
Example #17
def getGrammar():

    fileid = treebank.fileids()
    trainfiles = fileid[:160]
    #testfiles=fileid[0.8*len(fileid):]

    productions = []
    for item in trainfiles:
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(
                collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(
                horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()

    lhs_prod = [p.lhs() for p in productions]
    rhs_prod = [p.rhs() for p in productions]
    set_prod = set(productions)

    list_prod = list(set_prod)

    token_rule = []
    for ele in list_prod:
        if ele.is_lexical():
            token_rule.append(ele)

    set_token_rule = set(p.lhs() for p in token_rule)
    list_token_rule = list(set_token_rule)
    corr_list_token_rule = []
    for word in list_token_rule:
        if str(word).isalpha():
            corr_list_token_rule.append(word)
            continue
    #print(corr_list_token_rule)

    import nltk
    a = []
    for tok in corr_list_token_rule:
        #lhs = nltk.grammar.Nonterminal('UNK')
        lhs = 'UNK'
        rhs = [u'UNK']
        UNK_production = nltk.grammar.Production(lhs, rhs)
        lhs2 = nltk.grammar.Nonterminal(str(tok))
        a.append(nltk.grammar.Production(lhs2, [lhs]))

    token_rule.extend(a)

    list_prod.extend(a)

    S = Nonterminal('S')
    grammar = induce_pcfg(S, list_prod)
    return grammar
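Because the grammar above adds POS -> 'UNK' lexical rules, out-of-vocabulary words have to be mapped to the 'UNK' terminal before parsing. A hedged usage sketch (the sentence and the helper name are illustrative only):

from nltk.parse import ViterbiParser

def replace_unknown_tokens(tokens, grammar):
    # Map tokens the grammar cannot cover to the 'UNK' terminal added in getGrammar().
    known = []
    for tok in tokens:
        try:
            grammar.check_coverage([tok])
            known.append(tok)
        except ValueError:
            known.append('UNK')
    return known

grammar = getGrammar()
parser = ViterbiParser(grammar)
tokens = replace_unknown_tokens("Pierre will join the board".split(), grammar)
for tree in parser.parse(tokens):
    print(tree)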
Example #18
def ptb_wsj_set():
    trainsize = 3500
    testsize = 300

    ts = list(
        itertools.chain(*(treebank.parsed_sents(fid)
                          for fid in treebank.fileids())))[:trainsize +
                                                           testsize]
    testsents = [" ".join(s.leaves()) for s in ts[trainsize:]]

    # with open("/tmp/trees.json", 'w') as f:
    # 	lins = []
    # 	for t in ts[:trainsize]:
    # 		def lin(t):
    # 			if isinstance(t, str):
    # 				label = t
    # 				cs = []
    # 			else:
    # 				label = t.label()
    # 				cs = t
    #
    # 			if '"' in label:
    # 				raise Exception("»\"« occuring in data! :O")
    #
    # 			return "{\"label\": \"" + label + "\", \"children\": [" + ",".join([lin(c) for c in cs]) + "]}"
    # 		lins.append(lin(t))
    # 	print("[", ",\n".join(lins), "]", file = f)
    #
    # with open("/tmp/test.txt", 'w') as f:
    # 	for s in testsents:
    # 		print(s, file = f)

    ts = ts[:trainsize]

    def parse_treebank_grammar(treat_nt):
        def getrules(t):
            lhs = treat_nt(t.label())
            rhs = []
            for x in t:
                if isinstance(x, str):
                    rhs.append(x)
                else:
                    rhs.append(treat_nt(x.label()))
                    for r in getrules(x):
                        yield r
            yield (lhs, tuple(rhs))

        return normalize_rules(itertools.chain(*(getrules(t) for t in ts)))

    rules, ntdict = intify_prules(parse_treebank_grammar)
    #test = ["the old man", "something was .", "something was at the stock market today .", "Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 ."]
    return (rules, ntdict), testsents
Example #19
def greedy_backward_selection():
    extractor = funtag_features.extract_features
    # funtag_features.availbale_features = []

    best_f_score = 0.0
    all_features = ['label','head_pos','head','yeild','alt_head','alt_pos', 'parent_labels']
        # , 'grandmother_label','sister_labels','sister_poss','sister_head']
    best_features = list(all_features)
    
    td_files, test_files = train_test_split(treebank.fileids(),
                                            train_size=0.8,
                                            random_state=0)
    train_files, dev_files = train_test_split(td_files,
                                              train_size=0.8,
                                              random_state=0)
    while True:
        f_score_feature = []
        for f in all_features:
            # funtag_features.availbale_features.append(f)
            if f in best_features:
                best_features.remove(f)

            X_train, Y_train = read_treebank_files(train_files, extractor, best_features)
            classifier = train_scikit_classifier(X_train, Y_train)
            X_eval, Y_eval = read_treebank_files(dev_files, extractor, best_features)
            Y_out = classifier.predict(X_eval)
            print('Done training and evaluating with features:', best_features)
            stats = {'*ALL*': PRF()}
            all_tags = sorted(set(t for ts in (Y_eval + list(Y_out))
                      for t in to_taglist(ts)))
            for tag in all_tags:
                stats[tag] = PRF()
            compute_statistics(Y_eval, Y_out, stats)
            f_score_feature.append((stats['*ALL*'].f1(),f))
            best_features.append(f)
        if best_f_score < max(f_score_feature)[0]:
            best_f_score = max(f_score_feature)[0]
            best_features.remove(max(f_score_feature)[1])
            all_features.remove(max(f_score_feature)[1])
        else:
            break
        if len(all_features) == 0:
            break
    print('Done with greedy_backward_selection..... ')
    print('Features are: ', best_features)
    print('Overall statistics:')
    print('     #corr #guess  #true precision  recall  F-score')
    print('      {0: >4}   {1: >4}   {2: >4}   {3:0.4f}   {4:0.4f}   {5:0.4f}'.format(*stats['*ALL*'].all()))
    print('Statistics for each function tag:')
    print('tag  #corr #guess  #true precision  recall  F-score')
    for label in all_tags:
        print('{0}   {1: >4}   {2: >4}   {3: >4}   {4:0.4f}   {5:0.4f}   {6:0.4f}'.format(label, *stats[label].all()))
Example #20
def get_trees_sentences():
    trees = []
    sentences = []
    for file in treebank.fileids():
        for tree in treebank.parsed_sents(file):
            tree_str = str(tree)
            trees.append(tree_str)
        for sentence in treebank.sents(file):
            sentences.append(" ".join(sentence))
    assert len(trees) == len(sentences)
    sentences = list(map(lambda x: x.lower(), sentences))
    return (trees, sentences)
Example #21
def test_Phrase():
    import nltk
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    grammar = r"""
    ADVP:{<RB>(<CC>*<RB>*|<JJ>*)}
    {}
    """
    for fileld in fileids:
        sents = treebank.tagged_sents(fileld)
        for sent in sents:
            tree_Gram = nltk.RegexpParser(grammar).parse(sent)
            for subtree in tree_Gram.subtrees():
                if subtree.label() == "ADVP":
                    print(subtree)
Example #22
def train():
    files = tb.fileids()
    data = list(tb.parsed_sents(files))

    # 80:20 split
    split = int(len(data) * 0.8)
    train_data = data[:split]
    test_data = data[split:]

    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(
        train_data)

    total_precision = 0
    total_recall = 0
    total_f1_score = 0
    i = 0
    for test in test_data:
        print('Test', i)
        i += 1
        try:
            words = test.leaves()
            scores, backs = cky_parsing(words, copy(P_grammar),
                                        copy(P_non_terms), copy(P_vocab),
                                        copy(P_term_parents),
                                        copy(P_parents_count))
            start = Tree(Nonterminal('S'), [])
            if scores[0][len(words)][Nonterminal('S')] == 0:
                start = get_start(scores, len(words))
            predicted_tree = build_tree(start, 0, len(words), backs,
                                        P_non_terms)
            clean_tree(predicted_tree)
            predicted_tree.un_chomsky_normal_form()
            precision, recall, f1_score = evaluate(words, predicted_tree, test)
            print(precision, recall, f1_score)
            total_precision += precision
            total_recall += recall
            total_f1_score += f1_score
        except Exception:
            print('***************Failed', i - 1)
            continue

    total_precision /= len(test_data)
    total_recall /= len(test_data)
    total_f1_score /= len(test_data)

    print('Precision', total_precision)
    print('Recall', total_recall)
    print('F1_score', total_f1_score)
Example #23
def getProductions(pos):
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    pos_ProductionList = []
    for fileld in fileids:
        trees = treebank.parsed_sents(fileld)
        for tree in trees:
            productions = tree.productions()
            for production in productions:
                if str(production.lhs()) == pos:
                    pos_ProductionList.append(production)
    c_POS = Counter(pos_ProductionList)
    c_POS_sorted = sorted(c_POS.items(),
                          key=lambda asd: asd[1],
                          reverse=True)
    print(c_POS_sorted)
Example #24
def PCFG_Section():
    toy_pcfg1 = PCFG.fromstring("""
        S -> NP VP [1.0]
        NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
        Det -> 'the' [0.8] | 'my' [0.2]
        N -> 'man' [0.5] | 'telescope' [0.5]
        VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
        V -> 'ate' [0.35] | 'saw' [0.65]
        PP -> P NP [1.0]
        P -> 'with' [0.61] | 'under' [0.39]
    """)

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', pcfg_prod)
    print('pcfg_prod.lhs()  =>', pcfg_prod.lhs())
    print('pcfg_prod.rhs()  =>', pcfg_prod.rhs())
    print('pcfg_prod.prob() =>', pcfg_prod.prob())

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    for item in treebank.fileids()[:2]:
      for tree in treebank.parsed_sents(item):
        # print(" ".join(tree.leaves()))
        # perform optional tree transformations, e.g.:
        # tree.collapse_unary(collapsePOS = False)# Remove branches A-B-C into A-B+C
        # tree.chomsky_normal_form(horzMarkov = 2)# Remove A->(B,C,D) into A->B,C+D->D
        prods = tree.productions()
        # print(prods[0].prob())
        productions += prods

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    # print(grammar)    # This is a PCFG

    ### Parsing section below ###

    print("\nParse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(1)

    sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
    for parse in parser.parse(sent):
        print(parse.prob())
Example #25
def test_VBPenn():
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    VBContexts = []
    for i in range(len(fileids)):
        sentPOSList = treebank.tagged_sents(fileids[i])
        for sentPOS in sentPOSList:
            #             print 'sentPOS',sentPOS
            VBContext = getVBContext(sentPOS)
            #             print 'VBContext',VBContext
            if len(VBContext) != 0:
                POSList = ''
                for wordPOS in VBContext:
                    #                     print 'wordPOS',wordPOS
                    POSList += wordPOS[1] + '+'
                VBContexts.append((POSList, VBContext))
    print(VBContexts)
Example #26
def get_words():
    """
        Returns list of words from nltk treebank
    """
    import nltk

    nltk.download("treebank")
    from nltk.corpus import treebank

    word_ls = []
    for item in treebank.fileids():
        for (word, tag) in treebank.tagged_words(item):
            # lowercase the word so duplicates collapse in the set below
            word = word.lower()
            word_ls.append(word)

    word_ls = list(set(word_ls))
    return word_ls
Example #27
def get_trees(fileids=None, verbose=False):
	""" 
	Get the CNF trees for the treebank fileids given, or for the entire treebank
	"""
	if not fileids:
		# Get the Penn Treebank corpus
		fileids = treebank.fileids()

	# Get the sentence-trees in each file
	tree_lists = [treebank.parsed_sents(file_id) for file_id in fileids]
	trees = [sent for sent_list in tree_lists for sent in sent_list]
	if verbose:
		print("obtained", len(trees), "trees from the corpus.")

	cnf_trees = [ctc.convert_tree(t) for t in trees]
	if verbose:
		print("converted", len(trees), "trees to cnf.")

	return cnf_trees
Example #28
def make_PCFG_grammar():
    '''
    Forms a PCFG grammar from the first 1964 files
    in the WSJ treebank.
    '''
    # Save a list of all produced PCFG rules given the tested data
    PCFG_rules = []
    for item in treebank.fileids()[:1964]:
        # We want to first get rid of all non-binary branchings of the tree
        for tree in treebank.parsed_sents(item):
            tree.collapse_unary(collapsePOS = False)
            tree.chomsky_normal_form(horzMarkov = 2)
            PCFG_rules += tree.productions()

    # Induce the PCFG grammar
    S = Nonterminal('S')
    PCFG_grammar = induce_pcfg(S, PCFG_rules)

    return PCFG_grammar
Example #29
def parse(sent):
    files = tb.fileids()
    data = list(tb.parsed_sents(files))

    P_grammar, P_non_terms, P_vocab, P_term_parents, P_parents_count = pcfg.pcfg(
        data)

    words = casual_tokenize(str(sent))
    scores, backs = cky_parsing(words, copy(P_grammar), copy(P_non_terms),
                                copy(P_vocab), copy(P_term_parents),
                                copy(P_parents_count))
    start = Tree(Nonterminal('S'), [])
    if scores[0][len(words)][Nonterminal('S')] == 0:
        start = get_start(scores, len(words))
    predicted_tree = build_tree(start, 0, len(words), backs, P_non_terms)
    clean_tree(predicted_tree)
    predicted_tree.un_chomsky_normal_form()
    print('Parsed Tree')
    print(predicted_tree)
Example #30
def buildpcfg():  # call this function once then pickle the PCFG
    productions = []
    for item in treebank.fileids():
        for tree in treebank.parsed_sents(item):
            # perform optional tree transformations, e.g.:
            tree.collapse_unary(collapsePOS=False)  # Remove branches A-B-C into A-B+C
            tree.chomsky_normal_form(horzMarkov=2)  # Remove A->(B,C,D) into A->B,C+D->D
            productions += tree.productions()
            ''' this just overwrites everything to 'X' apparently
            for production in productions: #rewrite every production to use the universal tagset
                if isinstance(production.lhs(), Nonterminal) and production.lhs().symbol() != 'S':
                    production._lhs = Nonterminal(tag.map_tag('en-ptb','universal',production.lhs().symbol()))
                for elem in enumerate(production.rhs()):
                    if isinstance(elem, Nonterminal):
                        production._rhs[elem] = Nonterminal(tag.map_tag('en-ptb','universal', production.rhs()[elem].symbol()))
                        '''

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)  # induce PCFG from productions learned in treebank
    # print(grammar)
    return grammar
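A minimal sketch of the pickling step suggested in the comment above (the file name is an arbitrary choice):

import pickle

grammar = buildpcfg()
with open('treebank_pcfg.pkl', 'wb') as f:
    pickle.dump(grammar, f)

# Later, load the cached grammar instead of re-inducing it:
with open('treebank_pcfg.pkl', 'rb') as f:
    grammar = pickle.load(f)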
Example #31
def test_PennCorpus():
    from nltk.corpus import treebank
    fileids = treebank.fileids()
    ADJP_ProductionList = []
    ADVP_ProductionList = []
    NP_ProductionList = []
    VP_ProductionList = []
    for fileld in fileids:
        trees = treebank.parsed_sents(fileld)
        for tree in trees:
            productions = tree.productions()
            for production in productions:
                if str(production.lhs()) == 'ADJP':
                    ADJP_ProductionList.append(production)
                if str(production.lhs()) == 'ADVP':
                    ADVP_ProductionList.append(production)
                if str(production.lhs()) == 'NP':
                    NP_ProductionList.append(production)
                if str(production.lhs()) == 'VP':
                    VP_ProductionList.append(production)

    print(len(ADJP_ProductionList), len(ADVP_ProductionList),
          len(NP_ProductionList), len(VP_ProductionList))
    c_ADJP = Counter(ADJP_ProductionList)
    c_ADVP = Counter(ADVP_ProductionList)
    c_NP = Counter(NP_ProductionList)
    c_VP = Counter(VP_ProductionList)
    c_ADJP_sorted = sorted(c_ADJP.items(),
                           key=lambda asd: asd[1],
                           reverse=True)
    c_ADVP_sorted = sorted(c_ADVP.items(),
                           key=lambda asd: asd[1],
                           reverse=True)
    c_NP_sorted = sorted(c_NP.items(),
                         key=lambda asd: asd[1],
                         reverse=True)
    c_VP_sorted = sorted(c_VP.items(),
                         key=lambda asd: asd[1],
                         reverse=True)
Example #32
def train_pcfg():
    print('training grammar')
    productions = []
    # print len(treebank.fileids())
    trees = []
    # up to 199 less for shorter grammar for quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite
            # productions
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
            # horizontal and vertical Markovization: remember parents and siblings in tree
            #     This gives a performance boost, but makes the grammar HUGE
            #     If we use these we would need to implement a tag forgetting method
            #tree.chomsky_normal_form(horzMarkov = 0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print "grammar trained!"
    return grammar
Example #33
def main():
    # train = treebank.fileids()[:190]
    test = treebank.fileids()[190:]  # held-out files for evaluation

    # original grammar
    # pcfg = induce_grammar(train)
    # pickle.dump(pcfg, open("grammar.pcfg", 'wb'))

    # load grammar
    # pcfg : PCFG = pickle.load(open("grammar.pcfg", 'rb'))

    # fill in missing words
    # missing_words = get_missing_words(pcfg, test)
    # pcfg_unk = fill_missing_words(pcfg, missing_words)

    # pickle.dump(pcfg_unk, open("grammar_unk.pcfg", 'wb'))

    # load unk grammar
    pcfg_unk: PCFG = pickle.load(open("grammar_unk.pcfg", 'rb'))

    # use unk grammar on test sentences
    parser = ViterbiParser(pcfg_unk)
    parse_treebank(parser, test)
Example #34
def train_pcfg():
    print('training grammar')
    productions = []
    # print len(treebank.fileids())
    trees = []
    # up to 199 less for shorter grammar for quicker training
    for fileid in treebank.fileids()[0:20]:
        for tree in treebank.parsed_sents(fileid):
            # perform optional tree transformations, e.g.:
            # Remove branches A->B->C into A->B+C so we can avoid infinite
            # productions
            tree.collapse_unary(collapsePOS=False)
            # Remove A->(B,C,D) into A->B,C+D->D (binarization req'd by CKY parser)
            # horizontal and vertical Markovization: remember parents and siblings in tree
            #     This gives a performance boost, but makes the grammar HUGE
            #     If we use these we would need to implement a tag forgetting method
            #tree.chomsky_normal_form(horzMarkov = 0, vertMarkov=0)
            tree.chomsky_normal_form()
            productions += tree.productions()
    S = nltk.Nonterminal('S')
    grammar = nltk.induce_pcfg(S, productions)
    print "grammar trained!"
    return grammar
Example #35
def lab1():
    # set this to your own feature extraction function

    extractor = funtag_features.extract_features
        
    # We reserve some treebank files for testing purposes.
    # This shouldn't be touched until you have optimized your
    # results on the development set.
    td_files, test_files = train_test_split(treebank.fileids(),
                                            train_size=0.8,
                                            random_state=0)

    # Split the rest into a training and a development set.
    train_files, dev_files = train_test_split(td_files,
                                              train_size=0.8,
                                              random_state=0)

    print('Reading training trees from treebank...')
    X_train, Y_train = read_treebank_files(train_files, extractor,[])

    print('Training classifier...')
    classifier = train_scikit_classifier(X_train, Y_train)

    print('Done training.')

    print('Reading evaluation trees from treebank...')
    X_eval, Y_eval = read_treebank_files(dev_files, extractor,[])

    # When you have optimized your system for the development set,
    # you can evaluate on the test set.
    #X_eval, Y_eval = read_treebank_files(test_files, extractor, [])

    print('Running classifier on evaluation data...')

    Y_out = classifier.predict(X_eval)

    print_stats(Y_eval, Y_out)
Example #36
def perplexity():
    '''
    Given the PCFG and the parser used, run the parser on
    the rest of the treebank and calculate the perplexity
    of the model on the test sentences.
    '''

    PCFG_grammar = make_PCFG_grammar()
    parser = ViterbiParser(PCFG_grammar)
    all_p = []
    skipped_sentence = 0

    for item in treebank.fileids()[1964:]:
        trees = treebank.parsed_sents(item)
        for tree in trees:
            tree = tree.leaves()
            try:
                PCFG_grammar.check_coverage(tree)
                for parse in parser.parse(tree):
                    parse_string = str(parse)
                    p = re.search(r"p=([^/]+)", parse_string).group(1)
                    p = p[:-1]
                    all_p.append(float(p))
            except Exception:
                skipped_sentence += 1
                continue

    perplexity = 1
    N = float(len(all_p))
    for p in all_p:
        perplexity = perplexity * (1/p)
    perplexity = pow(perplexity, 1/float(N))

    print("Perplexity:", perplexity)
    print("All parse probabilities:", all_p)
    print("Skipped sentences:", skipped_sentence)
    print("PCFG grammar:", PCFG_grammar)
Example #37
def main():

    #getting all treebank files
    files = treebank.fileids()

    #grammar will be stored in dict
    grammar = {}
    for file in files:
        trees = treebank.parsed_sents(file)

        for tree in trees:
            grammar = traverse_tree(tree, grammar)

    cfg = get_grammar_series(grammar)

    cfg.to_csv("cfg_instances.csv")

    cfg.to_pickle('cfg_pickle.pickle')

    pcfg = get_probabilities(grammar)

    pcfg_series = get_grammar_series(pcfg)

    pcfg_series.to_csv('problem_1_pcfg.csv')
Example #38
files = []

def find_pronouns(tree):
    pronouns = []
    for child in tree:
        if isinstance(child, str) and child.lower() in PRONOUNS:
            pronouns.append((child.lower(), None))

        if isinstance(child, ParentedTree):
            pronouns = pronouns + find_pronouns(child)

    return pronouns

total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
            stats['pct_gendered'] = stats['gendered']/float(stats['total'])
    print(file, total)


    files.append(stats.copy())
Example #39
from nltk import ParentedTree, corpus
from nltk.corpus import treebank, names

import Queue

from collections import deque
from sys import argv

FILENAMES = [file for file in treebank.fileids()[:20]]


PRONOUNS = {'he': 'male', 'him': 'male', 'his': 'male', 'himself': 'male',
            'she': 'female', 'her': 'female', 'hers': 'female', 'herself': 'female',
            'they': 'plural', 'them': 'plural', 'their': 'plural', 'theirs': 'plural', 'themselves': 'plural',
            'it': 'singular', 'its': 'singular', 'itself': 'singular'}

PRONOUN_RESULTS = {'male': 0, 'male_total': 0, 'male_pct': 0,
                   'female': 0, 'female_total': 0, 'female_pct': 0,
                   'neutral': 0, 'neutral_total': 0, 'neutral_pct': 0,
                   'they': 0, 'they_total': 0, 'they_pct': 0,
                   'reflexive': 0, 'reflexive_total': 0, 'reflexive_pct': 0}

NAMELIST = ([(name, "male") for name in names.words("male.txt")] +
           [(name, "female") for name in names.words("female.txt")])

NOMINALS = {'NN': 'singular', 'NNS': 'plural', 'NNP': 'singular', 'NNPS': 'plural', 'PRP': 'singular'}

def update_pronoun_results(pronoun, correct):
    if pronoun in ['he', 'him', 'his', 'himself']:
        if correct:
            PRONOUN_RESULTS['male'] += 1
        else:
            PRONOUN_RESULTS['male_total'] += 1
Example #40
       tag.
    """

    return [(word, "NN") for word in sent]



# EVALUATION

import nltk
from nltk.corpus import treebank

training_size = 150
development_size = 25

training_data = list(treebank.tagged_sents(treebank.fileids()[:training_size]))
development_data = list(treebank.tagged_sents(treebank.fileids()[training_size : training_size + development_size]))

def flatten(lol):
    """Given a list of lists of values, produce a flat list of
       values by concatenating the lists."""
    return [value for l in lol for value in l]

def filter_text(corpus):
    return [[(word, tag) for (word, tag) in sent if tag != '-NONE-'] \
            for sent in corpus]

def measure_accuracy():
    """Measure the accuracy of your approach by counting the proportion
       of words for which the correct tag was recovered by your
       algorithm."""
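The snippet is cut off after this docstring. A body consistent with flatten, filter_text, development_data and the all-"NN" baseline shown at the top of this example might look like the sketch below; baseline_tag is a stand-in name for the truncated tagging function, and measure_accuracy_sketch is a hypothetical completion, not the original code.

def measure_accuracy_sketch():
    # Compare baseline tags against the gold tags on the development data,
    # ignoring '-NONE-' tokens via filter_text().
    gold_sents = filter_text(development_data)
    gold = flatten(gold_sents)
    predicted = flatten([baseline_tag([word for (word, tag) in sent])
                         for sent in gold_sents])
    correct = sum(1 for (g, p) in zip(gold, predicted) if g[1] == p[1])
    return correct / len(gold)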
Example #41
			if mytree:
				print("parsed!", mytree)
				original_trees += [tree]
				resulting_trees += [mytree]
			else:
				print("didn't parse")
				unsuccessful_parses += 1

		if original_trees:
			# If we got some successful parses, evaluate these:
			accuracy = evalb.evalb(resulting_trees, original_trees)
			count = len(original_trees)

			if verbose:
				print("In fold", i, "we parsed", count, "sentences with an accuracy of", accuracy)

			# Update the count and averages
			avg_accuracy = (avg_accuracy*parsed_num + accuracy*count)
			parsed_num += count
			avg_accuracy /= parsed_num

		unparsed_num += unsuccessful_parses

	# Aggregate results.
	if verbose:
		print("We parsed", parsed_num, "sentences with an accuracy of", avg_accuracy)
		print("We could not parse", unparsed_num, "sentences")
	return avg_accuracy

print(cross_validate(fileids=treebank.fileids()[:25], verbose=True))
Example #42
        else:
            for i in tree:
                topresent(i, tree)


def halflemmatize(word):
    if word in ['fell', 'fallen']:
        return 'fall'
    if word in ['is', 'was']:
        return 'is'
    elif word in ['are', 'were', '\'re']:
        return 'are'
    elif word in ['\'m', 'am']:
        return 'am'
    elif word in ['\'S', '\'s']:
        return 'is'
    out = wnl.lemmatize(word.lower(), 'v')
    return out


if __name__ == "__main__":
    count = 0
    for file in treebank.fileids():
        for i in treebank.parsed_sents(file):
            print(i.leaves())
            if i.leaves()[0] == 'These' and i.leaves()[1] == 'three':
                print(i)
            tofuture(i, Tree('None', []))
            print(i.leaves())
            count += 1