Beispiel #1
0
def preprocess(y, label=4):
    """Normalise tree ``y`` in place for training.

    Collapses unary chains (including POS nodes), binarises the tree with
    horizontal Markovisation of order 2 and one level of parent annotation,
    runs the project-local ``traverse_tree``/``change_labels`` passes, and
    finally forces the root label.

    :param y: nltk.Tree, modified in place.
    :param label: value assigned to the root label. Defaults to 4, the
        constant that was previously hard-coded, so existing callers are
        unaffected.
    """
    treetransforms.collapse_unary(y, collapsePOS=True)
    treetransforms.chomsky_normal_form(y, horzMarkov=2, vertMarkov=1)
    traverse_tree(y)
    # we got the modified tree so we need to calculate the scores
    change_labels(y)
    y._label = label
Beispiel #2
0
def tree2matrix(tree, cnf=True):
    """Encode a constituency tree as a leaf-by-leaf label matrix.

    Row i of the matrix describes leaf i: for every subtree of which leaf i
    is the *leftmost* leaf, the subtree's label is written at the column of
    that subtree's rightmost leaf; all other cells stay '<pad>'.

    :param tree: an nltk-style Tree (must support leaves(),
        leaf_treeposition(), label() and tuple-path indexing).
    :param cnf: when True, binarise a copy of the tree first; the input is
        never modified either way.
    :returns: (leaves, matrix, node_label, leaf_label) where node_label /
        leaf_label are the sets of internal / pre-terminal labels seen
        (suffixed "_node_label" / "_leaf_label", '<pad>' excluded).
    """
    cnf_tree = deepcopy(tree)
    if cnf:
        treetransforms.chomsky_normal_form(cnf_tree)

    node_label = set()
    leaf_label = set()
    leaves = cnf_tree.leaves()
    leaves_position = [cnf_tree.leaf_treeposition(i) for i in range(len(leaves))]
    matrix = []
    for i in range(len(leaves)):
        list_i = ['<pad>'] * len(leaves)
        leaf_i = leaves_position[i]
        for k in range(len(leaf_i) - 1, -1, -1):
            # An all-zero position suffix means leaf i is the leftmost leaf
            # of the subtree rooted at leaf_i[:k].
            if set(leaf_i[k:]) == {0}:
                tree_at_k = cnf_tree[leaf_i[:k]]
                label_k = tree_at_k.label()
                if k == len(leaf_i) - 1:
                    leaf_label.add(label_k + "_leaf_label")
                else:
                    node_label.add(label_k + "_node_label")
                # BUG FIX: this assignment was indented outside the `if`, so
                # on a failed test it re-ran with stale (or, on the first
                # iteration, unbound) tree_at_k/label_k. Moving it inside is
                # behaviourally identical on the success path and removes the
                # latent NameError.
                list_i[i + len(tree_at_k.leaves()) - 1] = label_k
        matrix.append(list_i)
    # Original added then removed '<pad>'; the net effect is a discard.
    node_label.discard('<pad>')
    leaf_label.discard('<pad>')
    return leaves, matrix, node_label, leaf_label
def get_productions(treebank):
    """
    Returns CNF rules of grammar as derived from the input treebank.
    Rules are stored in a dictionary with LHS:[RHS, RHS, RHS...] format,
    and the [RHS] list DOES include duplicates for future probability
    calculations. Also returns the start symbol of the grammar.

    :param treebank: list where each entry is a str value representing a
        tree from the input treebank file
    :returns: (rules dict, start symbol str)
    """
    # grab the top of the tree/start symbol for the grammar:
    # use first tree treebank[0], split on whitespace, use root node [0], omit "(" [1:]
    start = treebank[0].split()[0][1:]

    rules = {}
    for sentence in treebank:
        # convert the tree to CNF: collapse unary chains, then binarise
        t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)
        collapsed_tree = deepcopy(t)
        treetransforms.collapse_unary(collapsed_tree)
        cnf_tree = deepcopy(collapsed_tree)
        treetransforms.chomsky_normal_form(cnf_tree)

        # collect the production rules ("LHS -> RHS") from each tree;
        # duplicates are kept deliberately (see docstring)
        for raw_rule in cnf_tree.productions():
            lhs, _, rhs = str(raw_rule).partition(" -> ")
            rules.setdefault(lhs, []).append(rhs)
    # rules now contains LHS:[RHS, RHS, RHS...] pairs
    return rules, start
Beispiel #4
0
def check_binary(tree_string):
    """Assert that the bracketed tree in *tree_string* is already binary.

    Applies NLTK's CNF transform to a copy and asserts that the transform
    was a no-op — i.e. the original tree was already in Chomsky normal form.
    Raises AssertionError showing both forms when they differ.
    """
    parsed = Tree.fromstring(tree_string)
    transformed = deepcopy(parsed)
    treetransforms.chomsky_normal_form(transformed)
    # Whitespace-normalise both renderings before comparing.
    original = ' '.join(str(parsed).split())
    binary = ' '.join(str(transformed).split())
    assert original == binary, f'Different: {original} ####### {binary}'
Beispiel #5
0
    def get_grammar(cls, train_trees, starting_symb='SENT'):
        """
        Return the PCFG grammar induced from the training set.

        Each training tree is modified in place: unary chains are collapsed
        and the tree is converted to Chomsky normal form (horizontal
        Markovisation of order 2) before its productions are collected.

        Inputs:
        -------

        train_trees (list): List of trees to perform training on
        starting_symb (str): The root symbol of the grammar
        """
        productions = []

        # Chomsky Normal Form
        for tree in train_trees:
            
            # Remove unary rules
            treetransforms.collapse_unary(tree)

            # Transform to CNF
            treetransforms.chomsky_normal_form(tree, horzMarkov=2)

            # Compute productions and store them
            productions += tree.productions()

        # Define the root symbol
        SENT = Nonterminal(starting_symb)

        # Compute the grammar using PCFG
        grammar = induce_pcfg(SENT, productions)

        # NOTE(review): NLTK grammar.chomsky_normal_form() returns a NEW
        # grammar rather than mutating in place, so this discarded return
        # value looks like a no-op — confirm intent.
        grammar.chomsky_normal_form()

        return grammar
Beispiel #6
0
def trained_pcfg():
    """Return a PCFG induced from the WSJ treebank, cached on disk.

    Loads the grammar from ``pcfgcache.pkl`` when present; otherwise trains
    it (collapse unary chains, binarise, induce PCFG with start symbol 'S'),
    writes the cache and returns it.

    NOTE(review): pickle.load on the cache file executes arbitrary code if
    the file is tampered with — only use a trusted local cache.
    """
    try:
        # renamed from `input`/`output`, which shadowed the builtins
        with open("pcfgcache.pkl", 'rb') as cache_in:
            print("Loading the PCFG...")
            gram = pickle.load(cache_in)
        print("Loaded!")
        return gram
    except FileNotFoundError:
        print("Training the PCFG...")
        ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ
            'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
            cat_file='allcats.txt', tagset='wsj')
        productions = []
        tb = treebank
        # Search for nltk_data/corpora/ptb and place all the wsj/XX/*.mrg files in
        useFullTreeBank = True
        if useFullTreeBank:
            tb = ptb
        for n, t in enumerate(tb.parsed_sents()):
            if n % 200 == 0:
                print(n)  # progress indicator
            collapse_unary(t, True)
            chomsky_normal_form(t)
            productions.extend(t.productions())
        gram = grammar.induce_pcfg(grammar.Nonterminal('S'), productions)
        print("Trained!")
        print("Writing the PCFG...")
        with open("pcfgcache.pkl", 'wb') as cache_out:
            pickle.dump(gram, cache_out, -1)
        print("Write successful!")
        return gram
Beispiel #7
0
def prepapre_sentiment_data():
    """Parse the rt-polarity sentences, binarise the parses and pickle them.

    Reads the negative/positive sentence files, parses each sentence,
    converts the parse to Chomsky normal form, and dumps (trees, labels)
    (label 0 = negative, 1 = positive) to ``hynguyen.pickle``. Sentences
    the parser cannot handle are skipped.

    NOTE: the function name keeps the original (misspelled) public spelling
    for backward compatibility with existing callers.
    """
    with open("../data/rt-polarity.neg.txt", mode="r") as f:
        neg_sent = f.readlines()

    with open("../data/rt-polarity.pos.txt", mode="r") as f:
        pos_sent = f.readlines()

    sents_label = [0] * len(neg_sent) + [1] * len(pos_sent)

    trees = []
    labels = []

    for count, (sent, label) in enumerate(zip(neg_sent + pos_sent, sents_label), start=1):
        if count % 200 == 0:
            print(count)  # progress indicator
        try:
            parsed = list(parser.raw_parse(sent))
            hytree = parsed[0]
            chomsky_normal_form(hytree)
            trees.append(hytree[0])
            labels.append(label)
        except Exception:
            # Best-effort: skip sentences the parser fails on. Narrowed from
            # the original bare `except:`, which also swallowed
            # KeyboardInterrupt and SystemExit.
            continue

    with open("hynguyen.pickle", mode="wb") as f:
        pickle.dump((trees, labels), f)
Beispiel #8
0
def generate_data_v2(tree, cnf=True):
    """Breadth-first encode *tree* into parallel node/label/span lists.

    When ``cnf`` is True the tree is copied, binarised and its leaves padded
    first; the input tree is never modified.

    :returns: (input_node, input_label, input_index, bf_tree, bf_lst_tree)
        where input_index[i] = [first_leaf, last_leaf], the leaf span covered
        by the i-th node in breadth-first order.
    """
    if cnf:
        cnf_tree = deepcopy(tree)
        treetransforms.chomsky_normal_form(cnf_tree)
        try:
            pad_cnf_tree = deepcopy(cnf_tree)
        except RecursionError as e:
            print(f'Error copy tree')
            raise e
        padding_leaves(pad_cnf_tree)
    else:
        pad_cnf_tree = deepcopy(tree)

    bf_tree, bf_lst_tree, bf_meta = bft(pad_cnf_tree)
    input_node = []
    input_label = []
    input_index = []
    # Map leaf tree-position -> leaf index. A dict lookup replaces the
    # original leaves_location.index() call, which was O(n) per node.
    leaf_index = {
        pad_cnf_tree.leaf_treeposition(i): i
        for i in range(len(pad_cnf_tree.leaves()))
    }
    for node, node_path in zip(bf_tree, bf_lst_tree):
        if len(node.leaves()) > 1:
            # '|' marks an artificial node introduced by the CNF binarisation
            if '|' in node.label():
                input_node.append("SPLIT_NODE_node_label")
            else:
                input_node.append(node.label() + "_node_label")
            input_label.append('<pad>')
        else:
            input_label.append(node.label() + "_leaf_label")
            # assumes padded leaves carry a 7-character prefix that is
            # stripped here — TODO confirm against padding_leaves()
            input_node.append(node.leaves()[0][7:])
        # Absolute position of this node's leftmost leaf = the node's own
        # breadth-first path plus the relative position of its first leaf.
        first_leaf = leaf_index[tuple(node_path) + tuple(node.leaf_treeposition(0))]
        last_leaf = first_leaf + len(node.leaves()) - 1
        input_index.append([first_leaf, last_leaf])
    return input_node, input_label, input_index, bf_tree, bf_lst_tree
Beispiel #9
0
def get_test_dataset(treebank, n):
    """Return up to 50 CNF-transformed test trees from *treebank*.

    BUG FIX: the original nested loop re-iterated ``parsed_sents()[:i+1]``
    for every i in range(400, n+400), appending (and re-binarising) the same
    trees hundreds of times, and its exact ``len == 50`` check could be
    jumped over entirely. Each candidate tree is now processed exactly once
    and the collection stops at 50 trees.

    NOTE(review): the slice starts at index 400 to mirror the original
    ``range(400, n+400)`` intent of skipping the training prefix — confirm
    against the companion training split (pcfg_learn uses trees [:n]).
    """
    test_trees = list()
    for tree in treebank.parsed_sents()[400:400 + n]:
        chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
        test_trees.append(tree)
        if len(test_trees) >= 50:
            break
    return test_trees
Beispiel #10
0
 def chomsky_normal_form(self):
   """Replace self.parsed_sentences with Chomsky-normal-form copies.

   Each parse tree is copied, its unary chains are collapsed, and the
   collapsed copy is binarised. Trees that fail at any step are silently
   dropped (deliberate best-effort behaviour).
   """
   converted = []
   for parsed in self.parsed_sentences:
     try:
       collapsed = deepcopy(parsed)
       treetransforms.collapse_unary(collapsed)
       normalized = deepcopy(collapsed)
       treetransforms.chomsky_normal_form(normalized)
       converted.append(normalized)
     except Exception:
       # best-effort: skip trees that cannot be converted
       pass
   self.parsed_sentences = converted
Beispiel #11
0
def demo():
    """
    Walk through each NLTK tree transform on a sample WSJ tree and draw
    the intermediate results.
    """

    from copy import deepcopy

    from nltk import tree, treetransforms
    from nltk.draw.tree import draw_trees

    # original tree from WSJ bracketed text
    sentence = """(TOP
  (S
    (S
      (VP
        (VBN Turned)
        (ADVP (RB loose))
        (PP
          (IN in)
          (NP
            (NP (NNP Shane) (NNP Longman) (POS 's))
            (NN trading)
            (NN room)))))
    (, ,)
    (NP (DT the) (NN yuppie) (NNS dealers))
    (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
    (. .)))"""
    wsj_tree = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # 1. collapse subtrees with only one child
    collapsed = deepcopy(wsj_tree)
    treetransforms.collapse_unary(collapsed)

    # 2. plain Chomsky normal form
    cnf = deepcopy(collapsed)
    treetransforms.chomsky_normal_form(cnf)

    # 3. CNF with one level of parent annotation and order-2 horizontal smoothing
    annotated = deepcopy(collapsed)
    treetransforms.chomsky_normal_form(annotated, horzMarkov=2, vertMarkov=1)

    # 4. undo the transform (used to make CYK results comparable)
    restored = deepcopy(annotated)
    treetransforms.un_chomsky_normal_form(restored)

    # round-trip back to bracketed text and compare with the input
    round_trip = restored.pprint()
    print(sentence)
    print(round_trip)
    print("Sentences the same? ", sentence == round_trip)

    draw_trees(wsj_tree, collapsed, cnf, annotated, restored)
Beispiel #12
0
def demo():
    """
    A demonstration showing how each tree transform can be used.

    Modernised from Python 2: print statements became the print() function
    and the removed Tree.parse() was replaced by Tree.fromstring(), matching
    the Python 3 version of this demo elsewhere in the file.
    """

    from nltk.draw.tree import draw_trees
    from nltk import tree, treetransforms
    from copy import deepcopy

    # original tree from WSJ bracketed text
    sentence = """(TOP
  (S
    (S
      (VP
        (VBN Turned)
        (ADVP (RB loose))
        (PP
          (IN in)
          (NP
            (NP (NNP Shane) (NNP Longman) (POS 's))
            (NN trading)
            (NN room)))))
    (, ,)
    (NP (DT the) (NN yuppie) (NNS dealers))
    (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
    (. .)))"""
    t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    treetransforms.collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results comparable)
    original = deepcopy(parentTree)
    treetransforms.un_chomsky_normal_form(original)

    # convert tree back to bracketed text
    sentence2 = original.pprint()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
Beispiel #13
0
def pcfg_learn(treebank, n):
    """Induce a PCFG (start symbol 'S') from the first *n* trees of *treebank*.

    Each tree is converted to CNF in place, its productions are collected
    via tree_to_productions, and a PCFG is induced from the aggregate.

    BUG FIX: the original nested loop iterated ``parsed_sents()[:i+1]`` for
    every i < n, so tree j was re-binarised and its productions counted
    n-j times, skewing the induced probabilities towards early trees. Each
    tree is now processed exactly once.
    """
    productions = list()
    for tree in treebank.parsed_sents()[:n]:
        chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
        # tree_to_productions is a generator; consuming it directly replaces
        # the original manual next()/while/except loop.
        productions.extend(tree_to_productions(tree))
    productions = get_productions(productions)
    return PCFG(Nonterminal('S'), productions)
Beispiel #14
0
    def get_trees(cls, path_to_dataset, train_split=0.8):
        """
        Return the train, eval and test sets as lists of trees.

        The corpus is read line by line and functional labels (e.g. ``-SBJ``)
        are stripped with a regex. Test and eval trees are collapsed and
        converted to CNF in place; training trees are left untouched.

        Inputs:
        -------

        path_to_dataset (str): The path to the corpus to be split
        train_split (float): Proportion of the data used for training
        """
        sentences = []
        print('Collecting training, test and evaluation trees')
        with open(path_to_dataset) as f:

            for sentence in f:
                # Removes functional labels
                sent = re.sub(r'-\w+\ ', " ", sentence)
                sentences.append(sent.rstrip())

        # Split Train / (Test + Eval)
        train_sent, test_sent = train_test_split(sentences, train_size=train_split, test_size=1 - train_split, shuffle=False)

        # Split Test / Eval
        eval_sent, test_sent = train_test_split(test_sent, train_size=0.5, shuffle=False)

        print(f'The total number of sentences {len(sentences)}')
        print(f'Number of train sentences {len(train_sent)} -- {round(100 * len(train_sent)  / len(sentences), 1)} %')
        print(f'Number of test sentences {len(test_sent)} -- {round(100 * len(test_sent)  / len(sentences), 1) } %')
        print(f'Number of evaluation sentences {len(eval_sent)} -- {round(100 * len(eval_sent)  / len(sentences), 1)} %')

        train_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in train_sent]
        test_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in test_sent]
        eval_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in eval_sent]

        # BUG FIX: the original zipped test_trees with eval_trees, so when
        # the 50/50 split produced lists of different lengths the extra tree
        # was never transformed. Transform every tree in both lists.
        for t in test_trees + eval_trees:
            # Remove unary rules, then binarise with order-2 horizontal
            # Markovisation
            treetransforms.collapse_unary(t)
            treetransforms.chomsky_normal_form(t, horzMarkov=2)

        return train_trees, eval_trees, test_trees
Beispiel #15
0
    def extract_information(ori_bin_line):
        """Parse one labelled sentiment line into a JSON-serialisable dict.

        BUG FIX: the original body read a ``line`` name from the enclosing
        scope and never used its ``ori_bin_line`` parameter; it now operates
        on the argument it is given.

        :param ori_bin_line: bracketed sentiment-tree string (one corpus line)
        :returns: dict with the sentiment label, the plain sentence and its
            constituency-tree encoding.
        :raises: re-raises any exception from get_tree_encodings after
            printing debugging context.
        """
        label = get_label(ori_bin_line)
        sentence = ' '.join(Tree.fromstring(ori_bin_line).flatten())

        # Re-parse the raw sentence, binarise and pad the parse tree.
        parsed = list(get_corenlp_parser().parse_text(sentence))[0]
        cnfTree = deepcopy(parsed)
        treetransforms.chomsky_normal_form(cnfTree)
        pad_cnf_tree = deepcopy(cnfTree)
        padding_leaves(pad_cnf_tree)

        binary_tree = clean_node(pad_cnf_tree)

        tree = remove_atnodeset_single_nodeset(binary_tree)
        tree = remove_leaf_label(tree)
        tree_string = ' '.join(str(tree).split())

        # Strip node labels (including CNF artifacts like "NP|<JJ-NN>") down
        # to bare opening brackets.
        tree_string_replace = re.sub(r'\([\w<>\|\.:\'`$,-]+\s', ' ( ',
                                     tree_string)
        try:
            tree_enc = get_tree_encodings(tree_string_replace)
        except Exception as e:
            # Dump enough context to reproduce the failing line, then re-raise.
            tree.pretty_print()
            print(f'get_tree_encodings for line')
            print(ori_bin_line)
            print(tree_string)
            print(tree_string_replace)
            print()
            tree_from_string(ori_bin_line).pretty_print()
            sentiment_tree_enc = get_tree_encodings(ori_bin_line)
            print(sentiment_tree_enc)
            raise e

        obj = {
            "label": label,
            "sentence": sentence,
            "constituency_tree_encoding": tree_enc
        }
        return obj
Beispiel #16
0
def generate_all_productions(records, productions_file):
    """Collect CNF productions from bracketed-tree lines and write them to a file.

    Each sufficiently long line in *records* is parsed as a tree, binarised
    with parent annotation, and its productions appended. The output file
    starts with a '%start <root>' marker taken from the last parsed tree
    (matching the original behaviour).

    :param records: iterable of bracketed-tree strings
    :param productions_file: path of the file to write productions to
    """
    productions = []
    last_tree = None
    for line in records:
        if len(line) > 2:  # skip blank / trivially short lines
            last_tree = Tree.fromstring(line)  # generate tree from string
            parent_tree = deepcopy(last_tree)
            # parent annotation (vertMarkov=3) and horizontal smoothing of order two
            treetransforms.chomsky_normal_form(
                parent_tree, horzMarkov=2, vertMarkov=3
            )
            # extend() replaces the original quadratic list re-concatenation
            productions.extend(parent_tree.productions())

    with open(productions_file, 'w') as prod_write:
        # explicitly set the start marker
        prod_write.write('%start ' + str(last_tree.label()) + '\n')
        for prod in productions:
            prod_write.write(str(prod) + "\n")
    print("Production rules generated")
Beispiel #17
0
def pcfg_cnf_learn(treebank, n):
    """Yield the first *n* parsed sentences of *treebank*, filtered and
    converted in place to Chomsky normal form (right factoring, h=1/v=1
    Markovisation)."""
    for parsed in treebank.parsed_sents()[:n]:
        cleaned = filter_tree(parsed)
        chomsky_normal_form(cleaned, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
        yield cleaned
Beispiel #18
0
    print x_axis
    print y_axis
    import matplotlib.pyplot as plt
    plt.title("Accuracy per distance")
    plt.scatter(x_axis, y_axis, c="blue", marker='*', label="accuracy index")
    plt.scatter(x_axis_labeled, y_axis_labeled, c="red", marker='o', label="accuracy label", alpha=0.5)
    plt.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=2, mode="expand", borderaxespad=0.)
    plt.show()

if __name__ == '__main__':
    treebank = LazyCorpusLoader('treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg')
    trees = treebank.parsed_sents()

    trees = trees[:5]
    cleaned_trees = [filter_tree(tree) for tree in trees]
    for t in cleaned_trees:
        # BUG FIX: the original transformed `tree` (a stale binding leaked
        # from the Python 2 list comprehension above) instead of the loop
        # variable `t`, so only one tree was ever binarised — five times over.
        chomsky_normal_form(t, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')

    parser, pcfg = get_parser(cleaned_trees)
    eval_trees(cleaned_trees, parser, pcfg)

    # Python 3 print function (the original used Python 2 print statements)
    print("----------- Reporting Per Label -----------")
    print(ACCURACY_PER_LABEL)
    print(len(ACCURACY_PER_LABEL))
    for item in ACCURACY_PER_LABEL:
        print(item, "--- total -------> ", ACCURACY_PER_LABEL[item]['total'])
        print(item, "--- precision ---> ", ACCURACY_PER_LABEL[item]['matches'] / float(ACCURACY_PER_LABEL[item]['total']))
    print('&' * 100)

    print(ACCURACY_PER_DISTANCE)
    calculate_accuracy_per_distance()
Beispiel #19
0
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())

    with open("data/rt-polarity.pos.txt",mode="r") as f:
        pos_sent.append(f.readline())
        pos_sent.append(f.readline())
        pos_sent.append(f.readline())


    trees = []
    labels = [0]*3 + [1]*3
    sents = pos_sent + neg_sent
    for sent in sents:
        a = list(parser.raw_parse(sent))
        hytree = a[0]
        chomsky_normal_form(hytree)
        trees.append(hytree[0])

    rnn = RecursiveNeuralNetworl(embsize=300,mnb_size=6,wordvector=wordvector)

    trees[0].pretty_print()

    for tree,label in zip(trees,labels):
        root_node, softmax_layer, cost, pred = rnn.forward(tree,label)
        print("correct {0}, predict {1}, cost {2}".format(label,pred,cost))




    # if have_1_child
        # child_have_1_child_and_unicode
Beispiel #20
0
def preprocess_trees_cnf(trees):
    """Preprocess every tree in *trees* in place and convert it to CNF.

    BUG FIX: the original passed ``factor= 1``, but NLTK's
    chomsky_normal_form expects the string 'left' or 'right' for *factor*.
    'right' matches the factoring used by the other CNF calls in this
    codebase.
    """
    for tree in trees:
        preprocess_tree(tree)
        chomsky_normal_form(tree, factor='right', horzMarkov=1, vertMarkov=1, childChar='|', parentChar='^')
Beispiel #21
0
def convert_tree_to_cnf(tree):
    """Convert *tree* to Chomsky normal form in place and relabel every
    non-empty subtree to '0'.

    BUG FIX: the original iterated tree.treepositions(), which also yields
    leaf positions; leaves are plain strings, so calling set_label() on a
    non-empty leaf raised AttributeError. subtrees() yields only Tree nodes.
    """
    treetransforms.chomsky_normal_form(tree)
    for subtree in tree.subtrees():
        if len(subtree) != 0:
            subtree.set_label('0')