Example #1
0
    def test_tree_trees_from_string(self):
        """Check how Tree.trees_from_string splits text into trees."""
        # A single tree with irregular spacing is rendered normalized.
        single = Tree.trees_from_string('(S1(X    (NN junk)))')
        assert len(single) == 1
        assert str(single[0]) == '(S1 (X (NN junk)))'

        # Four trees in one string: two of them span several lines and a
        # blank line sits between the third and the fourth.
        multi_tree_text = (
            '(S1 (S (NP (DT This))\n'
            '    (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .)))\n'
            '(S1 (X    (NN junk)))\n'
            '(S1(X(NN nospace)))\n'
            '\n'
            '(S1 (NP (DT another)\n'
            '                                    (JJ junk)\n'
            '\n'
            '        (NN tree)))\n'
        )
        several = Tree.trees_from_string(multi_tree_text)
        assert len(several) == 4
        expected_renderings = [
            sample_tree,
            '(S1 (X (NN junk)))',
            '(S1 (X (NN nospace)))',
            '(S1 (NP (DT another) (JJ junk) (NN tree)))',
        ]
        for position, rendering in enumerate(expected_renderings):
            assert str(several[position]) == rendering

        # Empty input yields no trees at all.
        assert len(Tree.trees_from_string('')) == 0

        # The canonical sample tree survives a round trip unchanged.
        round_trip = Tree.trees_from_string(sample_tree)
        assert len(round_trip) == 1
        assert str(round_trip[0]) == sample_tree
Example #2
0
    def test_tree_trees_from_string(self):
        """Check how Tree.trees_from_string splits text into trees."""
        # A single tree with irregular spacing is rendered normalized.
        single = Tree.trees_from_string('(S1(X    (NN junk)))')
        assert len(single) == 1
        assert str(single[0]) == '(S1 (X (NN junk)))'

        # Four trees in one string: two of them span several lines and a
        # blank line sits between the third and the fourth.
        multi_tree_text = (
            '(S1 (S (NP (DT This))\n'
            '    (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .)))\n'
            '(S1 (X    (NN junk)))\n'
            '(S1(X(NN nospace)))\n'
            '\n'
            '(S1 (NP (DT another)\n'
            '                                    (JJ junk)\n'
            '\n'
            '        (NN tree)))\n'
        )
        several = Tree.trees_from_string(multi_tree_text)
        assert len(several) == 4
        expected_renderings = [
            sample_tree,
            '(S1 (X (NN junk)))',
            '(S1 (X (NN nospace)))',
            '(S1 (NP (DT another) (JJ junk) (NN tree)))',
        ]
        for position, rendering in enumerate(expected_renderings):
            assert str(several[position]) == rendering

        # Empty input yields no trees at all.
        assert len(Tree.trees_from_string('')) == 0

        # The canonical sample tree survives a round trip unchanged.
        round_trip = Tree.trees_from_string(sample_tree)
        assert len(round_trip) == 1
        assert str(round_trip[0]) == sample_tree
Example #3
0
 def test_tree_nltk(self):
     """Exercise the conversion of a Tree into an NLTK tree."""
     parse_text = ('(S1 (NP (NN NLTK)) '
                   '(VP (VBZ is) (NP (DT a) (NN dependency))))')
     tree = Tree(parse_text)
     converted = tree.as_nltk_tree()

     # The converted tree has two children and four leaves.
     self.assertEqual(len(converted), 2)
     leaf_count = len(converted.leaves())
     self.assertEqual(leaf_count, 4)

     # Both objects render to the same string and share the root label.
     self.assertEqual(str(converted), str(tree))
     self.assertEqual(converted.label(), 'S1')

     # An unrecognized visualization method name must raise ValueError.
     with self.assertRaises(ValueError):
         tree.visualize('bad vis method')
Example #4
0
 def test_tree_nltk(self):
     """Exercise the conversion of a Tree into an NLTK tree."""
     parse_text = ('(S1 (NP (NN NLTK)) '
                   '(VP (VBZ is) (NP (DT a) (NN dependency))))')
     tree = Tree(parse_text)
     converted = tree.as_nltk_tree()

     # The converted tree has two children and four leaves.
     self.assertEqual(len(converted), 2)
     leaf_count = len(converted.leaves())
     self.assertEqual(leaf_count, 4)

     # Both objects render to the same string and share the root label.
     self.assertEqual(str(converted), str(tree))
     self.assertEqual(converted.label(), 'S1')

     # An unrecognized visualization method name must raise ValueError.
     with self.assertRaises(ValueError):
         tree.visualize('bad vis method')
Example #5
0
   def test_tree_asciitree(self):
       """Compare format_asciitree() output with the expected drawing."""
       parse_text = ('(S1 (NP (NN asciitree)) '
                     '(VP (VBZ is) (NP (DT a) (NN dependency))))')
       tree = Tree(parse_text)

       # Rows for nodes without a token end in a single space, so every
       # row is its own literal to keep that whitespace visible.
       expected_rows = [
           'S1 ',
           ' +-- NP ',
           ' |  +-- NN asciitree',
           ' +-- VP ',
           '    +-- VBZ is',
           '    +-- NP ',
           '       +-- DT a',
           '       +-- NN dependency',
       ]
       rendered = tree.format_asciitree().strip()
       expected = '\n'.join(expected_rows).strip()
       self.assertEqual(rendered, expected)
Example #6
0
   def test_tree_asciitree(self):
       """Compare format_asciitree() output with the expected drawing."""
       parse_text = ('(S1 (NP (NN asciitree)) '
                     '(VP (VBZ is) (NP (DT a) (NN dependency))))')
       tree = Tree(parse_text)

       # Rows for nodes without a token end in a single space, so every
       # row is its own literal to keep that whitespace visible.
       expected_rows = [
           'S1 ',
           ' +-- NP ',
           ' |  +-- NN asciitree',
           ' +-- VP ',
           '    +-- VBZ is',
           '    +-- NP ',
           '       +-- DT a',
           '       +-- NN dependency',
       ]
       rendered = tree.format_asciitree().strip()
       expected = '\n'.join(expected_rows).strip()
       self.assertEqual(rendered, expected)
Example #7
0
    def test_determine_head(self):
        """Check SemanticHeadFinder.determine_head on a set of parses.

        ``trees`` and ``heads`` are parallel lists: the value returned by
        ``determine_head`` for the n-th tree is compared with the n-th
        entry of ``heads``.
        """
        trees = [
            "(S1 (S (WHNP (WP What) (NN year)) (VP (VBD did) (S (NP (DT the) (NNP Titanic)) (VP (VB sink))))))",
            "(S1 (SBARQ (WHADVP (WRB How) (RB far)) (SQ (VBZ is) (NP (PRP it)) (PP (PP (IN from) (NP (NNP Denver))) (PP (TO to) (NP (NNP Aspen))))) (. ?)))",
            "(S1 (SBARQ (WHNP (WP What) (NN county)) (SQ (VBZ is) (NP (NP (NNP Modesto)) (, ,) (NP (NNP California))) (ADVP (RB in))) (. ?)))",
            "(S1 (SBARQ (WHNP (WP Who)) (SQ (VP (VBD was) (NP (NNP Galileo)))) (. ?)))",
            "(S1 (SBARQ (WHNP (WP What)) (SQ (VBZ is) (NP (DT an) (NN atom))) (. ?)))",
            "(S1 (SBARQ (WHADVP (WRB When)) (SQ (VBD did) (NP (NNP Hawaii)) (VP (VB become) (NP (DT a) (NN state)))) (. ?)))",
            "(S1 (SBARQ (WHNP (WRB How) (JJ tall)) (SQ (VP (VBZ is) (NP (DT the) (NNP Sears) (NNP Building)))) (. ?)))",
            "(S1 (S (NP (NNP George) (NNP Bush)) (VP (VBD purchased) (NP (NP (DT a) (JJ small) (NN interest)) (PP (IN in) (WHNP (WDT which) (NN baseball) (NN team))))) (. ?)))",
            "(S1 (SBARQ (WHNP (WP What)) (SQ (VBZ is) (NP (NP (NNP Australia) (POS 's)) (JJ national) (NN flower))) (. ?)))",
            "(S1 (SBARQ (WHADVP (WRB Why)) (SQ (VBZ does) (NP (DT the) (NN moon)) (VP (VB turn) (NP (NN orange)))) (. ?)))",
            "(S1 (SBARQ (WHNP (WP What)) (SQ (VBZ is) (NP (NN autism))) (. ?)))",
        ]

        heads = [
            "year", "it", "county", "Galileo", "atom", "Hawaii", "Building",
            "Bush", "flower", "moon", "autism"
        ]

        # A unittest assertion is used for the fixture sanity check because
        # a bare ``assert`` is stripped under ``python -O``, and zip() below
        # would then silently ignore any unpaired entries.
        self.assertEqual(len(heads), len(trees))

        head_finder = SemanticHeadFinder()
        for expected_head, parse in zip(heads, trees):
            self.assertEqual(expected_head,
                             head_finder.determine_head(Tree(parse)))
Example #8
0
    def test_tree_errors(self):
        """Malformed or non-string input to Tree() must raise cleanly."""
        # test issue #33
        with self.assertRaises(RuntimeError):
            Tree('(())')
        # make sure we can still load good trees after an error
        Tree(sample_tree)
        Tree(sample_tree)

        malformed_inputs = (
            '(BADTOPTAG hi)',
            'Does not start with a paren',
            '(S1 eh)',
            '(S1',
            '(S1 ((',
            '(S1 (NP',
        )
        for text in malformed_inputs:
            self.assertRaises(RuntimeError, Tree, text)
        # a good tree still loads after the run of failures
        Tree(sample_tree)

        # values that are not strings are rejected with TypeError
        for not_a_string in (1, None, {}, len):
            self.assertRaises(TypeError, Tree, not_a_string)
Example #9
0
    def test_tree_modify_tree(self):
        """Mutate labels, tokens and label suffixes, then slice children."""
        # Rendering of the object NP, shared by the expectations below.
        object_np = ('(NP (DT a) (ADJP (RB fairly) (JJ simple)) '
                     '(NN parse) (NN tree))')

        tree = Tree(sample_tree)
        assert str(tree) == sample_tree
        assert str(tree[0][1][0]) == '(VBZ is)'

        # Relabel the verb, then replace its token.
        tree[0][1][0].label = 'ZZZ'
        assert str(tree[0][1][0]) == '(ZZZ is)'

        tree[0][1][0].token = 'displays'
        assert str(tree[0][1][0]) == '(ZZZ displays)'
        assert str(tree) == ('(S1 (S (NP (DT This)) (VP (ZZZ displays) ' +
                             object_np + ') (. .)))')

        # A label suffix shows up in the rendered label.
        tree[0][1].label_suffix = '-SUFFIX'
        assert tree[0][1].label_suffix == '-SUFFIX'
        assert str(tree[0][1]) == ('(VP-SUFFIX (ZZZ displays) ' +
                                   object_np + ')')

        # Assigning a token to tree[0], or None to the last child of
        # tree[0], must raise ValueError.
        first_child = tree[0]
        with self.assertRaises(ValueError):
            first_child.token = 'anything'
        last_grandchild = tree[0][-1]
        with self.assertRaises(ValueError):
            last_grandchild.token = None

        tree[0][-1].token = '!'
        assert str(tree) == ('(S1 (S (NP (DT This)) (VP-SUFFIX (ZZZ '
                             'displays) ' + object_np + ') (. !)))')

        # Clearing the suffix restores the plain label.
        tree[0][1].label_suffix = ''
        assert str(tree) == ('(S1 (S (NP (DT This)) (VP (ZZZ displays) ' +
                             object_np + ') (. !)))')

        # slice testing
        leading = tree[0][1][0:1]
        assert len(leading) == 1
        assert str(leading[0]) == '(ZZZ displays)'

        trailing = tree[0][1][-2:]
        assert len(trailing) == 2
        assert str(trailing[0]) == '(ZZZ displays)'
        assert str(trailing[1]) == object_np
Example #10
0
    def test_tree_trees_from_file(self):
        """Write several trees to a temporary file and read them back.

        The file is created with ``delete=False`` so that it can be closed
        before ``Tree.trees_from_file`` reopens it by name; it is removed
        explicitly once the test is over, whether it passed or failed.
        """
        import os
        import tempfile

        tree_text = (
            '(S1 (S (NP (DT This))\n'
            '    (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .)))\n'
            '(S1 (X    (NN junk)))\n'
            '(S1(X(NN nospace)))\n'
            '\n'
            '(S1 (NP (DT another)\n'
            '                                    (JJ junk)\n'
            '\n'
            '        (NN tree)))\n'
        )

        tree_file = tempfile.NamedTemporaryFile('w+t', delete=False)
        try:
            tree_file.write(tree_text)
            # Closing flushes the data and releases the handle, so the
            # file can be reopened by name on every platform.
            tree_file.close()

            trees2 = Tree.trees_from_file(tree_file.name)
            assert len(trees2) == 4
            assert str(trees2[0]) == sample_tree
            assert str(trees2[1]) == '(S1 (X (NN junk)))'
            assert str(trees2[2]) == '(S1 (X (NN nospace)))'
            assert str(trees2[3]) == \
                '(S1 (NP (DT another) (JJ junk) (NN tree)))'
        finally:
            # close() is a no-op if the file was already closed above.
            tree_file.close()
            os.remove(tree_file.name)
Example #11
0
    def test_tree_trees_from_file(self):
        """Write several trees to a temporary file and read them back.

        The file is created with ``delete=False`` so that it can be closed
        before ``Tree.trees_from_file`` reopens it by name; it is removed
        explicitly once the test is over, whether it passed or failed.
        """
        import os
        import tempfile

        tree_text = (
            '(S1 (S (NP (DT This))\n'
            '    (VP (VBZ is) (NP (DT a) (ADJP (RB fairly) (JJ simple)) (NN parse) (NN tree))) (. .)))\n'
            '(S1 (X    (NN junk)))\n'
            '(S1(X(NN nospace)))\n'
            '\n'
            '(S1 (NP (DT another)\n'
            '                                    (JJ junk)\n'
            '\n'
            '        (NN tree)))\n'
        )

        tree_file = tempfile.NamedTemporaryFile('w+t', delete=False)
        try:
            tree_file.write(tree_text)
            # Closing flushes the data and releases the handle, so the
            # file can be reopened by name on every platform.
            tree_file.close()

            trees2 = Tree.trees_from_file(tree_file.name)
            assert len(trees2) == 4
            assert str(trees2[0]) == sample_tree
            assert str(trees2[1]) == '(S1 (X (NN junk)))'
            assert str(trees2[2]) == '(S1 (X (NN nospace)))'
            assert str(trees2[3]) == \
                '(S1 (NP (DT another) (JJ junk) (NN tree)))'
        finally:
            # close() is a no-op if the file was already closed above.
            tree_file.close()
            os.remove(tree_file.name)
Example #12
0
from __future__ import print_function
from bllipparser import Tree
from collections import defaultdict
import gzip, sys

if __name__ == '__main__':
    # Usage: python create_vocab.py train.gz count
    # Reads one tree per line from the gzipped file, counts lower-cased
    # tokens and prints every word seen more than `count` times.
    if len(sys.argv) != 3:
        print('usage: python create_vocab.py train.gz count')
        sys.exit(0)

    threshold = int(sys.argv[2])
    counts = defaultdict(int)
    # The with-block closes the gzip handle once all lines are consumed.
    # NOTE(review): in 'rb' mode Python 3 yields bytes lines; confirm that
    # Tree() accepts them, or switch to text mode when porting.
    with gzip.open(sys.argv[1], 'rb') as train_file:
        for line in train_file:
            for word in Tree(line).tokens():
                counts[word.lower()] += 1

    # dict.items() exists on Python 2 and 3 alike; iteritems() does not.
    for w, c in counts.items():
        if c > threshold:
            print(w)
Example #13
0
import fileinput
from bllipparser import RerankingParser, Tree

if __name__ == '__main__':
    # Re-parse the tokens of every input tree and print the n-best list:
    # first the number of parses, then one parse per line.
    rrp = RerankingParser()
    parser = 'wsj/WSJ-PTB3/parser'
    rrp.load_parser_model(parser)
    for line in fileinput.input():
        tokens = Tree(line).tokens()
        nbest = rrp.parse(tokens)
        # Single-argument print(...) behaves the same on Python 2 and 3,
        # whereas the print statement is a syntax error on Python 3.
        print(len(nbest))
        for tree in nbest:
            print(tree.ptb_parse)
Example #14
0
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.  You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and limitations
# under the License.

if __name__ == "__main__":
    # needs to be run from the root of the repository

    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model("first-stage/DATA/EN", terms_only=True)

    tree1 = Tree("""(S1 (INTJ (UH Oh) (JJ sure) (. !)))""")

    tree2 = Tree("""(S1 (FRAG (INTJ (UH Oh) (INTJ (JJ sure))) (. !)))""")

    # Evaluate each tree against the other, in both directions.
    # Single-argument print(...) behaves the same on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print(tree1.evaluate(tree2))
    print(tree2.evaluate(tree1))
Example #15
0
from bllipparser import Tree
import fileinput

# Read one tree per input line, clear every label suffix, and print the
# resulting tree.
for line in fileinput.input():
    # Drop only the trailing newline: slicing with [:-1] would cut off the
    # final character of a last line that has no newline terminator.
    tree = Tree(line.rstrip('\n'))
    for subtree in tree.all_subtrees():
        subtree.label_suffix = ''
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(tree)
Example #16
0
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and limitations
# under the License.

if __name__ == "__main__":
    # needs to be run from the root of the repository for the parser
    # model path below to work

    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN', heads_only=True)

    tree1 = Tree('''(S1 (SQ (VBZ Swears) (NP (PRP she)) (VP (VBD
    recognized) (NP (PRP$ his) (NN voice)) (, ,) (SBAR (IN that) (S
    (NP (NNP Tim)) (VP (VBD fired)))) (, ,) ('' ') (S (S (NP (PRP It))
    (VP (VBZ 's) (NP (PRP$ my) (NN money)))) (CC and) (S (NP (PRP I))
    (VP (VBP want) (S (NP (PRP it)) (VP (POS '))))))) (. !)))''')

    # Every print below passes one pre-formatted string, which produces
    # the same output on Python 2 and 3; the print statement used before
    # is a syntax error on Python 3.
    head = tree1.head()
    print('head word of sentence: %s' % (head.token,))
    print('head tree of sentence: %s' % (head,))
    print('')

    # print all syntactic dependencies
    for governor, dependent in tree1.dependencies():
        print('dependency: %s -> %s' % (governor.token, dependent.token))
    print('')

    # demo of how to lexicalize a tree by adding the headword to the
    # label of the tree
Example #17
0
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.  You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
# License for the specific language governing permissions and limitations
# under the License.

if __name__ == "__main__":
    # needs to be run from the root of the repository

    from bllipparser import RerankingParser, Tree

    rrp = RerankingParser()
    rrp.load_parser_model('first-stage/DATA/EN', terms_only=True)

    tree1 = Tree('''(S1 (INTJ (UH Oh) (JJ sure) (. !)))''')

    tree2 = Tree('''(S1 (FRAG (INTJ (UH Oh) (INTJ (JJ sure))) (. !)))''')

    # Evaluate each tree against the other, in both directions.
    # Single-argument print(...) behaves the same on Python 2 and 3,
    # whereas the print statement is a syntax error on Python 3.
    print(tree1.evaluate(tree2))
    print(tree2.evaluate(tree1))
Example #18
0
    def test_tree_basics(self):
        """Walk through the read-only Tree API on the sample tree."""
        # Renderings reused by several of the expectations below.
        object_np = ('(NP (DT a) (ADJP (RB fairly) (JJ simple)) '
                     '(NN parse) (NN tree))')
        verb_phrase = '(VP (VBZ is) ' + object_np + ')'
        sentence = '(S (NP (DT This)) ' + verb_phrase + ' (. .))'

        tree = Tree(sample_tree)
        assert str(tree) == sample_tree
        assert tree.pretty_string() == sample_tree_pretty
        assert tree.tokens() == (
            'This', 'is', 'a', 'fairly', 'simple', 'parse', 'tree', '.')
        assert tree.tags() == (
            'DT', 'VBZ', 'DT', 'RB', 'JJ', 'NN', 'NN', '.')
        assert tree.tokens_and_tags() == [
            ('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('fairly', 'RB'),
            ('simple', 'JJ'), ('parse', 'NN'), ('tree', 'NN'), ('.', '.'),
        ]
        assert tree.span() == (0, 8)
        assert tree.label == 'S1'

        # The root has exactly one child: the S node.
        children = tree.subtrees()
        assert len(children) == 1
        assert str(children[0]) == sentence
        assert children[0].label == 'S'
        assert str(children[0][0]) == '(NP (DT This))'
        assert children[0][0].label == 'NP'
        assert children[0][0].span() == (0, 1)
        assert children[0][0].tags() == ('DT',)
        assert children[0][0].tokens() == ('This',)
        assert str(children[0][0][0]) == '(DT This)'
        assert children[0][0][0].token == 'This'
        assert children[0][0][0].label == 'DT'
        assert tree[0][0][0].is_preterminal()
        assert len(tree[0]) == 3

        # Iterating a node yields its children in order.
        child_iter = iter(tree[0])
        assert str(next(child_iter)) == '(NP (DT This))'
        assert str(next(child_iter)) == verb_phrase
        assert str(next(child_iter)) == '(. .)'

        # (is_preterminal, rendering) for every node, in the order that
        # all_subtrees() is expected to produce them.
        expected_pairs = [
            (False, sample_tree),
            (False, sentence),
            (False, '(NP (DT This))'),
            (True, '(DT This)'),
            (False, verb_phrase),
            (True, '(VBZ is)'),
            (False, object_np),
            (True, '(DT a)'),
            (False, '(ADJP (RB fairly) (JJ simple))'),
            (True, '(RB fairly)'),
            (True, '(JJ simple)'),
            (True, '(NN parse)'),
            (True, '(NN tree)'),
            (True, '(. .)'),
        ]
        actual_pairs = []
        for subtree in tree.all_subtrees():
            actual_pairs.append((subtree.is_preterminal(), str(subtree)))
        assert expected_pairs == actual_pairs

        # index into a preterminal
        with self.assertRaises(IndexError):
            tree[0][0][0][0]
        # index a child that doesn't exist
        with self.assertRaises(IndexError):
            tree[500]
        with self.assertRaises(IndexError):
            tree[0][0][7777]
        with self.assertRaises(IndexError):
            tree[-30]

        # repr shouldn't crash, but we don't check (or rely on) its form
        for node in (tree, tree[0], tree[0][1], tree[0][1][0]):
            repr(node)
Example #19
0
    def test_tree_basics(self):
        """Walk through the read-only Tree API on the sample tree."""
        # Renderings reused by several of the expectations below.
        object_np = ('(NP (DT a) (ADJP (RB fairly) (JJ simple)) '
                     '(NN parse) (NN tree))')
        verb_phrase = '(VP (VBZ is) ' + object_np + ')'
        sentence = '(S (NP (DT This)) ' + verb_phrase + ' (. .))'

        tree = Tree(sample_tree)
        assert str(tree) == sample_tree
        assert tree.pretty_string() == sample_tree_pretty
        assert tree.tokens() == (
            'This', 'is', 'a', 'fairly', 'simple', 'parse', 'tree', '.')
        assert tree.tags() == (
            'DT', 'VBZ', 'DT', 'RB', 'JJ', 'NN', 'NN', '.')
        assert tree.tokens_and_tags() == [
            ('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('fairly', 'RB'),
            ('simple', 'JJ'), ('parse', 'NN'), ('tree', 'NN'), ('.', '.'),
        ]
        assert tree.span() == (0, 8)
        assert tree.label == 'S1'

        # The root has exactly one child: the S node.
        children = tree.subtrees()
        assert len(children) == 1
        assert str(children[0]) == sentence
        assert children[0].label == 'S'
        assert str(children[0][0]) == '(NP (DT This))'
        assert children[0][0].label == 'NP'
        assert children[0][0].span() == (0, 1)
        assert children[0][0].tags() == ('DT',)
        assert children[0][0].tokens() == ('This',)
        assert str(children[0][0][0]) == '(DT This)'
        assert children[0][0][0].token == 'This'
        assert children[0][0][0].label == 'DT'
        assert tree[0][0][0].is_preterminal()
        assert len(tree[0]) == 3

        # Iterating a node yields its children in order.
        child_iter = iter(tree[0])
        assert str(next(child_iter)) == '(NP (DT This))'
        assert str(next(child_iter)) == verb_phrase
        assert str(next(child_iter)) == '(. .)'

        # (is_preterminal, rendering) for every node, in the order that
        # all_subtrees() is expected to produce them.
        expected_pairs = [
            (False, sample_tree),
            (False, sentence),
            (False, '(NP (DT This))'),
            (True, '(DT This)'),
            (False, verb_phrase),
            (True, '(VBZ is)'),
            (False, object_np),
            (True, '(DT a)'),
            (False, '(ADJP (RB fairly) (JJ simple))'),
            (True, '(RB fairly)'),
            (True, '(JJ simple)'),
            (True, '(NN parse)'),
            (True, '(NN tree)'),
            (True, '(. .)'),
        ]
        actual_pairs = []
        for subtree in tree.all_subtrees():
            actual_pairs.append((subtree.is_preterminal(), str(subtree)))
        assert expected_pairs == actual_pairs

        # index into a preterminal
        with self.assertRaises(IndexError):
            tree[0][0][0][0]
        # index a child that doesn't exist
        with self.assertRaises(IndexError):
            tree[500]
        with self.assertRaises(IndexError):
            tree[0][0][7777]
        with self.assertRaises(IndexError):
            tree[-30]

        # repr shouldn't crash, but we don't check (or rely on) its form
        for node in (tree, tree[0], tree[0][1], tree[0][1][0]):
            repr(node)
Example #20
0
    """Yields all labeled span in the tree: (start, end, label)."""
    for subtree in tree.all_subtrees():
        start, end = subtree.span()
        yield (start, end, subtree.label)

def strip_function_tags(tree):
    """Blank the ``label_suffix`` of every node that
    ``tree.all_subtrees()`` yields; the tree is modified in place."""
    nodes = tree.all_subtrees()
    for node in nodes:
        setattr(node, 'label_suffix', '')

if __name__ == "__main__":
    from bllipparser import Tree

    tree = Tree('''(S1 (S (VP (VBZ Swears) (SBAR (SBAR (S (NP-SBJ
    (PRP she)) (VP (VBD recognized) (NP (PRP$ his) (NN voice))))) (, ,)
    (SBAR (IN that) (S (NP-SBJ (NNP Tim)) (VP (VBD fired) (, ,) ('' ')
    (S (S (NP-SBJ (PRP It)) (VP (VBZ 's) (NP-PRD (PRP$ my) (NN money))))
    (CC and) (S (NP-SBJ (PRP I)) (VP (VBP want) (NP (PRP it)))))
    ('' ')))))) (. !)))''')

    # Every print below passes a single value, which produces the same
    # output on Python 2 and 3; the print statement used before is a
    # syntax error on Python 3.
    for labeled_span in labeled_spans(tree):
        print(labeled_span)

    print('---')
    for subtree in tree.all_subtrees():
        print('%s %s' % (is_prepreterminal(subtree), subtree))
    print('---')
    # Show the same listing again once the function tags are removed.
    strip_function_tags(tree)
    for subtree in tree.all_subtrees():
        print('%s %s' % (is_prepreterminal(subtree), subtree))
Example #21
0
def _best_similarity(titles, svo_inp, rrp, extractor, scorer, floor):
    """Return the highest similarity above ``floor`` found among ``titles``.

    A title is scored only when one of the input's subjects (or its
    singular form) occurs among the title's lower-cased tokens. The title
    is then parsed and compared with ``svo_inp`` twice: once as extracted
    and once with its subject and object swapped. The larger of the two
    scores counts. ``floor`` is returned unchanged when no title beats it.
    """
    # Subject entries whose third field is zero are never matched.
    subjects = [subj for subj in svo_inp['subject'] if subj[2] != 0]
    best = floor
    if not subjects:
        return best

    for title in titles:
        # Tokenize once per title rather than once per subject.
        words = [x.lower() for x in sentence_tokenizer.tokenize(title)]
        for subj in subjects:
            if subj[0] not in words and singularize(subj[0]) not in words:
                continue
            tree_data = Tree(rrp.simple_parse(title))
            svo_data = extractor.getSVO(tree_data[0])

            direct = scorer.get_similarities(svo_inp, svo_data)
            # Object and subject are swapped to provide more possible
            # comparisons.
            svo_data['subject'], svo_data['object'] = (
                svo_data['object'], svo_data['subject'])
            swapped = scorer.get_similarities(svo_inp, svo_data)

            score = direct if direct > swapped else swapped
            if score > best:
                best = score
            # Only the first subject that matches this title is scored.
            break
    return best


def main(transcript):
    """Score every sentence of ``transcript`` against stored news titles.

    Each sentence is compared with the titles of the ``satirical_news``
    and ``reliable_news`` tables. The result maps the sentence index (as
    a string) to a rounded similarity string: positive when the best
    match is a reliable title, negated when it is a satirical one, and
    "0" when nothing matched. The mapping is printed as JSON.
    """
    CONS_SATIRIC = 0
    CONS_RELIABLE = 1

    results = {}
    sentences = sent_tokenize(transcript)

    rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=False)
    extractor = TripletExtraction()
    scorer = SemanticSimilarityAnalysis()

    # The two database tables the input is compared against.
    # NOTE(review): `c` is a module-level name, presumably a DB cursor.
    c.execute('SELECT title FROM reliable_news')
    reliable_news = [tup[0] for tup in c.fetchall()]

    c.execute('SELECT title FROM satirical_news')
    satirical_news = [tup[0] for tup in c.fetchall()]

    # Remove the `satiric_shits` substrings from the reliable titles once,
    # instead of repeating the work for every sentence.
    cleaned_reliable = []
    for title in reliable_news:
        for sht in satiric_shits:
            title = title.replace(sht, "")
        cleaned_reliable.append(title)

    for i, inp in enumerate(sentences):
        # Generate the tree and get the SVO of the input sentence.
        tree_inp = Tree(rrp.simple_parse(inp))
        svo_inp = extractor.getSVO(tree_inp[0])

        classification = -1

        # Satirical titles are scored first; any score above zero wins.
        satirical_best = _best_similarity(
            satirical_news, svo_inp, rrp, extractor, scorer, 0)
        if satirical_best > 0:
            classification = CONS_SATIRIC

        # A reliable title only takes over when it strictly beats the
        # best satirical score.
        max_similarity = _best_similarity(
            cleaned_reliable, svo_inp, rrp, extractor, scorer,
            satirical_best)
        if max_similarity > satirical_best:
            classification = CONS_RELIABLE

        if classification == CONS_RELIABLE:
            results[str(i)] = str(round(max_similarity, 4))
        elif classification == CONS_SATIRIC:
            results[str(i)] = str(round(-max_similarity, 4))
        else:
            results[str(i)] = "0"
    print(json.dumps(results))
Example #22
0
        break
    if good:
      new_nbest.append(t)
  return new_nbest


if __name__ == '__main__':
  # Two modes: with only a gold file, print the traversal of every gold
  # tree; with an n-best file as well, print the evaluation counts and
  # traversal of each n-best candidate left after remove_duplicates().
  arg_count = len(sys.argv)
  if arg_count not in (3, 4):
    print('usage: python traversal.py vocab.gz gold.gz [nbest.gz]')
    sys.exit(0)

  words = read_vocab(sys.argv[1])
  if arg_count == 3:
    for line in open_file(sys.argv[2]):
      stripped = line[:-1]
      print(ptb(stripped, words))
  else:
    rrp = RerankingParser()
    parser = 'wsj/WSJ-PTB3/parser'
    rrp.load_parser_model(parser)
    gold_lines = open_file(sys.argv[2])
    nbest_lists = generate_nbest(open_file(sys.argv[3]))
    for gold_line, candidates in zip(gold_lines, nbest_lists):
      for candidate in candidates:
        candidate['seq'] = ptb(candidate['ptb'], words)
      unique = remove_duplicates(candidates)
      gold_tree = Tree(gold_line)
      print(len(unique))
      for kept in unique:
        scores = Tree(kept['ptb']).evaluate(gold_tree)
        print(scores['gold'], scores['test'], scores['matched'])
        print(kept['seq'])
Example #23
0
def ptb(line, words):
  """Parse ``line`` as a tree and return the forms gathered from its first
  subtree, joined by single spaces and wrapped in one leading and one
  trailing space."""
  root_child = Tree(line).subtrees()[0]
  collected = []
  # NOTE(review): ptb_recurse presumably appends to `collected`; its
  # definition is not shown here.
  ptb_recurse(root_child, words, collected)
  joined = ' '.join(collected)
  return ' ' + joined + ' '
Example #24
0
    for tree_span in sorted(tree1_spans | tree2_spans):
        words = ' '.join(tokens[tree_span[0]:tree_span[1]])
        marker = ' '
        if tree_span not in tree2_spans:
            marker = '-'
        elif tree_span not in tree1_spans:
            marker = '+'
        elif not show_all:
            continue
        print marker, tree_span, words


if __name__ == "__main__":
    from bllipparser import Tree

    # Two parses of the same sentence; the second one carries suffixed
    # labels such as NP-SBJ and NP-PRD.
    first_parse = '''(S1 (SQ (VBZ Swears) (NP (PRP she)) (VP (VBD
    recognized) (NP (PRP$ his) (NN voice)) (, ,) (SBAR (IN that) (S
    (NP (NNP Tim)) (VP (VBD fired)))) (, ,) ('' ') (S (S (NP (PRP It))
    (VP (VBZ 's) (NP (PRP$ my) (NN money)))) (CC and) (S (NP (PRP I))
    (VP (VBP want) (S (NP (PRP it)) (VP (POS '))))))) (. !)))'''

    second_parse = '''(S1 (S (VP (VBZ Swears) (SBAR (SBAR (S (NP-SBJ
    (PRP she)) (VP (VBD recognized) (NP (PRP$ his) (NN voice))))) (, ,)
    (SBAR (IN that) (S (NP-SBJ (NNP Tim)) (VP (VBD fired) (, ,) ('' ')
    (S (S (NP-SBJ (PRP It)) (VP (VBZ 's) (NP-PRD (PRP$ my) (NN money))))
    (CC and) (S (NP-SBJ (PRP I)) (VP (VBP want) (NP (PRP it)))))
    ('' ')))))) (. !)))'''

    tree1 = Tree(first_parse)
    tree2 = Tree(second_parse)

    # show_all=True also lists the spans that both trees share.
    tree_diff(tree1, tree2, show_all=True)