Ejemplo n.º 1
0
    def test_labeled_nodes(self):
        '''
        Check patterns that label nodes (``=s``, ``=v``) and refer back
        to those labels in a later segment of the pattern.

        Test case from Emily M. Bender.
        '''
        search = '''
            # macros
            @ SBJ /SBJ/;
            @ VP /VP/;
            @ VB /VB/;
            @ VPoB /V[PB]/;
            @ OBJ /OBJ/;

            # 1 svo
            S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
        # sent1 has the subject before the VP, sent2 has it after the VP.
        sent1 = ParentedTree.fromstring(
            '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
        sent2 = ParentedTree.fromstring(
            '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
        # Everything before the first blank line (the macro definitions),
        # followed by the same pattern without labels or back-references.
        macro_section = search.split('\n\n')[0]
        search_firsthalf = macro_section + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
        search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

        def positions(pattern, sentence):
            # Materialise the per-tree position lists for a single sentence.
            return list(tgrep.tgrep_positions(pattern, [sentence]))

        self.assertTrue(positions(search_firsthalf, sent1)[0])
        self.assertTrue(positions(search, sent1)[0])
        self.assertTrue(positions(search_rewrite, sent1)[0])
        self.assertEqual(positions(search, sent1),
                         positions(search_rewrite, sent1))

        self.assertTrue(positions(search_firsthalf, sent2)[0])
        self.assertFalse(positions(search, sent2)[0])
        self.assertFalse(positions(search_rewrite, sent2)[0])
        self.assertEqual(positions(search, sent2),
                         positions(search_rewrite, sent2))
Ejemplo n.º 2
0
def lappinleasse(parsetree, i):
    """Resolve the pronouns of one parsed sentence against ``entitySet``.

    Every ``NP`` subtree of ``parsetree`` is handled by the label of its
    first child:

    * contains ``PRP`` (pronoun): unless it is an ``it`` that
      ``ispleonastic`` accepts, the agreeing entity with the highest
      salience whose ``sentencenum`` is at least ``i - 4`` becomes the
      referent. The referent absorbs the pronoun's salience and gender,
      records the pronoun in its ``phrases``, and the pronoun node is
      replaced in place by a ``SUB`` node carrying the referent's name.
    * ``EX``: skipped.
    * anything else: added to ``entitySet`` as a new ``Entity``.

    ``halve()`` is called once after all noun phrases were visited.

    Args:
        parsetree: parse tree of the sentence; modified in place.
        i: number of the sentence, passed on to ``Entity``.
    """
    global entitySet
    for np in parsetree.subtrees(lambda x: x.label() == 'NP'):
        if 'PRP' in np[0].label():
            if np[0,0].lower() == 'it' and ispleonastic(np, parsetree): continue
            # Remember the pronoun text up front so that every
            # "no substitution" message can report it.
            orig = np[0,0]
            maxsalience = -1
            referent = None
            e = Entity(np, parsetree, i)
            for entity in entitySet:
                if entity.sentencenum >= i - 4 and e.agreeswith(entity) and maxsalience < entity.salience:
                    maxsalience = entity.salience
                    referent = entity
            if referent is None:
                print('No substitution found for ', orig)
                continue
            try:
                referent.salience += e.salience
                referent.gender = e.gender
                referent.phrases.add(orig + str(i))
                if np[0].label() == 'PRP$':
                    np[0] = ParentedTree.fromstring('(SUB <'+ referent.name + "'s>)")
                    print('PRP$ substitution', orig, '-->', referent.name)
                else:
                    np[0] = ParentedTree.fromstring('(SUB <' + referent.name + '>)')
                    print('PRP substitution', orig, '-->', referent.name)
            except Exception:
                # Substitution stays best-effort: a referent that cannot be
                # applied (e.g. a name that does not parse as a tree) is
                # reported and the sentence processing goes on.
                print('No substitution found for ', orig)
                continue

        elif np[0].label() == 'EX': continue
        else: entitySet.add(Entity(np, parsetree, i))
    halve()
Ejemplo n.º 3
0
def test_exact_match():
    """Check search_by_exact_string_matching for a single word and a phrase."""
    sentence = ParentedTree.fromstring(
        '(S (NP (DT the) (JJ big) (NN cat)) (VP bit) (NP (DT a) (NN cat)))')

    # 'cat' occurs twice in the sentence
    word_hits = search_by_exact_string_matching(sentence, 'cat')
    assert_equal(len(word_hits), 2)
    assert_equal(word_hits[0], ParentedTree.fromstring('(NN cat)'))

    # 'a cat' occurs once
    phrase_hits = search_by_exact_string_matching(sentence, 'a cat')
    assert_equal(len(phrase_hits), 1)
    assert_equal(phrase_hits[0], ParentedTree.fromstring('(NP (DT a) (NN cat))'))
Ejemplo n.º 4
0
def merge_tree_nnps(tree):
    """
    Takes a parse tree and merges any consecutive leaf nodes that come from NNPs
    For example if there is a segment of:
        (NP
            (JJ old)
            (NNP Pierre)
            (NNP Vinken)
        )
    Returns:
        (NP
            (JJ old)
            (NNP PierreVinken)
        )

    The merging itself is delegated to ``merge_tagged_nnps``, which receives
    the children of each height-3 subtree as ``(POS, word)`` pairs. The input
    tree is converted first; the result is returned as a plain ``Tree``.
    A tree whose root itself has height 3 is handled as well.
    """

    # require a parented tree to get a subtrees tree position
    p = ParentedTree.convert(tree)

    # Subtrees of height 3 are where NP's leading to NNP's leading to
    # lexicalizations will be. Collect them before replacing anything so the
    # tree is never modified while it is being traversed.
    candidates = list(p.subtrees(filter=lambda s: s.height() == 3))
    for s in candidates:
        # merge NNP's in the list representation of this trees children: [(POS, word), ...]
        new_noun_phrase = merge_tagged_nnps([(c.label(), c[0]) for c in s])
        child_str = " ".join("(%s %s)" % (pos, word) for pos, word in new_noun_phrase)
        # create new subtree with merged NNP's
        new_s = ParentedTree.fromstring("(%s %s)" % (s.label(), child_str))

        position = s.treeposition()
        if position:
            # replace old subtree with new subtree
            p[position] = new_s
        else:
            # The root has no parent slot to assign into (NLTK rejects
            # assignment to position ()), so the new subtree becomes the tree.
            p = new_s
    return Tree.convert(p)
Ejemplo n.º 5
0
 def test_node_nocase(self):
     '''
     Check node selection with and without the ``i@`` prefix on a
     quoted node name.
     '''
     tree = ParentedTree.fromstring('(S (n x) (N x))')

     exact_case = list(tgrep.tgrep_positions('"N"', [tree]))
     self.assertEqual(exact_case, [[(1,)]])

     any_case = list(tgrep.tgrep_positions('i@"N"', [tree]))
     self.assertEqual(any_case, [[(0,), (1,)]])
Ejemplo n.º 6
0
def getConsituentTreeDistribution(core_nlp_files):
    """Count grammar productions in a set of parsed files.

    Every file must consist of exactly one line holding the literal of a
    dict. Its ``SENTENCES`` entry is iterated and the ``PARSE_TREE`` entry of
    each sentence is parsed as a bracketed tree whose productions are counted.

    Args:
        core_nlp_files: iterable of ``(genre_file_path, genre_file_name)``
            pairs.

    Returns:
        A pair ``(production_dict_for_files, diff_productions)``:
        ``production_dict_for_files`` maps each file name (with
        ``'_corenlp1000.txt'`` replaced by ``'.txt'``) to a dict of
        production -> count for that file; ``diff_productions`` maps every
        production to its count over all files. Counts are floats.
    """
    # Local import keeps this function self-contained.
    import ast

    diff_productions = dict()
    production_dict_for_files = dict()
    for genre_file_path, genre_file_name in core_nlp_files:
        production_dict = dict()
        with open(genre_file_path) as f:
            lines = f.readlines()
        assert len(lines) == 1
        # The line is data, not code: evaluate it as a literal instead of
        # exec()-ing it. exec() would run arbitrary code from the file and,
        # on Python 3, cannot rebind a local variable of this function.
        dictionary = ast.literal_eval(lines[0].strip())
        for sent in dictionary[SENTENCES]:
            t = ParentedTree.fromstring(sent[PARSE_TREE])
            for prod in t.productions():
                diff_productions[prod] = diff_productions.get(prod, 0.0) + 1.0
                production_dict[prod] = production_dict.get(prod, 0.0) + 1.0
        production_dict_for_files[genre_file_name.replace('_corenlp1000.txt', '.txt')] = production_dict
    return production_dict_for_files, diff_productions
Ejemplo n.º 7
0
 def test_rel_precedence(self):
     '''
     Check the precedence operators ``.``, ``..``, ``,`` and ``,,``
     against the nodes X and Y of one fixed tree.
     '''
     tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))'
                                    ' (VP (AP (X (PP x)) (Y (AP x))))'
                                    ' (NP (RC (NP (AP x)))))')
     # (pattern, expected positions) pairs, checked in this order
     expectations = [
         ('* . X', [[(0,), (0, 1), (0, 1, 0)]]),
         ('* . Y', [[(1, 0, 0), (1, 0, 0, 0)]]),
         ('* .. X', [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]]),
         ('* .. Y', [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0),
                      (1, 0, 0), (1, 0, 0, 0)]]),
         ('* , X', [[(1, 0, 1), (1, 0, 1, 0)]]),
         ('* , Y', [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
         ('* ,, X', [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0),
                      (2, 0, 0, 0)]]),
         ('* ,, Y', [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]),
     ]
     for pattern, expected in expectations:
         found = list(tgrep.tgrep_positions(pattern, [tree]))
         self.assertEqual(found, expected)
def findSentencePTreeToken(sentence, keyword):
	"""Collect the parent node of every leaf that matches ``keyword``.

	``sentence`` is parsed with ``proc.parse_doc``; the ``parse`` string of
	each resulting sentence is turned into a ParentedTree. A leaf matches
	when ``_stem_(leaf)`` equals ``_lemma_(keyword)``. The returned list
	holds the matching leaves' parent subtrees from all sentences.
	"""
	import nltk
	from nltk.tree import ParentedTree
	target = _lemma_(keyword)

	parsed = proc.parse_doc(sentence)
	matches = []
	for sentence_info in parsed['sentences']:
		ptree = ParentedTree.fromstring(sentence_info['parse'])

		for leaf_index in range(len(ptree.leaves())):
			leaf_position = ptree.leaf_treeposition(leaf_index)
			if _stem_(ptree[leaf_position]) != target:
				continue
			# a leaf position without its last index addresses the parent
			matches.append(ptree[leaf_position[:-1]])
	return matches
Ejemplo n.º 9
0
 def test_use_macros(self):
     '''
     Check that macros defined in a pattern can be used in it, and that
     using an undefined macro raises ``TgrepException``.
     '''
     tree = ParentedTree.fromstring(
         '(VP (VB sold) (NP (DET the) '
         '(NN heiress)) (NP (NN deed) (PREP to) '
         '(NP (DET the) (NN school) (NN house))))'
     )

     defined_only = '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN'
     found = list(tgrep.tgrep_positions(defined_only, [tree]))
     self.assertEqual(found, [[(1,), (2, 2)]])

     # use undefined macro @CNP; the search object is created up front and
     # only consuming it is expected to raise
     with_undefined = '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN'
     pending = tgrep.tgrep_positions(with_undefined, [tree])
     with self.assertRaises(tgrep.TgrepException):
         list(pending)
Ejemplo n.º 10
0
 def disfile2tree(dis_filepath):
     """Read a *.dis file and return its content as a ParentedTree (NLTK).

     The stripped file content is passed through
     ``fix_rst_treebank_tree_str`` and ``convert_parens_in_rst_tree_str``
     before it is parsed.
     """
     with open(dis_filepath) as dis_file:
         raw_tree_str = dis_file.read().strip()
     fixed_tree_str = fix_rst_treebank_tree_str(raw_tree_str)
     parseable_tree_str = convert_parens_in_rst_tree_str(fixed_tree_str)
     return ParentedTree.fromstring(parseable_tree_str)
Ejemplo n.º 11
0
 def test_node_printing(self):
     '''Check that prefixing a pattern with the print operator (a single
     quote) yields the same positions as the bare pattern.'''
     tree = ParentedTree.fromstring('(S (n x) (N x))')
     # (bare pattern, same pattern with the print operator in front)
     pattern_pairs = (
         ('N', '\'N'),
         ('/[Nn]/', '\'/[Nn]/'),
     )
     for bare, printed in pattern_pairs:
         bare_positions = list(tgrep.tgrep_positions(bare, [tree]))
         printed_positions = list(tgrep.tgrep_positions(printed, [tree]))
         self.assertEqual(bare_positions, printed_positions)
Ejemplo n.º 12
0
 def test_node_regex(self):
     '''
     Check node selection with a regular expression pattern.
     '''
     tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
     # /^NP/ selects every node whose name starts with NP, which
     # includes NP-SBJ
     starts_with_np = list(tgrep.tgrep_positions('/^NP/', [tree]))
     self.assertEqual(starts_with_np, [[(0,), (1,)]])
Ejemplo n.º 13
0
 def test_bad_operator(self):
     '''
     Check that a pattern with the undefined operator ``>>>`` raises
     ``TgrepException`` once its results are consumed.
     '''
     tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
     pending = tgrep.tgrep_positions('* >>> S', [tree])
     with self.assertRaises(tgrep.TgrepException):
         list(pending)
Ejemplo n.º 14
0
 def test_bad_operator(self):
     '''
     Check the error raised for an operator tgrep does not define.
     '''
     tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')
     # The search is created first; TgrepException is expected only when
     # the results are collected into a list.
     bad_search = tgrep.tgrep_positions('* >>> S', [tree])
     with self.assertRaises(tgrep.TgrepException):
         list(bad_search)
Ejemplo n.º 15
0
 def test_node_noleaves(self):
     '''
     Check node name matching with the default arguments and with the
     search_leaves flag set to False.
     '''
     tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')

     default_search = list(tgrep.tgrep_positions('x', [tree]))
     self.assertEqual(default_search, [[(0, 0, 0), (1, 0, 0)]])

     no_leaf_search = list(tgrep.tgrep_positions('x', [tree], False))
     self.assertEqual(no_leaf_search, [[]])
Ejemplo n.º 16
0
 def test_node_quoted(self):
     '''
     Check node selection by quoted node names, including names that
     contain quote and backslash characters.
     '''
     tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
     # (quoted pattern, expected positions), checked in this order
     cases = (
         ('"N"', [[()]]),
         ('"\\"N\\""', [[(0,)]]),
         ('"N\\""', [[(1,)]]),
         ('"\\"\\\\\\""', [[(2,)]]),
     )
     for pattern, expected in cases:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
    def addSentence(sentence):
        """Parse ``sentence`` via ``nlp.annotate``, append the leaves of its
        first parse to ``le`` and pass the tree to ``assignPhrases``."""
        request_properties = {
            'annotators': 'parse',
            'outputFormat': 'json'
        }
        annotation = nlp.annotate(sentence, properties=request_properties)

        # only the first sentence of the annotation is used
        first_parse = annotation['sentences'][0]['parse']
        tr = ParentedTree.fromstring(first_parse)
        le.append(tr.leaves())
        assignPhrases(tr)
def vertical_imbalance(furcation_node_dict):
    """Return the largest standard deviation of child heights.

    Each item obtained by iterating ``furcation_node_dict`` is parsed as a
    bracketed tree string; the standard deviation of the heights of its
    children is computed and the largest value seen is returned. The
    result is 0 when no value exceeds 0.
    """
    largest_sd = 0
    for tree_str in furcation_node_dict:
        parsed = ParentedTree.fromstring(tree_str)
        heights = numpy.array([child.height() for child in parsed])
        spread = numpy.std(heights)
        largest_sd = spread if spread > largest_sd else largest_sd
    return largest_sd
def horizontal_imbalance(furcation_node_dict):
    """Return the largest standard deviation of child widths.

    Each item obtained by iterating ``furcation_node_dict`` is parsed as a
    bracketed tree string; the width of a child is its number of leaves.
    The largest standard deviation of widths seen is returned, or 0 when
    no value exceeds 0.
    """
    largest_sd = 0
    for tree_str in furcation_node_dict:
        parsed = ParentedTree.fromstring(tree_str)
        widths = numpy.array([len(child.leaves()) for child in parsed])
        spread = numpy.std(widths)
        largest_sd = spread if spread > largest_sd else largest_sd
    return largest_sd
Ejemplo n.º 20
0
def get_ptree(sent_df):
    """Build a ParentedTree from the rows of ``sent_df``.

    The ``parse`` value of every row is a tree fragment in which ``*`` is
    replaced by the row's ``token_id`` (padded with spaces), so the token
    identifiers end up as the leaves of the tree.
    """
    fragments = []
    for token in sent_df.to_dict('records'):
        leaf = f" {token['token_id']} "
        fragments.append(token["parse"].replace("*", leaf))
    return ParentedTree.fromstring("".join(fragments))
Ejemplo n.º 21
0
def test_regexp_search():
    """Check search_by_tree_regexp with a nested and with a flat pattern."""
    tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))')

    # NP made of "the", "big" and a noun with an unconstrained child
    np_children = [
        TreeRegexp('DT', ['the']),
        TreeRegexp('JJ', ['big']),
        TreeRegexp('NN', [MatchAllNode()]),
    ]
    np_hits = search_by_tree_regexp(tree, TreeRegexp('NP', np_children))
    assert_equal(len(np_hits), 1)
    assert_equal(np_hits[0],
                 ParentedTree.fromstring('(NP (DT the) (JJ big) (NN dog))'))

    # any NN node, whatever its child is
    nn_hits = search_by_tree_regexp(tree, TreeRegexp('NN', [MatchAllNode()]))
    assert_equal(len(nn_hits), 2)
    assert_equal(nn_hits[0], ParentedTree.fromstring('(NN dog)'))
    assert_equal(nn_hits[1], ParentedTree.fromstring('(NN cat)'))
Ejemplo n.º 22
0
def analyze_s_expression() -> None:
    """Print the leaves of every noun phrase (NP) in ../data/nlp.txt.xml.

    Each ``parse`` element found under ``document/sentences/sentence`` is
    read as an S-expression; one line is printed per NP subtree.
    """
    document = ET.parse("../data/nlp.txt.xml")
    for parse_element in document.iterfind("./document/sentences/sentence/parse"):
        # Build a tree from the S-expression string
        tree = ParentedTree.fromstring(parse_element.text)
        # For every noun phrase, print all of its leaves
        noun_phrases = (sub for sub in tree.subtrees() if sub.label() == "NP")
        for noun_phrase in noun_phrases:
            words = list(noun_phrase.leaves())
            print(" ".join(words))
Ejemplo n.º 23
0
 def test_node_noleaves(self):
     '''
     Check matching of the leaf name 'x' with the default arguments and
     with the search_leaves flag set to False.
     '''
     tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))')

     leaves_included = list(tgrep.tgrep_positions('x', [tree]))
     self.assertEqual(leaves_included, [[(0, 0, 0), (1, 0, 0)]])

     leaves_excluded = list(tgrep.tgrep_positions('x', [tree], False))
     self.assertEqual(leaves_excluded, [[]])
Ejemplo n.º 24
0
 def test_node_quoted(self):
     '''
     Check that quoted node names select the expected nodes, including
     names containing quote and backslash characters.
     '''
     tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))')
     # expected positions keyed by quoted pattern, checked in this order
     expected_by_pattern = [
         ('"N"', [[()]]),
         ('"\\"N\\""', [[(0,)]]),
         ('"N\\""', [[(1,)]]),
         ('"\\"\\\\\\""', [[(2,)]]),
     ]
     for pattern, expected in expected_by_pattern:
         found = list(tgrep.tgrep_positions(pattern, [tree]))
         self.assertEqual(found, expected)
Ejemplo n.º 25
0
 def test_node_regex(self):
     '''
     Check node selection with a regular expression pattern.
     '''
     tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))')
     # The expression /^NP/ selects any node whose name starts with NP,
     # NP-SBJ included.
     matched = list(tgrep.tgrep_positions('/^NP/', [tree]))
     expected = [[(0, ), (1, )]]
     self.assertEqual(matched, expected)
 def test_rel_sister_nodes(self):
     '''
     Test matching sister nodes in a tree.
     '''
     tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
     # tgrep_positions is called with the pattern first and a list of
     # trees second, and its result is materialised with list(): one list
     # of positions per tree, as in the other tgrep tests of this file.
     self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])),  [[(0,)]])
     self.assertEqual(list(tgrep.tgrep_positions('* $.. B', [tree])), [[(0,)]])
     self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])),  [[(2,)]])
     self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])), [[(2,)]])
     self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])),   [[(0,), (2,)]])
def convert_tree_json(input_json):
    '''
    Convert the JSON from the RST parser into a format for D3.js.

    The tree string of the first entry in "scored_rst_trees" is parsed
    into a ParentedTree, every token list in "edu_tokens" is joined with
    spaces, and both are handed to convert_tree_json_helper, whose result
    is returned.
    '''
    first_tree_str = input_json["scored_rst_trees"][0]["tree"]
    tree = ParentedTree.fromstring(first_tree_str)

    edus = []
    for tokens in input_json["edu_tokens"]:
        edus.append(' '.join(tokens))

    return convert_tree_json_helper(tree, edus)
def main():
    """Print the parse actions of every tree in an mrg file.

    The file path is read from the command line; each line of the file is
    parsed as one constituent tree and its actions are printed on one
    line as space-separated ``type:label`` items.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("mrg_path", help="a file with constituent trees in mrg format.")
    mrg_path = parser.parse_args().mrg_path

    with open(mrg_path) as constituent_file:
        for line in constituent_file:
            actseq = extract_parse_actions(ParentedTree.fromstring(line.strip()))
            rendered = ["{}:{}".format(action.type, action.label) for action in actseq]
            print(" ".join(rendered))
Ejemplo n.º 29
0
 def test_rel_sister_nodes(self):
     """
     Check the sister operators ``$.``, ``$..``, ``$,``, ``$,,`` and ``$``
     relative to node B of a three-child tree.
     """
     tree = ParentedTree.fromstring("(S (A x) (B x) (C x))")
     # (pattern, expected positions), checked in this order
     cases = (
         ("* $. B", [[(0,)]]),
         ("* $.. B", [[(0,)]]),
         ("* $, B", [[(2,)]]),
         ("* $,, B", [[(2,)]]),
         ("* $ B", [[(0,), (2,)]]),
     )
     for pattern, expected in cases:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])), expected)
Ejemplo n.º 30
0
def read_and_prep_file(filename):
    """Load a file of parenthesized expressions as NLTK parented trees.

    Every line of the file is stripped of surrounding whitespace and the
    lines are joined into one string. ``SExprTokenizer`` (the NLTK tool for
    finding parenthesized expressions) splits that string, and each piece
    is parsed with ``ParentedTree.fromstring``.

    Args:
        filename (str): a filename with full path to be loaded.

    Returns:
        list: one ParentedTree per parenthesized expression in the file.
    """
    tokenizer = SExprTokenizer()
    with open(filename, 'r') as f:
        flattened = ''.join(line.strip() for line in f)
    expressions = tokenizer.tokenize(flattened)
    return [ParentedTree.fromstring(expression) for expression in expressions]
Ejemplo n.º 31
0
def convert_tree_json(input_json):
    '''
    Convert the JSON from the RST parser into a format for D3.js.

    Uses the tree string of the first "scored_rst_trees" entry and the
    space-joined token lists of "edu_tokens"; the actual conversion is
    done by convert_tree_json_helper.
    '''
    edu_texts = list(map(' '.join, input_json["edu_tokens"]))
    return convert_tree_json_helper(
        ParentedTree.fromstring(input_json["scored_rst_trees"][0]["tree"]),
        edu_texts,
    ) if False else _convert_tree_json_in_order(input_json)


def _convert_tree_json_in_order(input_json):
    '''
    Helper that performs the conversion in the original order: parse the
    tree first, then join the EDU tokens, then call the converter.
    '''
    tree = ParentedTree.fromstring(input_json["scored_rst_trees"][0]["tree"])
    edu_texts = list(map(' '.join, input_json["edu_tokens"]))
    return convert_tree_json_helper(tree, edu_texts)
Ejemplo n.º 32
0
 def test_rel_sister_nodes(self):
     '''
     Check which sisters of node B are selected by each sister operator.
     '''
     tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')

     def sisters(pattern):
         # positions found for the pattern in the single test tree
         return list(tgrep.tgrep_positions(pattern, [tree]))

     self.assertEqual(sisters('* $. B'), [[(0,)]])
     self.assertEqual(sisters('* $.. B'), [[(0,)]])
     self.assertEqual(sisters('* $, B'), [[(2,)]])
     self.assertEqual(sisters('* $,, B'), [[(2,)]])
     self.assertEqual(sisters('* $ B'), [[(0,), (2,)]])
Ejemplo n.º 33
0
def read_txt_csv_graphs(f, warnings=True):
    """Read a file in txt.csv format, i.e. a tab-separated file with 21
    columns, convert the dependency parses to networkx graphs and the
    phrase structure trees to NLTK ParentedTrees.

    Args:
        f: input handed to ``read_txt_csv_sentences``.
        warnings: if true, log a warning for every sentence that is
            ignored.

    Yields:
        ``(graph, tree)`` pairs, one per sentence whose graph passes
        ``is_sensible_graph`` and whose tree string can be parsed.
        Other sentences are skipped.

    """
    def attributes(t):
        # node id and node attributes for one token row
        return int(t[3]), {
            "token": t[6],
            "lemma": t[7],
            "cpos": t[8],
            "pos": t[9],
            "morphology": t[11]
        }

    def escape_brackets(text):
        # literal brackets would be read as tree structure by the parser
        return text.replace("(", "-LRB-").replace(")", "-RRB-")

    sentences = read_txt_csv_sentences(f)
    for sentence in sentences:
        sentence_id = sentence[0][2]
        g = networkx.DiGraph(sentence_id=sentence_id)
        g.add_nodes_from([attributes(t) for t in sentence])
        tree_fragments = []
        for token in sentence:
            tid = int(token[3])
            gov = int(token[13])
            rel = token[14]
            if gov == -1:
                # same node access as the tsv reader (``g.nodes[...]``)
                g.nodes[tid]["root"] = "root"
            else:
                g.add_edge(gov, tid, relation=rel)
            tree_tok = escape_brackets(token[6])
            tree_pos = escape_brackets(token[9])
            # "*" in the fragment marks the slot of the token itself
            tree_fragments.append(
                token[18].replace("*", "(%s %s)" % (tree_pos, tree_tok)))
        tree_str = "".join(tree_fragments)
        sensible, explanation = is_sensible_graph(g)
        if sensible:
            try:
                tree = ParentedTree.fromstring(tree_str)
            except ValueError:
                if warnings:
                    # logging.warn is a deprecated alias of logging.warning
                    logging.warning(
                        "Failed to construct parse tree. Ignoring sentence with ID %s: %s",
                        sentence_id, tree_str)
            else:
                yield g, tree
        else:
            if warnings:
                logging.warning("%s. Ignoring sentence with ID %s.",
                                explanation, sentence_id)
	def process_raw_output(self, output, merge_results=False):
		"""Turn raw text output into a list of scored trees.

		Carriage returns and blank lines are dropped. Reading starts at the
		first line containing ``(0`` (at the first line if there is none).
		Each record is read as a bracketed tree line followed by a line whose
		space-separated fields, except the first, are passed to
		``self.probs_to_score``. The cursor then advances by the number of
		subtrees of the tree plus one. Reading stops at the first tree line
		that cannot be parsed (the error is logged via ``LOG.info``).

		As a side effect the non-blank lines are stored in the module global
		``g_lines``.

		Args:
			output: the raw text to process.
			merge_results: if true, return a single dict instead of a list.

		Returns:
			A list of ``{"score", "tree", "text"}`` dicts, or, with
			``merge_results``, one dict with the mean ``score`` (0 when
			nothing was parsed) and the space-joined ``text`` of all records.
		"""
		global g_lines
		output = output.replace('\r', '')
		lines = []
		i = -1
		for l in output.split('\n'):
			if l.strip() != '':
				if i < 0 and l.find('(0') >= 0:
					i = len(lines)
				lines.append(l)
		g_lines = lines
		if i == -1:
			i = 0
		return_value = []
		# a record needs the tree line and the line after it
		while i + 1 < len(lines):
			tree_string = lines[i].strip()
			try:
				tree = ParentedTree.fromstring(tree_string)
			except Exception as ex:
				LOG.info("got exception processing tree(%s) %s" % (tree_string, ex))
				break
			probs = re.sub(' +', ' ', lines[i+1].strip()).split(' ')[1:]
			score = self.probs_to_score(probs)
			nodes = len(list(tree.subtrees()))
			i = i + nodes+1
			sentence = ' '.join(tree.leaves())
			sentence = re.sub(" +", " ", sentence)
			# raw string: "\." is an invalid escape in a normal string literal
			sentence = re.sub(r" \.", ".", sentence)
			return_value.append({"score":score, "tree":tree, "text":sentence})
		if merge_results:
			# avoid dividing by zero when nothing was parsed
			n = len(return_value) or 1
			return_value_1 = {}
			return_value_1['score'] = sum(rv["score"] for rv in return_value)/n
			return_value_1['text'] = ' '.join(rv['text'] for rv in return_value)
			return return_value_1
		return return_value
Ejemplo n.º 35
0
    def test_labeled_nodes(self):
        '''
        Check patterns with labeled nodes and back-references to them.

        Test case from Emily M. Bender.
        '''
        search = '''
            # macros
            @ SBJ /SBJ/;
            @ VP /VP/;
            @ VB /VB/;
            @ VPoB /V[PB]/;
            @ OBJ /OBJ/;

            # 1 svo
            S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'''
        sent1 = ParentedTree.fromstring(
            '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))')
        sent2 = ParentedTree.fromstring(
            '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))')
        # macro definitions (text before the first blank line) plus the
        # pattern without labels
        macro_section, _, _ = search.partition('\n\n')
        search_firsthalf = macro_section + 'S < @SBJ < (@VP < (@VB $.. @OBJ))'
        search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. /.*OBJ/)))'

        def found(pattern, sentence):
            # per-tree position lists for one sentence
            return list(tgrep.tgrep_positions(pattern, [sentence]))

        # sentence with the subject before the verb phrase
        self.assertTrue(found(search_firsthalf, sent1)[0])
        self.assertTrue(found(search, sent1)[0])
        self.assertTrue(found(search_rewrite, sent1)[0])
        self.assertEqual(found(search, sent1), found(search_rewrite, sent1))

        # sentence with the subject after the verb phrase
        self.assertTrue(found(search_firsthalf, sent2)[0])
        self.assertFalse(found(search, sent2)[0])
        self.assertFalse(found(search_rewrite, sent2)[0])
        self.assertEqual(found(search, sent2), found(search_rewrite, sent2))
Ejemplo n.º 36
0
 def test_node_regex_2(self):
     '''
     Check anchored and unanchored regular expression patterns.
     '''
     tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')

     anchored = list(tgrep.tgrep_positions('/^SBJ/', [tree]))
     self.assertEqual(anchored, [[(0, ), (1, )]])

     # /SBJ/ selects any node whose name includes SBJ, so NP-SBJ is
     # selected as well
     unanchored = list(tgrep.tgrep_positions('/SBJ/', [tree]))
     self.assertEqual(unanchored, [[(0, ), (1, ), (2, )]])
Ejemplo n.º 37
0
def inSamePhrase(treeStr, pn, candidate):
    """Tell whether ``candidate`` occurs in the smallest phrase found for ``pn``.

    Args:
        treeStr: bracketed constituency tree string.
        pn: passed to ``findSmallestPhrase`` together with the parsed tree.
        candidate: text looked up, as a substring, in the space-joined
            leaves of that phrase.

    Returns:
        bool: True if ``candidate`` is found; False if it is not, or if
        ``treeStr`` cannot be turned into a tree.
    """
    try:
        constTree = ParentedTree.fromstring(treeStr)
    except Exception:
        # An unusable tree string means "not in the same phrase". Unlike a
        # bare except, this lets KeyboardInterrupt and SystemExit through.
        return False
    phrase = findSmallestPhrase(constTree, pn)
    return candidate in " ".join(phrase.leaves())
Ejemplo n.º 38
0
 def test_node_printing(self):
     """Check that a pattern prefixed with the print operator (a single
     quote) yields the same positions as the bare pattern."""
     tree = ParentedTree.fromstring("(S (n x) (N x))")
     # (bare pattern, same pattern with the print operator in front)
     pattern_pairs = (
         ("N", "'N"),
         ("/[Nn]/", "'/[Nn]/"),
     )
     for bare, printed in pattern_pairs:
         bare_positions = list(tgrep.tgrep_positions(bare, [tree]))
         printed_positions = list(tgrep.tgrep_positions(printed, [tree]))
         self.assertEqual(bare_positions, printed_positions)
Ejemplo n.º 39
0
def get_reason(sentence, nlp):
    """Return the joined output of ``find_reason`` for ``sentence``, or None.

    The preprocessed sentence is split at the separator characters, every
    non-empty piece is parsed with ``nlp.parse`` and turned into a
    ParentedTree, and the trees are handed to ``find_reason``. Its result
    is printed. None is returned when the preprocessed sentence or the
    result is empty.
    """
    processed = preprocess(sentence)
    if len(processed) == 0:
        return None

    separator = re.compile("[,,。,]")
    clauses = [clause for clause in separator.split(processed) if clause != ""]
    # parse all clauses first, then build all trees
    parses = [nlp.parse(clause) for clause in clauses]
    trees = [ParentedTree.fromstring(parse) for parse in parses]

    final_result = find_reason(trees)
    print(final_result)
    if len(final_result) == 0:
        return None
    return "".join(final_result)
Ejemplo n.º 40
0
 def test_node_printing(self):
     '''Check that the print operator (a leading single quote) does not
     change which positions a pattern selects.'''
     tree = ParentedTree.fromstring('(S (n x) (N x))')

     def positions(pattern):
         # positions found for the pattern in the single test tree
         return list(tgrep.tgrep_positions(pattern, [tree]))

     self.assertEqual(positions('N'), positions('\'N'))
     self.assertEqual(positions('/[Nn]/'), positions('\'/[Nn]/'))
Ejemplo n.º 41
0
 def test_node_regex_2(self):
     '''
     Check regular expression patterns with and without a ``^`` anchor.
     '''
     tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))')
     # /SBJ/ selects any node whose name includes SBJ, so it also selects
     # NP-SBJ, which /^SBJ/ does not.
     cases = (
         ('/^SBJ/', [[(0,), (1,)]]),
         ('/SBJ/', [[(0,), (1,), (2,)]]),
     )
     for pattern, expected in cases:
         self.assertEqual(list(tgrep.tgrep_positions(pattern, [tree])),
                          expected)
Ejemplo n.º 42
0
def main():  # noqa: D103
    # Command line entry point: reads the path of an mrg file, parses each
    # of its lines as a constituent tree and prints the extracted parse
    # actions as space-separated "type:label" items, one line per tree.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mrg_path", help="A file with constituent trees in ``mrg`` format.")
    mrg_path = parser.parse_args().mrg_path

    with open(mrg_path) as constituent_file:
        for line in constituent_file:
            actseq = extract_parse_actions(
                ParentedTree.fromstring(line.strip()))
            rendered = [f"{act.type}:{act.label}" for act in actseq]
            print(" ".join(rendered))
def main():
    """Print the parse actions for each constituent tree of an mrg file.

    The path comes from the ``mrg_path`` command line argument. Every line
    of the file is parsed as one tree; its actions are printed on one line
    as space-separated ``type:label`` items.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('mrg_path',
                        help='a file with constituent trees in mrg format.')
    mrg_path = parser.parse_args().mrg_path

    with open(mrg_path) as constituent_file:
        for raw_line in constituent_file:
            tree = ParentedTree.fromstring(raw_line.strip())
            rendered = []
            for action in extract_parse_actions(tree):
                rendered.append('{}:{}'.format(action.type, action.label))
            print(' '.join(rendered))
    def __init__(self, tree_raw, stoi, device):
        """Build the instance from a bracketed tree string.

        Every leaf of the parsed tree is replaced by ``stoi[leaf]``; leaves
        that are not in ``stoi`` are replaced by ``stoi[_UNK]``. The
        converted tree is passed to ``self.parse`` and the result is stored
        as ``root``; ``labels`` is derived from it with ``_get_labels_``.
        """
        nltk_tree = ParentedTree.fromstring(tree_raw)

        for position in nltk_tree.treepositions('leaves'):
            token = nltk_tree[position]
            nltk_tree[position] = stoi[token] if token in stoi else stoi[_UNK]

        self.device = device
        self.root = self.parse(nltk_tree)
        self.labels = self._get_labels_(self.root)
Ejemplo n.º 45
0
def demo_stanford_parser(sentence):
    """Parse ``sentence`` with StanfordNLP and print the result.

    Prints the raw result, then the parse tree of the first sentence as a
    drawing, as a tree object and as a list of (word, tag) pairs.
    """
    result = StanfordNLP().parse(sentence)
    pprint(result)

    from nltk.tree import ParentedTree
    raw_parse = result['sentences'][0]['parsetree']
    # keep only the text from the first '(ROOT' onwards
    root_start = raw_parse.index('(ROOT')
    tree = ParentedTree.fromstring(raw_parse[root_start:])
    tree.pretty_print()
    for view in (tree, tree.pos()):
        pprint(view)
Ejemplo n.º 46
0
def demo_stanford_parser(sentence):
    """Show the StanfordNLP parse of ``sentence``.

    The raw parser result is printed first. The ``parsetree`` string of the
    first sentence is then cut at its first '(ROOT', parsed into a
    ParentedTree and printed as a drawing, as a tree and as tagged words.
    """
    parser_client = StanfordNLP()
    result = parser_client.parse(sentence)
    pprint(result)

    from nltk.tree import ParentedTree
    first_sentence = result['sentences'][0]
    tree_text = first_sentence['parsetree']
    tree_text = tree_text[tree_text.index('(ROOT'):]
    tree = ParentedTree.fromstring(tree_text)
    tree.pretty_print()
    pprint(tree)
    tagged_words = tree.pos()
    pprint(tagged_words)
Ejemplo n.º 47
0
def read_tsv_sentences(f, *, ignore_case=False, warnings=True):
    """Read sentences from a tab-separated file with six columns.

    The columns are: word index, word, part-of-speech tag, index of
    dependency head, dependency relation, phrase structure tree. There
    must be an empty line after each sentence. Missing values can be
    replaced with an underscore (_).

    A sentence is yielded only if it has complete dependency annotation
    whose graph is accepted by ``graph.is_sensible_graph`` and a phrase
    structure annotation that ``ParentedTree.fromstring`` can parse.
    Every other sentence is skipped.

    Args:
        f: Input handed to ``_get_sentences``.
        ignore_case: Forwarded to ``_get_sentences``.
        warnings: If true, log a warning for each sentence whose
            dependency graph is rejected.

    Yields:
        Tuples ``(tokens, g, tree)``: the list of ``Token`` objects, the
        ``networkx.DiGraph`` of dependencies and the ``ParentedTree``.
    """
    def attributes(t):
        return {"word": t.word, "pos": t.pos}

    def escape_brackets(text):
        # Literal parentheses would be read as tree structure by the parser.
        return text.replace("(", "-LRB-").replace(")", "-RRB-")

    for sent_id, sentence in enumerate(_get_sentences(f, ignore_case)):
        # Reset per-sentence state.  Without this, a sentence lacking
        # dependency or tree annotation would either hit an unbound local
        # or silently reuse the graph/tree of the previous sentence.
        g = None
        sensible = False
        tree = None

        tokens = [Token(t.word, t.pos) for t in sentence]
        if all(t.head != "_" and t.deprel != "_" for t in sentence):
            g = networkx.DiGraph(sentence_id=sent_id)
            g.add_nodes_from([(i, attributes(t))
                              for i, t in enumerate(sentence)])
            # Map the ids used in the file to 0-based node numbers.
            id_to_enumeration = {t.id: i for i, t in enumerate(sentence)}
            for i, token in enumerate(sentence):
                if token.head == "-1":
                    # A head of -1 marks the node as root instead of
                    # producing an incoming edge.
                    g.nodes[i]["root"] = "root"
                else:
                    g.add_edge(id_to_enumeration[token.head],
                               i,
                               relation=token.deprel)
            sensible, explanation = graph.is_sensible_graph(g)
            if warnings and not sensible:
                logging.warning("Ignoring sentence %s: %s",
                                sent_id, explanation)
        if sensible and all(t.pstree != "_" for t in sentence):
            # Each token's tree fragment has a "*" placeholder that is
            # replaced by the "(POS word)" leaf for that token.
            tree_src = "".join(
                token.pstree.replace(
                    "*",
                    "(%s %s)" % (escape_brackets(token.pos),
                                 escape_brackets(token.word)))
                for token in sentence)
            try:
                tree = ParentedTree.fromstring(tree_src)
            except ValueError:
                logging.warning(
                    "Failed to construct parse tree from sentence %s: %s",
                    sent_id, tree_src)
        if sensible and tree is not None:
            yield tokens, g, tree
Ejemplo n.º 48
0
 def tests_rel_indexed_children(self):
     '''
     Check that nodes can be selected by their index within the parent.
     '''
     def positions(pattern, target):
         return list(tgrep.tgrep_positions(pattern, [target]))

     # "X >N S": X is the N-th child of S.
     tree = ParentedTree.fromstring('(S (A x) (B x) (C x))')
     child_of_cases = (
         ('* >, S', [[(0, )]]),
         ('* >1 S', [[(0, )]]),
         ('* >2 S', [[(1, )]]),
         ('* >3 S', [[(2, )]]),
         ('* >\' S', [[(2, )]]),
         ('* >-1 S', [[(2, )]]),
         ('* >-2 S', [[(1, )]]),
         ('* >-3 S', [[(0, )]]),
     )
     for pattern, expected in child_of_cases:
         self.assertEqual(positions(pattern, tree), expected)

     # "X <N A": A is the N-th child of X.
     tree = ParentedTree.fromstring(
         '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) '
         '(F (C x) (A x) (B x)))')
     parent_of_cases = (
         ('* <, A', [[(0, )]]),
         ('* <1 A', [[(0, )]]),
         ('* <2 A', [[(2, )]]),
         ('* <3 A', [[(1, )]]),
         ('* <\' A', [[(1, )]]),
         ('* <-1 A', [[(1, )]]),
         ('* <-2 A', [[(2, )]]),
         ('* <-3 A', [[(0, )]]),
     )
     for pattern, expected in parent_of_cases:
         self.assertEqual(positions(pattern, tree), expected)
Ejemplo n.º 49
0
 def tests_rel_indexed_children(self):
     """
     Verify index-based child/parent relations in tgrep patterns.
     """
     # Each entry pairs a relation operator with the index of the single
     # top-level child that is expected to match.
     flat = ParentedTree.fromstring("(S (A x) (B x) (C x))")
     for relation, child in ((">,", 0), (">1", 0), (">2", 1), (">3", 2),
                             (">'", 2), (">-1", 2), (">-2", 1), (">-3", 0)):
         matches = tgrep.tgrep_positions("* " + relation + " S", [flat])
         self.assertEqual(list(matches), [[(child, )]])

     nested = ParentedTree.fromstring(
         "(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) "
         "(F (C x) (A x) (B x)))")
     for relation, child in (("<,", 0), ("<1", 0), ("<2", 2), ("<3", 1),
                             ("<'", 1), ("<-1", 1), ("<-2", 2), ("<-3", 0)):
         matches = tgrep.tgrep_positions("* " + relation + " A", [nested])
         self.assertEqual(list(matches), [[(child, )]])
Ejemplo n.º 50
0
def Rule_SBAR(valid_sbar,numlist,taglist, tr):
	"""Partition the ids taken from numlist into segments using SBAR spans.

	valid_sbar -- iterable of sequences; element [1] of every item in each
	              sequence is collected into one candidate list.
	numlist    -- sequence of items; element [1] of each forms the pool of
	              ids to segment, and the sequence is passed to Find_Words.
	taglist    -- not used by this function.
	tr         -- tree; it is re-parsed from str(tr) as a ParentedTree.

	Returns (segt, segments): segments is a list of sorted id lists and
	segt holds the Find_Words result for each of them.

	Side effect: prints debugging output.  The print statements and the
	mixed tab/space indentation mean this is Python 2 code.
	"""
	# Step 1: Pull out each segment
	#valid_sbar.sort(key=len)
	valids = [] 
	# Keep only field [1] of every item in each candidate.
	for each in valid_sbar:
		ele = [item[1] for item in each]
		valids.append(ele)
	lst2 = [item[1] for item in numlist]
	ind = 0
	segments = []
     
	# Handle the shortest candidates first.
	valids = sorted(valids,key=len)
        print "VALIDS", valids
        # Round-trip through str() to obtain a ParentedTree, whose
        # left_sibling() is used below.
        tempTree = ParentedTree.fromstring(str(tr))
        sublist = list(tempTree.subtrees())
        verbcheck = False
        verblist = []
        # For every SBAR subtree, scan its left siblings (nearest first)
        # for a label containing 'VB'; remember the subtree's position in
        # sublist when one is found.
        for i in range(len(sublist)):
                if sublist[i].label() == 'SBAR':
                        current = sublist[i]
                        while current.left_sibling() is not None:
                                if 'VB' in current.left_sibling().label():
                                        verbcheck = True
                                        verblist.append(i)
                                        break
                                else:
                                        current = current.left_sibling()
        print verbcheck                                       
	while ind < len(valids):
                print "current lst2, ", lst2 
		print "current valids, ", valids[ind]
		# Difference is defined elsewhere; presumably it returns what is
		# left of lst2 plus the part covered by valids[ind] -- confirm.
		lst2,cover = Difference(valids[ind],lst2)

		# NOTE(review): verblist holds positions within sublist while
		# valids[ind] holds item[1] values; that both share one index
		# space is not visible here -- confirm.
		for index in verblist:
                        if index in valids[ind] and verbcheck == True:
                                # Fold everything still unassigned into this segment.
                                cover.extend(lst2)
                                lst2 = []
                
                if cover != []:
                        segments.append(sorted(cover))
                
                ind += 1
        # Whatever was never covered becomes a final segment of its own.
        if lst2 != []:
                segments.append(sorted(lst2))
	# Step2: Pull out words from segment 
	# (ind is reset here but not read again below.)
	ind = 0 
	segt = []
	for seg in segments:
		segt.append(Find_Words(seg, numlist))
	# return segment, and segment id
	print segt,segments
	return segt, segments  
def _append_reason_words(words, collected):
    """Append the non-filler entries of ``words`` to ``collected``.

    A single " " separator is appended afterwards, but only when at least
    one word was kept.
    """
    fillers = ("要", "想", "准备", "打算")
    kept = [word for word in words if word not in fillers]
    if kept:
        collected.extend(kept)
        collected.append(" ")


def find_reason(trees, nlp):
    """Collect the words that make up the stated reason from parse trees.

    Trees are skipped when their joined leaves contain '@', when
    ``contain_approver(tree)`` is true, or when ``contain_type(sentence)``
    is true.  For the remaining trees:

    * if the sentence starts with a ``请…假`` span, that span is removed,
      the remainder is re-parsed with ``nlp.parse`` and passed through
      ``traverse``, and the leaves of the resulting tree are used;
    * otherwise the result of ``traverse_remains(tree)`` is used.

    In both cases the filler words 要 / 想 / 准备 / 打算 are dropped, and a
    " " separator follows each tree that contributed at least one word.

    Args:
        trees: Iterable of trees exposing ``leaves()``.
        nlp: Parser object; ``nlp.parse(sentence)`` must return a bracketed
            tree string accepted by ``ParentedTree.fromstring``.

    Returns:
        A flat list of kept words and " " separators.
    """
    final_result = []
    for tree in trees:
        sentence = "".join(tree.leaves())
        if '@' in sentence:
            continue
        if contain_approver(tree):
            continue
        if contain_type(sentence):
            continue

        match = re.match(r'请(.*)假', sentence)
        if match is None:
            _append_reason_words(traverse_remains(tree), final_result)
            continue

        # Cut the matched span out of the sentence and re-parse the rest.
        start, end = match.span()
        sentence = sentence[:start] + sentence[end:]
        if sentence == "":
            continue
        tree = ParentedTree.fromstring(nlp.parse(sentence))
        # Check whether there are other verbs.  traverse is defined
        # elsewhere; presumably it edits the tree in place -- confirm.
        traverse(tree, tree)
        # Read the leaves once instead of once per index.
        _append_reason_words(tree.leaves(), final_result)
    return final_result
Ejemplo n.º 52
0
 def test_trailing_semicolon(self):
     '''
     Trailing semicolons on a tgrep2 search string must neither break
     parsing nor change the matches.
     '''
     tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) '
                                    '(VP bit) (NP (DT a) (NN cat)))')
     for pattern in ('NN', 'NN;', 'NN;;'):
         found = list(tgrep.tgrep_positions(pattern, [tree]))
         self.assertEqual(found, [[(0, 2), (2, 1)]])
Ejemplo n.º 53
0
 def test_multiple_conjs(self):
     '''
     Chains of three or more node relations must all be applied.
     '''
     sent = ParentedTree.fromstring(
         '((A (B b) (C c)) (A (B b) (C c) (D d)))')
     # Only the second A has B, C and D children.
     three_relations = list(tgrep.tgrep_positions('(A < B < C < D)', [sent]))
     self.assertEqual(three_relations, [[(1, )]])
     # Both A nodes have B and C children.
     two_relations = list(tgrep.tgrep_positions('(A < B < C)', [sent]))
     self.assertEqual(two_relations, [[(0, ), (1, )]])
Ejemplo n.º 54
0
 def test_node_encoding(self):
     '''
     A search string must give the same results whether it is passed as
     bytes or as str.
     '''
     tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) '
                                    '(VP bit) (NP (DT a) (NN cat)))')
     checks = (
         ('tgrep_positions', 'NN'),
         ('tgrep_nodes', 'NN'),
         ('tgrep_positions', 'NN|JJ'),
     )
     for function_name, pattern in checks:
         from_bytes = list(getattr(tgrep, function_name)(b(pattern), [tree]))
         from_text = list(getattr(tgrep, function_name)(pattern, [tree]))
         self.assertEqual(from_bytes, from_text)
Ejemplo n.º 55
0
 def test_trailing_semicolon(self):
     '''
     A tgrep2 search string ending in one or more semicolons must parse
     and match exactly like the bare pattern.
     '''
     tree = ParentedTree.fromstring(
         '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))'
     )
     bare = list(tgrep.tgrep_positions('NN', [tree]))
     self.assertEqual(bare, [[(0, 2), (2, 1)]])
     one_semicolon = list(tgrep.tgrep_positions('NN;', [tree]))
     self.assertEqual(one_semicolon, [[(0, 2), (2, 1)]])
     two_semicolons = list(tgrep.tgrep_positions('NN;;', [tree]))
     self.assertEqual(two_semicolons, [[(0, 2), (2, 1)]])
 def get_pos(self, doc, posTags):
     """Return the words of ``doc`` whose POS tag is in ``posTags``.

     ``doc`` is lower-cased and parsed with ``self.stanford_nlp``.  For
     each sentence whose parse tree text contains '(ROOT', the tree is
     built from that point on, pretty-printed, and its matching words are
     collected.  The words are returned joined by single spaces.
     """
     selected = []
     analysis = self.stanford_nlp.parse(doc.lower())
     for sentence_parse in analysis['sentences']:
         raw_tree = sentence_parse['parsetree']
         root_at = raw_tree.find('(ROOT')
         if root_at == -1:
             # No tree in this sentence's output; nothing to collect.
             continue
         tree = ParentedTree.fromstring(raw_tree[root_at:])
         tree.pretty_print()
         selected.extend(word for word, pos in tree.pos() if pos in posTags)
     return ' '.join(selected)
Ejemplo n.º 57
0
    def get_answer(self, question):
        """Parse ``question`` and return the answers found by ``Parser``.

        The question is parsed with ``self._nlp``; the parse tree of the
        first sentence is turned into a ``ParentedTree`` and run through a
        fresh ``Parser``.

        Returns:
            ``parser.answers`` after the run, or ``None`` when the parser
            result has no first sentence or no parse tree (``IndexError``
            or ``KeyError``).
        """
        result = self._nlp.parse(question)

        try:
            parsetree = result['sentences'][0]['parsetree']
            tree = ParentedTree.fromstring(parsetree)
        # A tuple is required here: ``except IndexError or KeyError``
        # evaluates to ``except IndexError`` and lets KeyError escape.
        except (IndexError, KeyError):
            return None

        parser = Parser()
        parser.run(tree)
        return parser.answers
Ejemplo n.º 58
0
 def test_multiple_conjs(self):
     '''
     Conjunctions of several node relations (three or more) must all be
     honoured by the matcher.
     '''
     sent = ParentedTree.fromstring(
         '((A (B b) (C c)) (A (B b) (C c) (D d)))')
     expectations = (
         ('(A < B < C < D)', [[(1,)]]),
         ('(A < B < C)', [[(0,), (1,)]]),
     )
     for search, expected in expectations:
         self.assertEqual(list(tgrep.tgrep_positions(search, [sent])),
                          expected)
Ejemplo n.º 59
0
 def test_node_simple(self):
     '''
     Basic tgrep lookups: find nodes by label and by label alternation.
     '''
     tree = ParentedTree.fromstring(
         '(S (NP (DT the) (JJ big) (NN dog)) '
         '(VP bit) (NP (DT a) (NN cat)))')
     noun_positions = [(0, 2), (2, 1)]

     found_positions = list(tgrep.tgrep_positions('NN', [tree]))
     self.assertEqual(found_positions, [noun_positions])

     found_nodes = list(tgrep.tgrep_nodes('NN', [tree]))
     self.assertEqual(found_nodes, [[tree[pos] for pos in noun_positions]])

     found_either = list(tgrep.tgrep_positions('NN|JJ', [tree]))
     self.assertEqual(found_either, [[(0, 1)] + noun_positions])
Ejemplo n.º 60
0
 def test_node_encoding(self):
     '''
     Bytes and str search strings must be treated identically by tgrep.
     '''
     tree = ParentedTree.fromstring(
         '(S (NP (DT the) (JJ big) (NN dog)) '
         '(VP bit) (NP (DT a) (NN cat)))')

     positions_from_bytes = list(tgrep.tgrep_positions(b('NN'), [tree]))
     positions_from_text = list(tgrep.tgrep_positions('NN', [tree]))
     self.assertEqual(positions_from_bytes, positions_from_text)

     nodes_from_bytes = list(tgrep.tgrep_nodes(b('NN'), [tree]))
     nodes_from_text = list(tgrep.tgrep_nodes('NN', [tree]))
     self.assertEqual(nodes_from_bytes, nodes_from_text)

     either_from_bytes = list(tgrep.tgrep_positions(b('NN|JJ'), [tree]))
     either_from_text = list(tgrep.tgrep_positions('NN|JJ', [tree]))
     self.assertEqual(either_from_bytes, either_from_text)