Example #1
    def parse_tree(self, text, binary=False, preprocessed=False):
        nlp_output = self.nlp.annotate(text, properties={
            'annotators': 'tokenize,ssplit,pos,parse',
            'outputFormat': 'json',
            'parse.binaryTrees': 'true'
        })
        if type(nlp_output) == str:
            nlp_output = json.loads(nlp_output, strict=False)

        if len(nlp_output['sentences']) > 1:
            #merge trees from sentences
            tree_string = "(Top "
            for s in nlp_output['sentences']:
                p_tree = Tree.fromstring(s['parse'])
                tree_string += str(p_tree[0])
            tree_string += ")"
            merged_tree = Tree.fromstring(tree_string)
        else:
            #no merging required
            merged_tree = Tree.fromstring(nlp_output['sentences'][0]['parse'])
            #remove root
            merged_tree = merged_tree[0]

        if binary:
            nltk.treetransforms.chomsky_normal_form(merged_tree)

        if preprocessed:
            merged_tree = preprocess_parse_tree(merged_tree)

        return merged_tree
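The method above assumes a `self.nlp` CoreNLP client. A minimal wiring sketch, assuming the pycorenlp package and a CoreNLP server already running on localhost:9000 (both are assumptions; the owning class is not shown in the snippet):

import json
import nltk
from nltk.tree import Tree
from pycorenlp import StanfordCoreNLP

class TreeParser:
    # hypothetical owner of parse_tree; the method above would live here,
    # and preprocess_parse_tree (used when preprocessed=True) is assumed
    # to be defined elsewhere
    def __init__(self, url='http://localhost:9000'):
        self.nlp = StanfordCoreNLP(url)

# parser = TreeParser()
# tree = parser.parse_tree("The cat sat. The dog barked.")  # merged under (Top ...)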
def removeNounMods(tree):
    tree_str = tsurgeon.remove_internal_mods(tree)
    if tree_str != '':
        tree = Tree.fromstring(tree_str)
    tree_str = tsurgeon.remove_participle_mods(tree)
    if tree_str != '':
        tree = Tree.fromstring(tree_str)
    return tree
Example #3
def parser_output_to_parse_deriv_trees(output):
    lines = output.strip().split("\n")
    deriv_tree_lines = lines[::2]
    parse_tree_lines = lines[1::2]

    parse_trees = [Tree.fromstring(line.replace('\x06', 'epsilon_')) for line in parse_tree_lines if line != '']
    deriv_trees = [Tree.fromstring(line) for line in deriv_tree_lines if line != '']
    return parse_trees, deriv_trees
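A toy invocation, assuming Tree is imported as in the snippet. The slicing implies derivation trees on even lines and parse trees on odd lines; the tree strings below are hypothetical:

output = """(d1 (d2 a) (d3 b))
(S (NP a) (VP b))
(d1 (d2 c))
(S (NP c))"""
parse_trees, deriv_trees = parser_output_to_parse_deriv_trees(output)
print([t.label() for t in parse_trees])  # ['S', 'S']
print([t.label() for t in deriv_trees])  # ['d1', 'd1']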
Example #4
    def test_flat_parse(self):
        model = Flat([], 'S')  # empty training set

        trees = [model.parse(s) for s in self.tagged_sents]

        trees2 = [
            Tree.fromstring("(S (D El) (N gato) (V come) (N pescado) (P .))"),
            Tree.fromstring("(S (D La) (N gata) (V come) (N salmón) (P .))"),
        ]
        self.assertEqual(trees, trees2)
Example #5
    def test_lbranch_parse(self):
        model = LBranch([], 'S')  # empty training set

        trees = [model.parse(s) for s in self.tagged_sents]

        trees2 = [
            Tree.fromstring("""(S (S|<> (S|<> (S|<> (D El) (N gato)) (V come)) (N pescado)) (P .))"""),
            Tree.fromstring("""(S (S|<> (S|<> (S|<> (D La) (N gata)) (V come)) (N salmón)) (P .))"""),
        ]
        self.assertEqual(trees, trees2)
def extractParticiple(tree):
    part_mod = tsurgeon.hasParticipleMod(tree)
    if part_mod != '':
        subject = tsurgeon.findSubject(tree)
        subject_words = Tree.fromstring(subject).leaves()
        part_tree = Tree.fromstring(part_mod)
        part_words = part_tree.leaves()
        # Ignoring inflection
        result_words = subject_words + ['is'] + part_words[1:]
        sentence = ' '.join(result_words).strip() + '.'
        return sentence
    pass
Example #7
def test_tree4():   
    
    annotator=Annotator()
    sent = "There are people dying make this world a better place for you and for me."
    sent = "Biplab is a good boy." 
    sent = "He created the robot and broke it after making it."
    sent = "Bachelor 's degree in computer science , design or related field."    
    sent = "B.S. in Computer Science , a related degree or its equivalent"    
    sent = "BS , MS , or PhD in Computer Science or a similar field preferred"
    sent = "Computer Science or related technical degree from an accredited four year university "
    sent = "Degree in Computer Science or Engineering with a high GPA ."    
    sent = "A Master's degree in Computer Science or Engineering is mandatory ."
    
    sent = "A Computer Science or related degree "
    sent = "I love science and SciFi book"
    sent = "I love music and SciFi book"
   
    result = annotator.getAnnotations(sent)
    tree_str = result['syntax_tree']
    print()
    print(tree_str)

    tree = Tree.fromstring(tree_str)[0]
    print()
    print("Root label=", tree.label())
    tree.draw()
def rulelogic(sentence):
    leaves_list = []
    text = sentence

    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    parsetree = output['sentences'][0]['parse']
    #print parsetree
    for i in Tree.fromstring(parsetree).subtrees():
        if i.label() == 'PRP':
            #print i.leaves(), i.label()
            leaves_list.append(i.leaves())
        if i.label() == 'VBP' or i.label() == 'VBZ':
            #print i.leaves(), i.label()
            leaves_list.append(i.label())
    #print leaves_list
    if (any("We" in x for x in leaves_list) or any("I" in x for x in leaves_list) or any(
                    "You" in x for x in leaves_list) or any("They" in x for x in leaves_list)) and any("VBZ" in x for x in leaves_list):
        print "Alert: \nPlease check Subject and verb in the sentence.\nYou may have plural subject and singular verb. "
    elif(any("He" in x for x in leaves_list) or any("She" in x for x in leaves_list) or any(
                    "It" in x for x in leaves_list)) and any("VBP" in x for x in leaves_list):
        print "Alert: \nPlease check subject and verb in the sentence.\n" \
              "You may have singular subject and plural verb."
    else:
        print "You have correct sentence."
def removeLeadingMods(tree):
    tree_str = tsurgeon.remove_leading_mods(tree)
    if tree_str != '':
        new = Tree.fromstring(tree_str)
        if new != tree:
            return removeLeadingMods(new)
    return tree
def question(inputstr):
    entities = supersense_tag(inputstr)
#     print("Supersense-tagging done")
    entities.update(named_entities(inputstr))
#     print("NER done")
    main_tree = next(parser.raw_parse(inputstr))
#     print("Parsing done")
    '''
    main_tree_str = save_embedded_clause(main_tree_str)
    print(main_tree_str)
    '''
    main_tree_str = clean_sentence(main_tree)
    
#     Tree.fromstring(main_tree_str).pprint()
    # TODO: mark_unmovable_tags

    main_tree = inverse_verb(main_tree_str)
    sentence = str(' '.join(Tree.fromstring(main_tree_str).leaves()))
    sentence_inversed = str(' '.join(main_tree.leaves()))
    questions = []
    prep = []  # use to store prep when traverse the tree
    gen_question_recur(main_tree, sentence_inversed, sentence, questions, entities, prep)
    questions = [cleanup_question(q) for q in questions]
    questions.append(fix_output(main_tree))
    return questions
Example #11
def tag_var_nodes(vars_dir, trees_dir, tagged_dir):
    """
    Tag variable nodes in tree

    Tag variables nodes in trees with "_VAR:f:n:m:e" suffix where
    f is the name of the parse file,
    n is the tree number,
    m is the variable's node number and
    e is name of the pattern used for extracting this variable.
    Will only output those trees containing at least two variables.
    """
    # At first I used the tregex's '-f' option to print the filename,
    # but when traversing the files in a directory,
    # it prints the wrong filenames (after the first one?),
    # so now the filename is encoded in the node label too.
    tagged_dir = Path(tagged_dir)
    tagged_dir.makedirs_p()

    for vars_fname in Path(vars_dir).glob('*.json'):
        d = defaultdict(list)

        # create a dict mapping each tree number to a list of
        # (nodeNumber, extractName) tuples for its variables
        for record in json.load(vars_fname.open()):
            pair = record['nodeNumber'], record['key']
            d[record['treeNumber']].append(pair)

        lemtree_fname = record['filename']
        parses = (Path(trees_dir) / lemtree_fname).lines()
        tagged_parses = []

        for tree_number, pairs in d.items():
            if len(pairs) > 1:
                # tree numbers in records count from one
                tree = Tree.fromstring(parses[tree_number - 1])
                # get NLTK-style indices for all nodes in a preorder
                # traversal of the tree
                positions = tree.treepositions()
                vars_count = 0

                for node_number, key in pairs:
                    # node numbers in records count from one
                    position = positions[node_number - 1]
                    subtree = tree[position]
                    try:
                        subtree.set_label(
                            '{}_VAR_{}'.format(subtree.label(), key))
                    except AttributeError:
                        log.error('skipping variable "{}" because it is a leaf '
                                  'node ({})'.format(subtree, key))
                    else:
                        vars_count += 1

                if vars_count > 1:
                    tagged_parses.append(tree.pformat(margin=99999))

        if tagged_parses:
            tagged_fname = derive_path(lemtree_fname, new_dir=tagged_dir)
            log.info('writing tagged trees to ' + tagged_fname)
            tagged_fname.write_lines(tagged_parses)
Example #12
    def add_tree(self, datum):
        # parse tree and binarize
        tree = Tree.fromstring(datum["raw_tree"])
        tree.chomsky_normal_form()
        tree.collapse_unary(collapsePOS=True)
        tree = ParentedTree.convert(tree)

        # assign indices to subtrees
        indices = {}
        counter = 0
        for t in tree.subtrees():
            indices[t.treeposition()] = counter
            counter += 1

        # generate parent pointers and labels
        # (labels = one instance of sent in sents by treelstm terminology)
        parents = [0] * (counter - 1)
        labels = []
        counter = 0
        for t in tree.subtrees():
            parent = t.parent()
            if parent is not None:
                parents[counter] = indices[parent.treeposition()]
                counter += 1
            if isinstance(t[0], str): labels.append(t[0])

        self.parents_file.write(" ".join(map(str, parents)) + "\n")
        self.sents_file.write(" ".join(labels) + "\n")
        self.trees.append(datum)
        return len(self.trees) - 1 # ID
Example #13
    def test_productions(self):
        t = Tree.fromstring(
            """
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)

        # Bugfix from official test (, start='S')
        model = UPCFG([t], start='S')

        prods = model.productions()

        prods2 = [
            ProbabilisticProduction(N('S'), [N('NP'), N('VP')], prob=1.0),
            ProbabilisticProduction(N('NP'), [N('Det'), N('Noun')], prob=0.5),
            ProbabilisticProduction(N('Det'), ['Det'], prob=1.0),
            ProbabilisticProduction(N('Noun'), ['Noun'], prob=1.0),
            ProbabilisticProduction(N('VP'), [N('Verb'), N('NP')], prob=1.0),
            ProbabilisticProduction(N('Verb'), ['Verb'], prob=1.0),
            ProbabilisticProduction(N('NP'), [N('Noun'), N('Adj')], prob=0.5),
            ProbabilisticProduction(N('Adj'), ['Adj'], prob=1.0),
        ]

        self.assertEqual(set(prods), set(prods2))
def read_segtree_file(fn):
    """reads a string representing a discourse tree (from the seg.
       annotation) and returns a list of its child tree objects"""
    with codecs.open(fn, 'r', 'utf-8') as f:
        s = f.read()
        text_tree = Tree.fromstring(s, read_leaf=prefix_number_seg_token)
        return [segment for segment in text_tree]
Example #15
def find_subtrees(tree, depth):
    """
    Returns all subtrees at a given depth

    Arguments
    ---------
    tree: either an nltk.tree.Tree or a PTB-formatted string
    depth: the target depth

    Returns
    -------
    list of nltk.tree.Tree objects representing the selected subtrees

    >>> ptb_str = "(ROOT (S (NP (DT The) (VBG following)) (VP (VBP are) (NP (NP (JJ major) (NN news) (NNS items)) (PP (IN in) (NP (NP (VBG leading) (JJ Turkish) (NNS newspapers)) (PP (IN on) (NP (NNP Monday))))))) (. .)))"
    >>> ptb_tree = Tree.fromstring(ptb_str)   
    >>> subtrees = find_subtrees(ptb_str, 2)  # find_subtrees accepts strings
    >>> [t.label() for t in subtrees]         # and it returns a list of subtrees (objects of type nltk.tree.Tree)
    ['NP', 'VP', '.']
    >>> subtrees = find_subtrees(ptb_tree, 3) # and trees
    >>> [t.label() for t in subtrees]
    ['DT', 'VBG', 'VBP', 'NP']
    >>> subtrees = find_subtrees(ptb_tree, 4) 
    >>> [t.label() for t in subtrees]
    ['NP', 'PP']
    """
    if isinstance(tree, str):
        tree = Tree.fromstring(tree)
    subtrees = []
    _find_subtrees(tree, 0, depth, subtrees)
    return subtrees
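The recursive helper _find_subtrees is not shown in this snippet; a minimal sketch consistent with the doctests above (an assumption, not the original implementation):

def _find_subtrees(tree, current_depth, target_depth, acc):
    if current_depth == target_depth:
        acc.append(tree)
        return
    for child in tree:
        if isinstance(child, Tree):  # leaves are plain strings and have no children
            _find_subtrees(child, current_depth + 1, target_depth, acc)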
Example #16
    def parse(self, text):
        """
        NOTE: since the Stanford tagger and parser libraries are case-sensitive, the casing of the output of this
              method is preserved. Caller must remember to normalize the casing when conducting comparison
        :param text: text to be parsed
        :return: a SentenceParseResult object
        """
        server = jsonrpc.ServerProxy(jsonrpc.JsonRpc20(),
                                     jsonrpc.TransportTcpIp(addr=(CORENLP_SERVER_HOST, CORENLP_SERVER_PORT)))

        parsed_sentences = loads(server.parse(text))['sentences']
        if len(parsed_sentences) > 1:
            raise Exception('Multi-sentence query is not supported')
        parsed_sentence = parsed_sentences[0]

        word_tokens = [ParsedWordToken(word_wire_format) for word_wire_format in parsed_sentence['words']]
        # word_tokens = self._recover_contractions(word_tokens)

        normalized_sentence = ' '.join([word_token.text for word_token in word_tokens])

        parsed_tree = Tree.fromstring(parsed_sentence['parsetree'])

        word_dependency = SentenceWordDependency(parsed_sentence['dependencies'])

        return SentenceParseResult(word_tokens=word_tokens,
                                   normalized_sentence=normalized_sentence,
                                   parsed_tree=parsed_tree,
                                   word_dependency=word_dependency)
Example #17
def extract_entities(pos_server, assimilator, mode, text, link):
    """
    Extract tokens in the buckets of nouns and other entities
    pos_server: part of speech tagger address
    assimilator: assimilator address
    mode: metadata or content
    """
    content = get_assimilator_data(mode=mode, assimilator=assimilator, text=text, link=link)
    if mode == "meta":
        import json
        yield json.dumps(json.loads(content.decode()), indent=4)
    else:
        import json
        from .semantic_parser import read_dep
        from nltk.tree import Tree

        concept_map = {}

        pos_generator = process_pos(pos_server, content=content)
        for line in pos_generator:
            data = json.loads(line.decode())
            tree = Tree.fromstring(data['tree'])

            tokens = read_dep(tree)
            yield tokens
Example #18
def test():
    """Do some tree drawing tests."""
    def print_tree(n, tree, sentence=None, ansi=True, **xargs):
        print()
        print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves())))
        print(tree)
        print()
        drawtree = TreePrettyPrinter(tree, sentence)
        try:
            print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs))
        except (UnicodeDecodeError, UnicodeEncodeError):
            print(drawtree.text(unicodelines=False, ansi=False, **xargs))

    from nltk.corpus import treebank
    for n in [0, 1440, 1591, 2771, 2170]:
        tree = treebank.parsed_sents()[n]
        print_tree(n, tree, nodedist=2, maxwidth=8)
    print()
    print('ASCII version:')
    print(TreePrettyPrinter(tree).text(nodedist=2))

    tree = Tree.fromstring(
        '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) '
        '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) '
        '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int)
    sentence = ('Ze had met haar moeder kunnen gaan winkelen ,'
                ' zwemmen of terrassen .'.split())
    print_tree('Discontinuous tree', tree, sentence, nodedist=2)
Example #19
def yngve_redux(treestring):
	""" For the given parsers-tree-string, return the word count and the yngve score. """
	tree = Tree.fromstring(treestring)
	total = float(calc_yngve_score(tree, 0))
	words = float(get_word_score(tree))

	return [total, words]
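calc_yngve_score and get_word_score are not shown in this snippet. One common formulation of the Yngve metric is sketched below as an assumption (variants differ, e.g. on whether each leaf also adds 1):

def calc_yngve_score(tree, parent):
	# sum of Yngve depths over all leaf words; each child inherits the
	# running total plus its reversed sibling index
	if isinstance(tree, str):  # a leaf word
		return parent
	return sum(calc_yngve_score(child, parent + i)
	           for i, child in enumerate(reversed(tree)))

def get_word_score(tree):
	# number of leaf words under the tree
	return 1 if isinstance(tree, str) else sum(get_word_score(c) for c in tree)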
    def initialize_edu_data(edus):
        '''
        Create a representation of the list of EDUs that make up the input.
        '''

        wnum = 0  # counter for distance features
        res = []
        for edu_index, edu in enumerate(edus):
            # lowercase all words
            edu_words = [x[0].lower() for x in edu]
            edu_pos_tags = [x[1] for x in edu]

            # make a dictionary for each EDU
            new_tree = Tree.fromstring('(text)')
            new_tree.append('{}'.format(edu_index))
            tmp_item = {"head_idx": wnum,
                        "start_idx": wnum,
                        "end_idx": wnum,
                        "nt": "text",
                        "head": edu_words,
                        "hpos": edu_pos_tags,
                        "tree": new_tree}
            wnum += 1
            res.append(tmp_item)
        return res
def removeVerbMods(tree):
    tree_str = tsurgeon.remove_verb_modifiers(tree)
    if tree_str != '':
        new = Tree.fromstring(tree_str)
        if new != tree:
            return removeVerbMods(new)
    return tree
Example #22
def draw_trees(treestrings):
	""" Draws pictures of each parsers-tree-string using Matplotlib. """
	for tree_string in treestrings:
		print(tree_string)
		sentence = Tree.fromstring(tree_string)
		sentence.draw()

	return ''
def main(tree_file1, tree_file2):
    same = 0
    different = 0
    for line1, line2 in zip(tree_file1, tree_file2):  # itertools.izip in Python 2
        try:
            tree1 = Tree.fromstring(line1)
            tree2 = Tree.fromstring(line2)
            d = tree_diff(tree1, tree2)
            if d:
                different += 1
                print(tree1)
                print(tree2)
            else:
                same += 1
        except Exception as e:
            print(e)
            print(line1)
            print(line2)
Example #24
    def test_parse_no_parse_returns_flat(self):
        t = Tree.fromstring(
            """
                (S
                    (NP (Det el) (Noun gato))
                    (VP (Verb come) (NP (Noun pescado) (Adj crudo)))
                )
            """)
        model = UPCFG([t], start='S')

        sent = 'gato el come pescado crudo'.split()
        tags = 'Noun Det Verb Noun Adj'.split()
        tagged_sent = list(zip(sent, tags))
        tree = model.parse(tagged_sent)

        tree2 = Tree.fromstring("(S (Noun gato) (Det el) (Verb come) (Noun pescado) (Adj crudo))")
        self.assertEqual(tree, tree2)
Example #25
File: TTree.py Project: tuur/STPS
def tuples_to_tree(tuples):
    tups = list(tuples)
    if not tups:
        return TTree('(_ empty)')
    t_init = TTree("("+str(tups[0][0])+' '+str(tups[0][1])+")")
    for tup in tups:
        add_proj_tree(t_init,tuple_to_tree(tup))
    return TTree(str(nltktree.fromstring(str(t_init))))
def movePP(tree):
    # Temporary condition
    if type(tree) == str:
        pass
    moved_pp_treestr = tsurgeon.moveLeadingPP(tree)
    if moved_pp_treestr != '':
        return Tree.fromstring(moved_pp_treestr)
    pass
Example #27
    def test_get_gold_spans_correctly_extracts_spans(self):
        ptb_reader = PennTreeBankConstituencySpanDatasetReader()
        tree = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")

        span_dict = {}
        ptb_reader._get_gold_spans(tree, 0, span_dict)
        spans = list(span_dict.items()) # pylint: disable=protected-access
        assert spans == [((0, 1), 'NP'), ((3, 4), 'NP'), ((2, 4), 'VP'), ((0, 4), 'S')]
Example #28
def inverse_verb(main_tree_str):
    if tsurgeon.test_aux(main_tree_str):
        main_tree_str = tsurgeon.mark_aux(main_tree_str)
        main_tree_str = tsurgeon.move_aux(main_tree_str)
        main_tree = Tree.fromstring(main_tree_str)
    else:
        main_tree = move_no_aux(main_tree_str)
    return main_tree
Example #29
def get_production_rule_by_parse_tree(parsetree):
	syntax_tree = Tree.fromstring(parsetree)

	convert_str_format = lambda string, strip_char='\'': \
		''.join( [ ch for ch in '->'.join( [ st.strip() for st in string.split('->')] ) if ch not in strip_char ] )

	production_rule = [ convert_str_format(str(pr)) for pr in syntax_tree.productions() ]

	return production_rule
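Hypothetical usage, assuming Tree is imported as in the snippet; the lambda strips quotes and the spaces around each '->':

rules = get_production_rule_by_parse_tree(
    "(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))")
print(rules)
# ['S->NP VP', 'NP->DT NN', 'DT->the', 'NN->cat', 'VP->VBZ', 'VBZ->sleeps']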
Example #30
def read_story_parses(parfile):
    with open(parfile, 'r') as fh:
        lines = fh.readlines()
    # skip lines that are not constituency parses
    treeList = [Tree.fromstring(line) for line in lines 
                    if 'QuestionId' not in line and
                    len(line) > 2]
    return treeList
 def _read(self, file_path):
     with open(cached_path(file_path), "r") as data_file:
         for line in data_file.readlines():
             line = line.strip("\n")
             if not line:
                 continue
             parsed_line = Tree.fromstring(line)
             sent = ' '.join(parsed_line.leaves())
             tokens = self._tokenizer.tokenize(sent)
             label = parsed_line.label()
             instance = self.text_to_instance(tokens, label)
             if instance is not None:
                 yield instance
Example #32
def read_sst(sst_dir, split, shrink=1, char_based=False):
    dataset = []
    f = open(os.path.join(sst_dir, '{}.txt'.format(split)))
    for i, line in enumerate(f.readlines()):
        if i % shrink != 0:
            continue
        tree = Tree.fromstring(line)
        tokens = ' '.join(tree.leaves())
        tokens = split_text(normalize_text(tokens), char_based)
        label = int(tree.label())
        dataset.append((tokens, label))
    f.close()
    return dataset
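Each line of '{split}.txt' is assumed to be an SST-style tree whose root label is the 0-4 sentiment class. A minimal check of that format (the example line is hypothetical):

from nltk.tree import Tree

line = "(3 (2 It) (4 (2 's) (4 great)))"
tree = Tree.fromstring(line)
print(' '.join(tree.leaves()))  # It 's great
print(int(tree.label()))        # 3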
Example #33
 def walk(t):
     if type(t) == type('') or type(t[0]) == type(''):
         return
     for i in range(len(t)):
         if t[i].label() == lab:
             for j in range(len(t[i])):
                 if t[i][j].label() == sublab:
                     # Yes, it really does have to work this way!
                     t[i] = Tree.fromstring('(' + lab + ' ' + str(t[i][j]) +
                                            ')')
                     break
         if type(t[i]) != type('str'):
             walk(t[i])
Example #34
 def load_ctb(ctb_dir, encoding="UTF-8"):
     ctb = {}
     s_pat = re.compile(r"<S ID=(?P<sid>\S+?)>(?P<sparse>.*?)</S>",
                        re.M | re.DOTALL)
     for file in os.listdir(ctb_dir):
         with open(os.path.join(ctb_dir, file), "r",
                   encoding=encoding) as fd:
             doc = fd.read()
         for match in s_pat.finditer(doc):
             sid = match.group("sid")
             sparse = ParseTree.fromstring(match.group("sparse"))
             ctb[sid] = sparse
     return ctb
Example #35
 def get_relation_chomsky_syntax_tree(self, i):
     """
     Args:
             i: relation number
     Returns:
          if arg1 and arg2 are in different sentences:
              {'Arg1': [arg1_parse_trees], 'Arg2': [arg2_parse_trees]}
          if arg1 and arg2 are in the same sentence:
              (syntax_tree)
          if arg1 or arg2 spans more than one sentence:
              None
     """
     arg1_sent_id = self.get_arg_sent_id(i, 'Arg1')
     arg2_sent_id = self.get_arg_sent_id(i, 'Arg2')
     if len(arg1_sent_id) == len(arg2_sent_id) == 1:
         # SS case
         if arg1_sent_id[0] == arg2_sent_id[0]:
             nltk_tree = Tree.fromstring(
                 self.get_parse_tree(self.parse_data[i]['DocID'],
                                     arg1_sent_id[0]))
             nltk_tree.chomsky_normal_form()
             chomsky_tree = str(nltk_tree)
             return Syntax_tree(chomsky_tree)
         # PS case
         elif arg1_sent_id[0] < arg2_sent_id[0]:
             nltk_arg1_tree = Tree.fromstring(
                 self.get_parse_tree(self.parse_data[i]['DocID'],
                                     arg1_sent_id[0]))
             nltk_arg2_tree = Tree.fromstring(
                 self.get_parse_tree(self.parse_data[i]['DocID'],
                                     arg2_sent_id[0]))
             nltk_arg1_tree.chomsky_normal_form()
             nltk_arg2_tree.chomsky_normal_form()
             chomsky_arg1_tree = str(nltk_arg1_tree)
             chomsky_arg2_tree = str(nltk_arg2_tree)
             return {'Arg1': Syntax_tree(chomsky_arg1_tree), \
                     'Arg2': Syntax_tree(chomsky_arg2_tree)  }
     else:
         return None
def clausal_info_extract_from_string(parse_tree_str):
    try:
        parse_tree = Tree.fromstring(parse_tree_str)
        return clausal_info_extract(parse_tree)
    except Exception:
        # parse_tree may be unbound if fromstring itself failed,
        # so only the raw tree string is printed
        print("\nERROR IN NLTK PARSE-TREE\n", parse_tree_str)
        mb.showwarning(
            title='ERROR IN PARSE-TREE',
            message=
            "There was an error in NLTK parsing of the sentence tree displayed in command line.\n\nSearch in your document for the words displayed in command line, edit your document for characters that may lead to this error, and try again."
        )
        return
Example #37
 def test_getVerbtrees(self):
     t = Tree.fromstring(
         "(S(NP (DT The@$/$@1) (NN teacher@$/$@2))(VP (VBZ likes@$/$@3) (NP (NNS apples@$/$@4)))(. .@$/$@5))"
     )
     verb = []
     obj = []
     ttriples = []
     triple_extraction.getVerbtrees(t, verb, obj, ttriples)
     if "likes@$/$@3" == obj[0].split(";")[0]:
         print("getVerbtrees - OK")
     else:
         print("getVerbtrees - ERROR")
     self.assertEqual(obj[0].split(";")[0], "likes@$/$@3")
def sst_reader(src_filename, class_func=None, include_subtrees=True):
    if class_func is None:
        class_func = lambda x: x
    with open(src_filename) as f:
        for line in f:
            tree = Tree.fromstring(line)
            if include_subtrees:
                for subtree in tree.subtrees():
                    label = class_func(subtree.label())
                    yield (_sst_detokenize(subtree), label)
            else:
                label = class_func(tree.label())
                yield (_sst_detokenize(tree), label)
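_sst_detokenize is not shown in this snippet; a minimal stand-in that joins the leaves back into a string (an assumption about its behavior):

def _sst_detokenize(tree):
    return ' '.join(tree.leaves())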
Example #39
def deleaf(parse_string):
    tree = Tree.fromstring(parse_string.strip(), read_leaf=lambda s: "")
    for sub in tree.subtrees():
        for n, child in enumerate(sub):
            if isinstance(child, str):
                continue
            if len(list(child.subtrees(
                    filter=lambda x: x.label() == '-NONE-'))) == len(
                        child.leaves()):
                del sub[n]
    oneline = tree.pformat(margin=10000, parens=[" ( ", " ) "])
    oneline = re.sub(' +', ' ', oneline)
    return oneline
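Hypothetical usage, assuming `re` and Tree are imported as the snippet requires. Note that `del sub[n]` inside the enumerate loop can skip the sibling that slides into the deleted slot; the upstream code works this way, so the example only shows the intended effect:

parse = "(ROOT (S (NP (-NONE- *T*)) (VP (VBZ works))))"
print(deleaf(parse))
# roughly: ( ROOT ( S ( VP ( VBZ ) ) ) )  -- leaves blanked, -NONE--only NP deleted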
def stanfordparserdemo(sentence):
    text = sentence

    output = nlp.annotate(text,
                          properties={
                              'annotators':
                              'tokenize,ssplit,pos,depparse,parse',
                              'outputFormat': 'json'
                          })

    print "\n------------Stanford Parser Parseing Result------------"
    parsetree = output['sentences'][0]['parse']
    print "\n------parsing------\n"
    print parsetree
    print "\n------ Words inside NP ------\n"
    for i in Tree.fromstring(parsetree).subtrees():
        if i.label() == 'NP':
            print i.leaves(), i.label()
    print "\n------ Words inside NP with POS tags ------\n"
    for i in Tree.fromstring(parsetree).subtrees():
        if i.label() == 'NP':
            print i
def generate_partial(segment):
    """
    Phrase-structure tree splitting.
    :param segment:
    :return:
    """
    pos_root = BASE_DIR + "/vendor/dataset/stanford/stanford-corenlp-full-2017-06-09/"
    par_model = pos_root + "models/lexparser/chinesePCFG.ser.gz"
    opttype = 'penn'
    parser = StanfordParser(par_model, pos_root, opttype)
    par_tag = parser.tagfile(segment)
    tree = Tree.fromstring(par_tag)
    return tree
Example #42
 def fromtree(cls, data, fields, subtrees=False):
     warnings.warn('Example class will be retired in the 0.8.0 release and moved to torchtext.legacy. Please see 0.7.0 release notes for further information.', UserWarning)
     try:
         from nltk.tree import Tree
     except ImportError:
         print("Please install NLTK. "
               "See the docs at http://nltk.org for more information.")
         raise
     tree = Tree.fromstring(data)
     if subtrees:
         return [cls.fromlist(
             [' '.join(t.leaves()), t.label()], fields) for t in tree.subtrees()]
     return cls.fromlist([' '.join(tree.leaves()), tree.label()], fields)
Example #43
    def test_calc_frazier_score(self):
        sent = "Colorless green ideas sleep furiously"
        parse = [
            '( (S (NP (NNP Colorless) (JJ green) (NNS ideas)) (VP (VBP sleep) (ADVP (RB furiously)))) )'
        ]

        expected = 4.5
        actual = calc_frazier_score(Tree.fromstring(parse[0]), 0, '')
        self.assertEqual(expected, actual)

        expected = -1
        actual = calc_frazier_score("Hi!", 0, '')
        self.assertEqual(expected, actual)
Example #44
    def _pre_processing(self):
        all_data = self.read_json(
            path.join(self.data_dir, 'train.stanford.json'))
        gram2count = defaultdict(int)
        pos_tag2count = defaultdict(int)
        chunk_tag2count = defaultdict(int)
        dep_tag2count = defaultdict(int)

        for data in all_data:
            print(type(data))
            sentences_list = data['sentences']
            for sentence_l in sentences_list:

                tokens = sentence_l['tokens']
                for token in tokens:
                    gram2count[token['originalText']] += 1
                    pos_tag2count[token['pos']] += 1
                    pos_tag2count[token['originalText'] + '_' +
                                  token['pos']] += 1
                deparse = sentence_l['basicDependencies']
                for word in deparse:
                    dep_tag2count[word['dep']] += 1
                    dep_tag2count[word['dependentGloss'] + '_' +
                                  word['dep']] += 1

                coparse = Tree.fromstring(sentence_l['parse'])
                for s in coparse.subtrees(lambda t: t.label() in chunk_pos):
                    leaves = s.leaves()
                    node = s.label()
                    chunk_tag2count[node] += 1
                    for leaf in leaves:
                        chunk_tag2count[leaf + '_' + node] += 1
                chunk_tag2count['ROOT'] = 100

        print('feature stat')
        print('# of gram: %d' % len(gram2count))
        print('# of pos: %d' % len(pos_tag2count))
        print('# of chunk_tag: %d' % len(chunk_tag2count))
        print('# of dep: %d' % len(dep_tag2count))
        feature2id = {
            'gram2count': gram2count,
            'pos_tag2count': pos_tag2count,
            'chunk_tag2count': chunk_tag2count,
            'dep_tag2count': dep_tag2count
        }

        with open(path.join(self.data_dir, 'feature2count.json'),
                  'w',
                  encoding='utf8') as f:
            json.dump(feature2id, f, ensure_ascii=False)
            f.write('\n')
Example #45
    def why_answer(self, question, relevant):
        #Get all nouns in the question
        Q_nouns = [tup[0] for tup in self.nlp.pos(question) if tup[1][0] == 'N']

        #Find all phrases and sub phrases from the relevant sentence
        r_out = Tree.fromstring(self.nlp.parse(relevant))
        phrase_ans = []
        phrases = self.find_S(r_out)

        #For each phrase, find the NP and VP and parse out the nouns in the NP
        for tree in phrases:
            #print(tree.label())
            #print(tree.leaves())
            found = False
            for subtree in tree:
                #print(subtree.label())
                #print(subtree.leaves())
                if subtree.label() == 'NP':
                    nounP = " ".join(subtree.leaves())
                    R_nouns = [tup[0] for tup in self.nlp.pos(nounP) if tup[1][0] == 'N']
                    for noun in R_nouns:
                        #If nouns in the subphrase are not in the question, we are in the wrong phrase, append wrong phrase and skip the current phrase
                        if noun not in Q_nouns:
                            phrase_ans.append('WrongPhrase')
                            break
                verbP = ''
                if subtree.label() == 'VP':
                    verbP = " " .join(subtree.leaves())
                #If we find an instance of a "Why" word, find the position and return the string starting from that position.
                for word in self.why_words:
                    if word in verbP:
                        found = True
                        location = verbP.find(word)
                        verbP = verbP[location:]
                        phrase_ans.append(verbP.capitalize())
                        break

            #If there was no phrase, append WrongPhrase
            if found == False:
                phrase_ans.append('WrongPhrase')

        ans = ""
        #Check all the answers in phrase answers, the correct answer is the one that is not from a Wrong Phrase
        for answer in phrase_ans:
            if answer != 'WrongPhrase':
                ans = answer + '.'

        if ans == "":
            return ""
        else:
            return ans
Example #46
 def test_get_gold_spans_correctly_extracts_spans_with_nested_labels(self):
     ptb_reader = PennTreeBankConstituencySpanDatasetReader()
     # Here we have a parse with several nested labels - particularly the (WHNP (WHNP (WP What)))
     # fragment. These should be concatenated into a single label by get_gold_spans.
     tree = Tree.fromstring("""
         (S
     (`` ``)
     (S-TPC
     (NP-SBJ (PRP We))
     (VP
         (VBP have)
         (S
         (VP
             (TO to)
             (VP
             (VP
                 (VB clear)
                 (PRT (RP up))
                 (NP (DT these) (NNS issues)))
             (CC and)
             (VP
                 (VB find)
                 (PRT (RP out))
                 (SBAR-NOM
                 (WHNP (WHNP (WP what)))
                 (S
                     (VP
                     (VBZ is)
                     (ADJP-PRD (JJ present))
                     (SBAR
                         (WHNP (WDT that))
                         (S
                         (VP
                             (VBZ is)
                             (VP
                             (VBG creating)
                             (NP (JJ artificial) (NN volatility)))))))))))))))
     (, ,)
     ('' '')
     (NP-SBJ (NNP Mr.) (NNP Fisher))
     (VP (VBD said))
     (. .))
     """)
     span_dict = {}
     ptb_reader._strip_functional_tags(tree) # pylint: disable=protected-access
     ptb_reader._get_gold_spans(tree, 0, span_dict) # pylint: disable=protected-access
     assert span_dict == {(1, 1): 'NP', (5, 5): 'PRT', (6, 7): 'NP', (4, 7): 'VP', (10, 10): 'PRT',
                          (11, 11): 'WHNP-WHNP', (13, 13): 'ADJP', (14, 14): 'WHNP', (17, 18): 'NP',
                          (16, 18): 'VP', (15, 18): 'S-VP', (14, 18): 'SBAR', (12, 18): 'S-VP',
                          (11, 18): 'SBAR', (9, 18): 'VP', (4, 18): 'VP', (3, 18): 'S-VP',
                          (2, 18): 'VP', (1, 18): 'S', (21, 22): 'NP', (23, 23): 'VP', (0, 24): 'S'}
def extractNonResMod(tree):
    subject = tsurgeon.findSubject(tree)
    if not subject:
        return
    subj_tree = Tree.fromstring(subject)
    tokens = subj_tree.leaves()
    parts = ' '.join(tokens).split(',')
    main_subject = parts[0]
    if len(parts) > 1 and parts[1] != '':
        phrase_type = getTag(parts[1].strip(), subj_tree)
        # check if it is an appositive
        if phrase_type == 'NP':
            # adding 'is' temporarily - might be able to get inflection correct
            # by examining get_top_questions verb.
            appos = parts[1].split()
            subj = main_subject.split()
            appos_tree = None
            newsubj_tree = None
            for sub in subj_tree.subtrees():
                if sub.leaves() == appos and (appos_tree == None
                                              or len(sub) > len(appos_tree)):
                    appos_tree = str(sub)
                elif sub.leaves() == subj and (newsubj_tree == None or
                                               len(sub) > len(newsubj_tree)):
                    newsubj_tree = str(sub)
            new_treestr = "(ROOT (S %s (VP (VBZ is) %s) (. .)))" % (
                newsubj_tree, appos_tree)
            new_tree = Tree.fromstring(new_treestr)
            return new_tree
        # check if it is a relative clause
        elif phrase_type == 'SBAR':
            # CONSTRAINTS:
            # fails for relative clauses with adjunct gaps
            # assumes we don't have a subordinate clause - need case for this
            substitution = [main_subject.rstrip()] + parts[1].split()[1:]
            sentence = ' '.join(substitution).rstrip() + '.'
            return sentence
    pass
Example #48
 def fromtree(cls, data, fields, subtrees=False):
     try:
         from nltk.tree import Tree
     except ImportError:
         print('''Please install NLTK:
 $ pip install nltk''')
         raise
     tree = Tree.fromstring(data)
     if subtrees:
         return [
             cls.fromlist([t.leaves(), t.label()], fields)
             for t in tree.subtrees()
         ]
     return cls.fromlist([tree.leaves(), tree.label()], fields)
Example #49
def getParseTreeAnalysis(output):
    parse_tree = output['sentences'][0]['parse']
    tree = ParentedTree.convert(Tree.fromstring(parse_tree))
    #tree.pretty_print()
    rel2 = dict()
    nouns = list()
    for s in tree.subtrees(lambda t: t.label().startswith('NN')
                           or t.label() == 'PRP'):
        rel2.setdefault(s[0], [])
        nouns.append(s)
    for s in nouns:
        values = find_attributes(s, 1, [])
        rel2[s[0]] = values
    print(rel2)
Example #50
def X_tree():
    vocab = ["1", "+", "2", "$UNK"]
    train = [
        "(odd 1)",
        "(even 2)",
        "(odd (pdd 1))",
        "(even (even 2))",
        "(even (odd 1) (neutral (neutral +) (odd 1)))",
        "(odd (odd 1) (neutral (neutral +) (even 2)))",
        "(odd (even 2) (neutral (neutral +) (odd 1)))",
        "(even (even 2) (neutral (neutral +) (even 2)))",
        "(even (odd 1) (neutralB (neutral +) (odd (odd 1) (neutral (neutral +) (even 2)))))"]
    X_train = [Tree.fromstring(x) for x in train]
    return X_train, vocab
Example #51
    def _read(self, file_path):
        with open(file_path) as in_file:
            for line in in_file.readlines():
                if not line:
                    continue

                tree = Tree.fromstring(line)
                sentiment = tree.label()
                if self._binary_sentiment:
                    sentiment = _binarize_sentiment(sentiment)
                    if sentiment is None:
                        continue

                yield self.text_to_instance(tree.leaves(), sentiment)
Example #52
	def __split_sentence(self, sentence):
		nlp = StanfordCoreNLP('http://localhost', port=12331)
		# constituency parse tree
		rootTree = Tree.fromstring(nlp.parse(sentence))
		nlp.close()
		# all phrases can be collected from the subtrees here
		subtrees = rootTree.subtrees()
		phraseSet = set()
		for t in subtrees:
			tleaves = t.leaves()
			if len(tleaves) < 4:
				ele = " ".join(tleaves)
				phraseSet.add(ele)
		return phraseSet
Example #53
    def create(self, corenlp):
        '''
        Parses the raw review string into sentences and tokens, builds a
        constituency parse, and initializes all the variables.
        '''

        assert corenlp is not None

        output = corenlp.annotate(
            self.review_string,
            properties={
                'annotators':
                'tokenize, ssplit, parse',
                'outputFormat':
                'json',
                'parse.model':
                'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz'
            })

        if (type(output) is str
            ):  #TypeError: eval() arg 1 must be a string, bytes or code object
            output = eval(output)
        self.size = len(output['sentences'])

        #organize into 1D and MD(multi-dimensional --> Tree)
        for i in range(self.size):
            tokenized_1D = [
                token_json['word']
                for token_json in output['sentences'][i]['tokens']
            ]
            self.list_tokenized_1D.append(tokenized_1D)

            parsetree = Tree.fromstring(output['sentences'][i]['parse'])
            self.list_NLTK_trees.append(parsetree)  #NLTK Tree objects
            self.list_token_trees.append(map_token_tree(parsetree))  #MD tokens
            self.list_tree_indices.append(
                getTreeIndices(self.list_token_trees[i]))
            self.list_valence_1D.append([])
            self.list_valence_trees.append([])

        #save original as string json
        self.orig_list_token_trees = json.dumps(
            {"Tree": self.list_token_trees})
        self.orig_list_tokenized_1D = json.dumps(
            {"1D": self.list_tokenized_1D})
        self.orig_list_tree_indices = json.dumps(
            {"Tree Indices": self.list_tree_indices})
        self.orig_list_NLTK_trees = [
            tree.copy(deep=True) for tree in self.list_NLTK_trees
        ]
Example #54
 def process_data_file(self, file_path):
     cnt = 0
     with open(file_path, "r") as f:
         for line in f:
             line = line.strip()
             tree = Tree.fromstring(line)
             label = self.label_level(tree.label())
             if label != "neutral":
                 assert len(self.X) == len(self.Y)
                 idx = len(self.X)
                 self.X[idx] = " ".join(tree.leaves())
                 self.Y[idx] = label
                 cnt += 1
     return cnt
Example #55
def extract_phrase(tree_str, label):
    phrases = []
    trees = Tree.fromstring(tree_str)
    for tree in trees:
        #print(tree)
        #print("#########################")
        for subtree in tree.subtrees():
            #print(subtree)
            if subtree.label() == label:
                t = subtree
                t = ' '.join(t.leaves())
                phrases.append(t)

    return phrases
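Hypothetical usage, assuming Tree is imported as in the snippet. The outer loop iterates over the children of the root, so the root's own label is never matched:

tree_str = "(ROOT (S (NP (DT the) (NN cat)) (VP (VBZ chased) (NP (DT a) (NN mouse)))))"
print(extract_phrase(tree_str, 'NP'))  # ['the cat', 'a mouse']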
Example #56
 def get_raw_answer(self, question, answer):
     q_tree = sNLP.parse(question)
     q_tree = Tree.fromstring(str(q_tree))
     a_tree = sNLP.parse(Binary.main(answer))
     a_tree = Tree.fromstring(str(a_tree))
     # res = True
     (q_top_level_structure,
      q_parse_by_structure) = self.get_top_level_structure(q_tree)
     (a_top_level_structure,
      a_parse_by_structure) = self.get_top_level_structure(a_tree)
     for i in range(0, len(q_top_level_structure)):
         q_label = q_top_level_structure[i]
         if q_label in a_top_level_structure:
             a_index = a_top_level_structure.index(q_label)
         else:
             print("label not found")
             return False
         # print "Result:!!!!!", self.partial_matching(q_parse_by_structure[i], a_parse_by_structure[a_index])
         if not self.partial_matching(q_parse_by_structure[i],
                                      a_parse_by_structure[a_index]):
             # print("struct:", q_parse_by_structure[i], a_parse_by_structure[a_index])
             return False
     return True
Example #57
def getspan_fromtree(t: 'str of tree') \
        -> 'span of each tag: dictionary {tag_num: (pos, start, end)}':
    tree = Tree.fromstring(t)
    span = {}
    tag_num = 1
    pl = 0  # current word position
    for i in tree.subtrees():  # for every subtree
        pl = pl + tree.leaves()[pl:].index(
            i.leaves()[0])  # move the word cursor to the subtree's first word
        start = pl + tree.leaves()[pl:].index(i.leaves()[0])  # first position covered by this tag
        end = start + len(i.leaves())  # last position covered (start + number of leaves in the subtree)
        span[tag_num] = (i.label(), start + 1, end)
        tag_num += 1
    return span
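Hypothetical usage; positions are 1-based and the end position is inclusive:

spans = getspan_fromtree("(S (NP (D a) (N cat)) (V sleeps))")
print(spans)
# {1: ('S', 1, 3), 2: ('NP', 1, 2), 3: ('D', 1, 1), 4: ('N', 2, 2), 5: ('V', 3, 3)}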
Example #58
 def main(self, text, parser):
     print(text)
     tree = parser.parse(text)
     tree = Tree.fromstring(str(tree))
     # print tree
     if not self.is_why(tree):
         print("It could not be converted to why question.")
     (top_level_structure, parse_by_structure) = self.remove_SBAR(tree)
     # print top_level_structure
     # print parse_by_structure
     sent = " ".join(parse_by_structure)
     sent = Binary.main(sent, parser)
     print("Why " + sent)
     return ("Why " + sent)
Example #59
 def fromtree(cls, data, fields, subtrees=False):
     try:
         from nltk.tree import Tree
     except ImportError:
         print("Please install NLTK. "
               "See the docs at http://nltk.org for more information.")
         raise
     tree = Tree.fromstring(data)
     if subtrees:
         return [
             cls.fromlist([t.leaves(), t.label()], fields)
             for t in tree.subtrees()
         ]
     return cls.fromlist([tree.leaves(), tree.label()], fields)
Example #60
 def main(self, text, NE, parser):
     tree = parser.parse(text)
     tree = Tree.fromstring(str(tree))
     (top_level_structure,
      parse_by_structure) = Binary.get_top_level_structure(tree)
     np_index = top_level_structure.index("NP")
     if self.is_who(parse_by_structure[np_index], NE):
         parse_by_structure[np_index] = "who"
     else:
         parse_by_structure[np_index] = "what"
     parse_by_structure[-1] = "?"
     sent = " ".join(parse_by_structure)
     print(sent)
     return sent