Example #1
from nltk.corpus.reader import BracketParseCorpusReader
from nltk.tree import ParentedTree
import nltk_tgrep

def test_sentences():
    sentences = []
    original = []
    # `be` (the verb forms of interest) and rewrite_sentences() are defined elsewhere in this project
    reader = BracketParseCorpusReader('', 'test sentences parsed 2.txt')
    for sent in reader.parsed_sents():
        sent = ParentedTree.convert(sent)
        #see if words we care about are in sentence
        copWords = list(set(sent.leaves()) & set(be))
        if copWords:
            for word in copWords:
                nouns = []
                negative = False
                #End goal: get [[nouns], verb, [adjectives], isNegated] for each verb in list
                #Do so by first finding the verb of interest and its position in the tree
                position = sent.leaves().index(word)
                treeposition = sent.leaf_treeposition(position)
                #Want the VP to find the Adjective Predicate which is its child
                newTree = sent[treeposition[:-2]]
                #Search for Adjective(s) below Adjective Predicate
                adj = nltk_tgrep.tgrep_nodes(newTree, 'JJ|VBN|VBG > ADJP')
                if 'not' in newTree.leaves() or 'n\'t' in newTree.leaves():
                    negative = True
                vb = sent[treeposition[:-1]]
                # To find the relevant Noun Phrase, we go up the tree until reaching the lowest sentence node, then back down to NP-SBJ.*
                s = sent[treeposition[:-1]].parent()
                while ('S' not in s.label()):
                    s = s.parent()
                    try:
                        s.label()
                    except AttributeError:
                        break
                #Move one level above VP to find Subject of Verb
                Ns = nltk_tgrep.tgrep_nodes(s, 'NP')
                for N in Ns:
                nouns = nouns + nltk_tgrep.tgrep_nodes(
                    N, 'NN|PRP|NNS|EX|WDT|NNP')  # 'WDT' ('WD' is not a Penn Treebank tag)
                #Moving from lists of parented trees to lists of words
                noun = [x.leaves() for x in nouns]
                noun = [single for [single] in noun]
                noun = list(set(noun))
                adj = [i.leaves() for i in adj]
                adj = [single for [single] in adj]
                adj = list(set(adj))
                #Because our test sentences are all simple, we can just take the first ADJP and NP
                adjp = nltk_tgrep.tgrep_nodes(sent, 'ADJP')[0].leaves()
                np = nltk_tgrep.tgrep_nodes(sent, 'NP')[0].leaves()
                sentences.append([noun, vb, adj, negative, np, adjp])
                original.append(" ".join(sent.leaves()))
    return rewrite_sentences(sentences), original
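For reference, here is a minimal sketch of the kind of tgrep call this function leans on, assuming the standalone nltk_tgrep package (tree-first signature; in recent NLTK the same functionality lives in nltk.tgrep, which takes the pattern first and an iterable of trees). The sentence tree is a made-up example:

from nltk.tree import ParentedTree
import nltk_tgrep

sent = ParentedTree.fromstring(
    '(S (NP-SBJ (PRP He)) (VP (VBZ is) (ADJP (JJ happy))))')
# All JJ nodes whose parent is an ADJP, as in the function above
adjs = nltk_tgrep.tgrep_nodes(sent, 'JJ > ADJP')
print([a.leaves() for a in adjs])  # [['happy']]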
Example #2
    def _count_filter(self, tgrep_pattern, filt):
        # Applies a filter to all the nodes matching a tgrep pattern in a tree
        # and counts those matching the filter. The matches from each tree are
        # summed and returned.
        return sum(
            sum(1
                for _ in filter(filt, tgrep.tgrep_nodes(tree, tgrep_pattern)))
            for tree in self.trees)
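A minimal sketch of the same computation outside the class, with a throwaway list standing in for self.trees and assuming the snippet's tgrep is the standalone nltk_tgrep module:

from nltk.tree import ParentedTree
import nltk_tgrep as tgrep

trees = [ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBZ barks)))'),
         ParentedTree.fromstring('(S (NP (PRP It)) (VP (VBZ runs)))')]
filt = lambda t: len(t.leaves()) >= 2  # hypothetical filter: NPs spanning 2+ leaves
total = sum(
    sum(1 for _ in filter(filt, tgrep.tgrep_nodes(tree, 'NP')))
    for tree in trees)
print(total)  # 1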
Example #3
def count_occurrences(tree, pattern, constituent_filter):
    """Take a tree, a desired search pattern and a filter and return a count.

    Args:
        tree (ParentedTree): a ParentedTree of a sentence
        pattern (str): a tgrep pattern to search for
        constituent_filter (callable): a filter condition based on desired properties

    Returns:
        int: the count of matching constituents

    We take a tree and search for all matches in it using tgrep (tree grep),
    then we remove all matches that don't pass our filter. For S, VP and NP we
    set the filter to None; for IVP and DVP we set the appropriate conditions.
    """
    matches = tgrep.tgrep_nodes(tree, pattern)  # find all items in the tree that match our search pattern
    constituents = list(filter(constituent_filter, matches))  # remove whatever doesn't pass our filter
    return len(constituents)
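A quick usage sketch with a made-up sentence tree, assuming the module the function calls tgrep is the standalone nltk_tgrep package:

from nltk.tree import ParentedTree
import nltk_tgrep as tgrep

tree = ParentedTree.fromstring(
    '(S (NP (DT the) (NN dog)) (VP (VBZ sees) (NP (PRP me))))')
print(count_occurrences(tree, 'NP', None))                           # 2
print(count_occurrences(tree, 'NP', lambda t: len(t.leaves()) > 1))  # 1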
Example #4
from nltk.tree import ParentedTree
import nltk_tgrep

def terminal_nodes(tree_str, pattern):
    """
    tree_str: the string of a phrase-structure parse tree
    pattern: the tgrep pattern
    return: a list of strings, each joining the terminal nodes (leaves) of one match
    """
    assert isinstance(tree_str, str)
    assert isinstance(pattern, str)
    try:
        tree = ParentedTree.fromstring(tree_str)
    except Exception as e:
        print('error in constructing tree:', e)
        raise
    else:
        res = nltk_tgrep.tgrep_nodes(tree, pattern)
        res_str = [' '.join(t.leaves()) for t in res]
        return res_str
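A quick usage sketch (the bracketed parse is a made-up example):

tree_str = '(S (NP (DT the) (JJ old) (NN dog)) (VP (VBZ sleeps)))'
print(terminal_nodes(tree_str, 'NP'))  # ['the old dog']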
Example #5
import nltk
import nltk_tgrep

def _tgrep_count_and_lengths(tree, pattern):
    """Number and lengths of matching constituents"""
    result = nltk_tgrep.tgrep_nodes(tree, pattern)
    result = [r for r in result if isinstance(r, nltk.tree.ParentedTree)]
    lengths = [len(r.leaves()) for r in result]
    return len(result), lengths
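A minimal sketch of calling it, again with a made-up tree:

from nltk.tree import ParentedTree

tree = ParentedTree.fromstring(
    '(S (NP (DT a) (NN cat)) (VP (VBZ sits) (PP (IN on) (NP (DT a) (NN mat)))))')
count, lengths = _tgrep_count_and_lengths(tree, 'NP')
print(count, lengths)  # 2 [2, 2]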
Example #6
    def aplicar_regras_sint(self, lista, arvore):
        '''Applies syntactic rules to the tree.
        '''
        p_arvore = ParentedTree.convert(arvore)
        self.adaptar_regras_morfo_arvore(lista, p_arvore)
        for morpho in self.__root.findall('syntactic'):
            for rule in morpho.findall('rule'):  # look for the rule tag
                nome_regra = self.corrigir_nome_regra(rule.get('name'))
                regra = self.separar_regra(nome_regra)
                node_pai = tgrep_nodes(p_arvore, regra[0], search_leaves=False)
                if node_pai and rule.find('active').text == "true":
                    node_pai = node_pai[0]
                    node_regra = tgrep_nodes(node_pai,
                                             regra[1].replace('$', '..'),
                                             search_leaves=False)
                    if node_regra:
                        node_esq_pos = tgrep_positions(node_pai,
                                                       regra[1],
                                                       search_leaves=False)
                        node_dir_pos = tgrep_positions(node_pai,
                                                       regra[2],
                                                       search_leaves=False)
                        if node_esq_pos and node_dir_pos:
                            #print "REGRA SINTÁTICA ENCONTRADA: " + rule.get('name')
                            nodes_positions = node_esq_pos + node_dir_pos
                            self.count = -1
                            self.has_rule = True

                            count_temp = -1
                            for classe in rule.findall('class'):
                                count_temp += 1
                                leaves = node_pai[
                                    nodes_positions[count_temp]].leaves()
                                token = list(filter(None, leaves))[0]  # list() for Python 3
                                specific = classe.find('specific')
                                if specific is not None:
                                    result_specific = self.__especificos[
                                        specific.text](token)
                                    if result_specific is False:
                                        self.has_rule = False

                            if self.has_rule is False:
                                #print "REGRA SINTÁTICA " + rule.get('name') + " INVÁLIDA. PROCURANDO OUTRA..."
                                break

                            nodes_deleted = []

                            for classe in rule.iter('class'):
                                action = classe.find('action')
                                newprop = classe.find('newprop')
                                title_text = classe.find('title').text

                                self.count += 1

                                if action is not None:
                                    action_text = action.text

                                    if action_text == "remove":
                                        pos_del = nodes_positions[self.count]
                                        nodes_deleted.append(node_pai[pos_del])
                                        node_pai[pos_del] = None
                                        continue

                                    elif action_text == "invert":
                                        aux1 = node_pai[nodes_positions[
                                            self.count]]
                                        aux2 = node_pai[nodes_positions[
                                            self.count + 1]]
                                        node_pai[nodes_positions[
                                            self.count]] = None
                                        node_pai[nodes_positions[self.count +
                                                                 1]] = None
                                        node_pai[nodes_positions[
                                            self.count]] = aux2
                                        node_pai[nodes_positions[self.count +
                                                                 1]] = aux1

                                    elif action_text == "concate_intens":
                                        if title_text == "ADV-R":
                                            node_prev = nodes_deleted.pop()
                                            label_prev = node_prev[0][0].label()
                                            token_prev = list(filter(
                                                None, node_prev.leaves()))[0]
                                            token = list(filter(
                                                None, node_pai[nodes_positions[
                                                    count_temp]].leaves()))[0]
                                            specific = classe.find('specific')
                                            result_specific = self.get_adv_intensidade(
                                                token)
                                            token_concate = result_specific + "_" + token_prev
                                            node_pai[
                                                nodes_positions[count_temp]][
                                                    0][0][0] = token_concate
                                            newprop = ""
                                            if label_prev[:-2] == "VB":
                                                newprop = "VBi"
                                            elif label_prev[:-3] == "ADJ":
                                                newprop = "ADJi"
                                            node_pai[nodes_positions[
                                                count_temp]][0][0].set_label(
                                                    newprop)

                                        else:
                                            token_prev = list(filter(
                                                None,
                                                nodes_deleted.pop().leaves()))[0]
                                            token_prev_specific = self.get_adv_intensidade(
                                                token_prev)
                                            token = list(filter(
                                                None, node_pai[nodes_positions[
                                                    count_temp]].leaves()))[0]
                                            token_concate = token_prev_specific + "_" + token
                                            node_pai[
                                                nodes_positions[count_temp]][
                                                    0][0][0] = token_concate
                                            node_pai[nodes_positions[
                                                count_temp]][0][0].set_label(
                                                    newprop.text)

                                    elif action_text == "concate_neg":
                                        token = list(filter(
                                            None, node_pai[nodes_positions[
                                                count_temp]].leaves()))[0]
                                        token_concate = token + "_não"
                                        node_pai[nodes_positions[count_temp]][
                                            0][0][0] = token_concate
                                        # TODO: DO WE NEED TO ADD NEWPROP?

                                if newprop is not None:
                                    node_pai[nodes_positions[
                                        self.count]].set_label(newprop.text)

                                break
        return self.converter_arv_para_lista(p_arvore)
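The tgrep_nodes/tgrep_positions pair used above comes from the standalone nltk_tgrep package (tree-first signature, as in this snippet). A minimal sketch of the difference, with search_leaves=False restricting matches to internal nodes:

from nltk.tree import ParentedTree
from nltk_tgrep import tgrep_nodes, tgrep_positions

tree = ParentedTree.fromstring('(S (NP (NN dog)) (VP (VBZ barks)))')
# tgrep_nodes returns the matching subtrees themselves...
print(tgrep_nodes(tree, 'NP', search_leaves=False))      # [ParentedTree('NP', ...)]
# ...while tgrep_positions returns their positions in the tree (index tuples)
print(tgrep_positions(tree, 'NP', search_leaves=False))  # [(0,)]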
Example #7
# reader and allv (presumably the verb forms of interest) are defined earlier in this script
for sent in reader.parsed_sents():
    sent = ParentedTree.convert(sent)
    #see if words we care about are in sentence
    copWords = list(set(sent.leaves()) & set(allv))
    if copWords:
        for word in copWords:
            nouns = []
            #End goal: get [[nouns], verb, [adjectives]] for each verb in list
            #Do so by first finding the verb of interest and its position in the tree
            position = sent.leaves().index(word)
            treeposition = sent.leaf_treeposition(position)
            #Want the VP to find the Adjective Predicate which is its child
            newTree = sent[treeposition[:-2]]
            #Search for Adjective(s) below Adjective Predicate
            adj = nltk_tgrep.tgrep_nodes(newTree, 'JJ|VBN|VBG|JJR > ADJP-PRD')
            vb = sent[treeposition[:-1]]
            # To find the relevant Noun Phrase, we go up the tree until reaching the lowest sentence node, then back down to NP-SBJ.*
            s = sent[treeposition[:-1]].parent()
            while ('S' not in s.label()):
                s = s.parent()
                try:
                    s.label()
                except AttributeError:
                    break
            #Move one level above VP to find Subject of Verb
            Ns = nltk_tgrep.tgrep_nodes(s, 'NP-SBJ|NP-SBJ-1|NP-SBJ-2')
            for N in Ns:
                nouns = nouns + nltk_tgrep.tgrep_nodes(N, 'NN|PRP|NNS|EX|WDT')
            #Moving from lists of parented trees to lists of words
            noun = [x.leaves() for x in nouns]
Example #8
    def _count(self, tgrep_pattern):
        # Sums the number of nodes matching a tgrep pattern over all trees.
        return sum(
            len(tgrep.tgrep_nodes(tree, tgrep_pattern)) for tree in self.trees)