Example #1
def process_sentence(word_array, results):
    txt = ' '.join(word_array).replace('\n', '')
    tree = next(parser.raw_parse(txt))
    tree = ParentedTree.convert(tree)
    leaf_values = tree.leaves()

    if len(leaf_values) != len(word_array):
        print('This should not happen: parser tokens differ from the input words')

    # enumerate() avoids list.index(), which returns the first occurrence
    # and mis-locates repeated tokens
    for leaf_index, token in enumerate(leaf_values):
        token_count = leaf_index + 1
        tree_location = tree.leaf_treeposition(leaf_index)
        depth = len(tree_location)
        parent = tree[tree_location[0:(depth - 1)]]
        trace, POS_stanf = compute_total_trace(parent)
        # story and sentence are assumed to be module-level globals here
        df = pd.DataFrame([{
            'trace': trace,
            'POS_stanf': POS_stanf,
            'cd_idx': story,
            'sentence_count': sentence,
            'token_count': token_count,
            'token': token
        }])
        # DataFrame.append() was removed in pandas 2.0; use pd.concat()
        results = pd.concat([results, df], ignore_index=True)
    return str(tree), results
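
A minimal sketch (standard NLTK API, toy tree) of the leaf_treeposition() indexing that Example #1 relies on: each leaf index maps to a tuple path into the tree, and slicing off the last step addresses the leaf's parent node.

from nltk.tree import ParentedTree

t = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
for i, leaf in enumerate(t.leaves()):
    pos = t.leaf_treeposition(i)      # e.g. (0, 1, 0) for 'dog'
    parent = t[pos[:-1]]              # slice off the last step to reach the POS node
    print(leaf, pos, parent.label())  # dog (0, 1, 0) NN
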
Example #2
def getSentParses(sentence):

    if not isinstance(sentence, str) or len(sentence.split()) <= 1: return []

    #Convert sentence into Stanford-parsed tree
    sentence = ParentedTree.convert(next(parser.raw_parse(sentence)))

    #Split sentences if they contain multiple full sentences separated by ';', etc.
    sentences = []
    if (sentence[0].label() == 'S') and (sentence[0,0].label() == 'S'):
        for i in range(len(sentence[0])):
            sentences += [sentence[0,i]]
    else:
        for i in range(len(sentence)):
            sentences += [sentence[i]]

    #Obtain desired tuple relations
    parsedSents = []
    for sentence in sentences:
        print "Current subsentence", sentence.leaves()
        parsedSents += [getPrepParse(sentence)]
        parsedSents += [getSVBroadParse(sentence)]

    #Basic stupid coreferencing
    defaultSet = False
    for parsedSent in parsedSents:
        if len(parsedSent) == 0: continue
        if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() != 'PRP':
            default = parsedSent[1]
            defaultSet = True
        if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() == 'PRP' and defaultSet:
            parsedSent[1] = default

    return parsedSents
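
The `sentence[0]` / `sentence[0, 0]` indexing above uses NLTK's tuple paths: an integer selects a child, a tuple descends several levels. A toy illustration:

from nltk.tree import ParentedTree

t = ParentedTree.fromstring('(ROOT (S (NP (PRP I)) (VP (VBP run))))')
print(t[0].label())     # S   -- first child of ROOT
print(t[0, 0].label())  # NP  -- grandchild, two levels down
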
Example #3
    def gerar_no(self, s):
        '''Builds an NLTK ParentedTree from the received string.
        '''
        all_ptrees = []  # built below but never used by this method
        t_string = '(' + s[1] + ' ' + s[0] + ')'
        ptree = ParentedTree.convert(Tree.fromstring(t_string))
        all_ptrees.extend(t for t in ptree.subtrees() if isinstance(t, Tree))
        return ptree
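
A quick sketch of why the convert() step matters: a plain Tree node has no parent() method, while the converted ParentedTree tracks its position in the whole tree.

from nltk.tree import Tree, ParentedTree

t = Tree.fromstring('(NP (DT the) (NN dog))')
pt = ParentedTree.convert(t)
print(pt[0].parent().label())  # NP -- parent links exist only after conversion
print(pt[0].treeposition())    # (0,)
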
Example #4
    def ptph(self, rel):
        ptree = ParentedTree.convert(rel.parse_tree)
        # print(ptree.pprint())
        arg1_tokens = rel.get_arg1_tokens()
        arg1_words = self.get_words(arg1_tokens)
        arg2_tokens = rel.get_arg2_tokens()
        arg2_words = self.get_words(arg2_tokens)
        return "ptp={0}".format(self.find_path(ptree, arg1_words, arg2_words))
Example #5
    def aplicar_regras_sint(self, lista, arvore):
        '''Applies syntactic rules to the tree.
        '''
        p_arvore = ParentedTree.convert(arvore)
        self.adaptar_regras_morfo_arvore(lista, p_arvore)
        for morpho in self.__root.findall('syntactic'):
            for rule in morpho.findall('rule'):  # look for the rule tag
                nome_regra = self.corrigir_nome_regra(rule.get('name'))
                regra = self.separar_regra(nome_regra)
                node_pai = tgrep_nodes(p_arvore, regra[0], search_leaves=False)
                if node_pai and rule.find('active').text == "true":
                    node_pai = node_pai[0]
                    node_regra = tgrep_nodes(node_pai,
                                             regra[1].replace('$', '..'),
                                             search_leaves=False)
                    if node_regra:
                        node_esq_pos = tgrep_positions(node_pai,
                                                       regra[1],
                                                       search_leaves=False)
                        node_dir_pos = tgrep_positions(node_pai,
                                                       regra[2],
                                                       search_leaves=False)
                        if node_esq_pos and node_dir_pos:
                            #print "REGRA SINTÁTICA ENCONTRADA: " + rule.get('name')
                            nodes_positions = node_esq_pos + node_dir_pos
                            self.count = -1
                            self.has_rule = True

                            count_temp = -1
                            for classe in rule.findall('class'):
                                count_temp += 1
                                leaves = node_pai[nodes_positions[count_temp]].leaves()
                                token = [l for l in leaves if l][0]
                                specific = classe.find('specific')
                                if specific is not None:
                                    result_specific = self.__especificos[specific.text](token)
                                    if result_specific is False:
                                        self.has_rule = False

                            if self.has_rule is False:
                                #print "REGRA SINTÁTICA " + rule.get('name') + " INVÁLIDA. PROCURANDO OUTRA..."
                                break

                            nodes_deleted = []

                            for classe in rule.iter('class'):
                                action = classe.find('action')
                                newprop = classe.find('newprop')
                                title_text = classe.find('title').text

                                self.count += 1

                                if action is not None:
                                    action_text = action.text

                                    if action_text == "remove":
                                        pos_del = nodes_positions[self.count]
                                        nodes_deleted.append(node_pai[pos_del])
                                        node_pai[pos_del] = None
                                        continue

                                    elif action_text == "invert":
                                        aux1 = node_pai[nodes_positions[
                                            self.count]]
                                        aux2 = node_pai[nodes_positions[
                                            self.count + 1]]
                                        node_pai[nodes_positions[
                                            self.count]] = None
                                        node_pai[nodes_positions[self.count +
                                                                 1]] = None
                                        node_pai[nodes_positions[
                                            self.count]] = aux2
                                        node_pai[nodes_positions[self.count +
                                                                 1]] = aux1

                                    elif action_text == "concate_intens":
                                        if title_text == "ADV-R":
                                            node_prev = nodes_deleted.pop()
                                            label_prev = node_prev[0][0].label()
                                            # take the first non-empty leaf
                                            token_prev = [l for l in node_prev.leaves() if l][0]
                                            token = [l for l in node_pai[nodes_positions[count_temp]].leaves() if l][0]
                                            specific = classe.find('specific')
                                            result_specific = self.get_adv_intensidade(token)
                                            token_concate = result_specific + "_" + token_prev
                                            node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                            newprop = ""
                                            if label_prev[:2] == "VB":
                                                newprop = "VBi"
                                            elif label_prev[:3] == "ADJ":
                                                newprop = "ADJi"
                                            node_pai[nodes_positions[count_temp]][0][0].set_label(newprop)

                                        else:
                                            token_prev = [l for l in nodes_deleted.pop().leaves() if l][0]
                                            token_prev_specific = self.get_adv_intensidade(token_prev)
                                            token = [l for l in node_pai[nodes_positions[count_temp]].leaves() if l][0]
                                            token_concate = token_prev_specific + "_" + token
                                            node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                            node_pai[nodes_positions[count_temp]][0][0].set_label(newprop.text)

                                    elif action_text == "concate_neg":
                                        token = [l for l in node_pai[nodes_positions[count_temp]].leaves() if l][0]
                                        token_concate = token + "_não"
                                        node_pai[nodes_positions[count_temp]][0][0][0] = token_concate
                                        # TODO: does this need a newprop as well?

                                if newprop is not None:
                                    node_pai[nodes_positions[self.count]].set_label(newprop.text)

                                break
        return self.converter_arv_para_lista(p_arvore)
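
The tgrep_nodes / tgrep_positions calls above take the tree first, so they appear to be local wrappers; NLTK's own nltk.tgrep module takes the pattern first and a list of ParentedTrees. A minimal sketch of the stock API on a toy tree:

from nltk.tree import ParentedTree
from nltk.tgrep import tgrep_nodes, tgrep_positions

t = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
for hits in tgrep_nodes('NN', [t]):            # one result list per input tree
    print([n.label() for n in hits])           # ['NN']
for positions in tgrep_positions('NP < DT', [t]):
    print(positions)                           # [(0,)] -- NP immediately dominating a DT
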
Example #6
    def parse_constituents(self):
        self.constituent_tree = list(
            ParentedTree.convert(next(Sentence.parser.parse(self.sentence))))
        return self.constituent_tree
Example #7
def find_pronouns(tree):
    pronouns = []
    for child in tree:
        if isinstance(child, str) and child.lower() in PRONOUNS:
            pronouns.append((child.lower(), None))

        if isinstance(child, ParentedTree):
            pronouns = pronouns + find_pronouns(child)

    return pronouns

total = 0
# stats and files are assumed to be initialized earlier in the script
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
            stats['pct_gendered'] = stats['gendered']/float(stats['total'])
    print(file, total)


    files.append(stats.copy())
    stats = dict.fromkeys(stats, 0)
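
A toy run of find_pronouns above (the PRONOUNS set isn't shown in the original, so a small stand-in is assumed):

from nltk.tree import ParentedTree

PRONOUNS = {'he', 'she', 'it', 'they'}  # hypothetical stand-in for the original set

tree = ParentedTree.fromstring('(S (NP (PRP He)) (VP (VBD slept)))')
print(find_pronouns(tree))              # [('he', None)]
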

Example #8
def verbFrames(treebank=ptbS, verbforms=verbforms, verbdict=verbdict):
    treeIndex = 0
    allLabelLeaves = []

    # Trees 0-571 are sentences related to air travel inquiries (formulaic)
    # and in a slightly different tree format
    for tree in treebank[572:]:
        tr = ParentedTree.convert(tree)
        treeStr = joinLeaves(tr)
        for st in tr.subtrees():
            # Flags for annotating slifting / quotative inversion
            # and expletive it
            vSeen = 0
            itSeen = 0
            stleaf = joinLeaves(st)

            if (stleaf in verbforms and st.label()[0] == 'V'
                    and st.label() != 'VP'):

                s = dict()
                s['treeIndex'] = treeIndex
                s['s'] = treeStr
                s['parentLab'] = st.parent().label().split("-")[0]
                s['verbLab'] = st.label()
                s['verbLemma'] = verbdict[stleaf]
                # These are all the sisters of the verb
                s['verbSynEnvFull'] = ''
                s['verbSynEnvFullStr'] = ''
                # These are the sisters of the verbs, after eliminating what looks like adjuncts or corrections
                # Therefore: likely to be subcategorization frames
                s['verbSubcat'] = ''
                s['verbSubcatStr'] = ''
                # If there is a clause: what kind of clause is it? S, SBAR, ...?
                s['sType'] = ''
                # What kind of "head" does the clause have? Is it interrogative, declarative, has a "that" or "if"...?
                s['clauseHead'] = ''
                # What is the label of the highest verb? A proxy for finiteness
                s['embVerbLabel'] = ''
                # What is the highest verb?
                s['embVerb'] = ''

                for d in st.parent():
                    # Update flags
                    if d.label()[0] == "V":
                        vSeen = 1
                    if joinLeaves(d) == "it":
                        itSeen = 1
                    # Generate output
                    s['verbSynEnvFull'] += d.label() + "~~"
                    s['verbSynEnvFullStr'] += joinLeaves(d) + "~~"

                    # Generate an "abbreviated" output without certain adjuncts
                    if ("MNR" not in d.label() and "TMP" not in d.label() and
                            "TPC" not in d.label()  # 'as NP' 'in light of NP'
                            and "PRP" not in d.label()
                            and "ADV" not in d.label()
                            and "LOC" not in d.label()
                            and "EDITED" not in d.label()
                            and "$" not in d.label() and "NAC" not in d.label(
                            )  # asides and hedges like "but not very much"
                            and "CC" not in d.label()  # 'but/and'
                            and "INTJ" not in d.label()  # 'God!'
                            and "PRN" not in d.label()  # 'you know'
                            and "SEZ" not in d.label()  # 'you know'
                            and "RB"
                            not in d.label()  # '... *well* convinced that..
                            and d.label() != "-DFL"  # uh, you know, ...
                            and d.label() not in [",", ":", "''", "."]):
                        if d.label()[:2] == 'VB':
                            s['verbSubcat'] += 'VB' + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"

                        elif d.label()[:2] == 'NP':
                            s['verbSubcat'] += 'NP' + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"

                        elif d.label()[0] == "S":
                            # Add flags
                            s['verbSubcat'] += d.label().split('-')[0] + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"

                            if ((joinLeaves(d)[:4] == "*T*-" and len(d) == 1)
                                    or
                                (len(d) == 2 and joinLeaves(d[1])[0] == "0"
                                 and joinLeaves(d[1])[:4] == "*T*-")):
                                # Slifting
                                s['verbSubcat'] += "sl"
                            if (itSeen == 1):
                                # "it" expletive
                                s['verbSubcat'] += "it"

                            # if S is not a trace or dominating a single lex item
                            if len(d) > 1:

                                stUse = checkConjoined(d, 'S')

                                # Identify the type of clause, first-pass
                                s['clauseHead'] = getClauseHead(stUse)
                                s['sType'] = stUse.label()
                                # Mark the sentence for finiteness -- get a list of verbs and their labels
                                verbLabs, verbStrs = getFinite(stUse, [], [])

                                if len(verbLabs) > 0:
                                    if (verbLabs[0] in ['MD', 'TO', 'BES']
                                            or verbLabs[0][0] == 'V'):
                                        s['embVerbLabel'] = verbLabs[0]
                                    s['embVerb'] = verbStrs[0]

                        else:
                            s['verbSubcat'] += d.label() + "~~"
                            s['verbSubcatStr'] += joinLeaves(d) + "~~"

                allLabelLeaves.append(s)
        treeIndex += 1
    return allLabelLeaves
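
joinLeaves, checkConjoined, getClauseHead, and getFinite are project-specific helpers this snippet doesn't include. A plausible minimal stand-in for joinLeaves, assuming it just concatenates a subtree's leaf tokens:

def joinLeaves(t):
    # hypothetical stand-in: join a subtree's leaf tokens with spaces
    return ' '.join(t.leaves())
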
Example #9
def get_basic_graph(tree, strategy):
    """
    Convert a phrase-structure tree to a basic graph, without the ellipsis edges.
    """

    t = ParentedTree.fromstring(tree)

    if strategy in ["end-extra-node", "start-end-extra-node", "start-end-extra-node-heuristic"]:
        t = remove_extra_nodes(t)
        t = ParentedTree.convert(t)

    graph = []
    tree_positions = {}
    parent_clauses = {}

    start_index = 0
    end_index = 0

    for index, st in enumerate(t.subtrees()):
        tree_positions[st.treeposition()] = index # keep track of indexes & tree positions
        node = {}
        node["id"] = index
        node["children"] = []
        node["parent"] = tree_positions[st.parent().treeposition()] if st.parent() != None else 0
        node["ellipsed_parents"] = []
        const_tag, start_tags, end_tags = split_tag(st.label())
        # assign indexes for start and end tags if they don't have any (heuristic)
        if const_tag == "CL":
            start_index = 0
        if strategy == "start-end-extra-node-heuristic":
#        for tag_i, tag in enumerate(start_tags):
#            if tag == "":
#                start_tags[tag_i] = start_index
#                start_index += 1
#        for tag_i, tag in enumerate(end_tags):
#            if tag == "":
#                end_tags[tag_i] = end_index
#                end_index += 1
            for tag_i, tag in enumerate(start_tags):
                start_tags[tag_i] = start_index
                start_index += 1
            for tag_i, tag in enumerate(end_tags):
                end_tags[tag_i] = end_index
            if len(end_tags) > 0:
                end_index += 1
        node["tag"] = const_tag
        node["start_tags"] = start_tags
        node["end_tags"] = end_tags
        if st.height() == 2:
            node["terminal"] = "yes"
            node["text"] = st.leaves()[0]
        else:
            node["terminal"] = "no"
            node["text"] = ""
        if node["tag"] == "CL":
            for child in st.subtrees():
                parent_clauses[child.treeposition()] = index
        graph.append(node) 

    # keep track of the parent clause for each node
    parent_clauses = {tree_positions[pos]:parent_clauses[pos] for pos in parent_clauses}

    # assign CLX as the parent clause for nodes which don't have a CL parent
    for node in graph:
        if node["id"] in parent_clauses:
            node["parent_clause"] = parent_clauses[node["id"]]
        else:
            node["parent_clause"] = 0

    return graph
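
The tree_positions bookkeeping above works because treeposition() is a stable, hashable key for every subtree, enumerated in pre-order. A toy illustration:

from nltk.tree import ParentedTree

t = ParentedTree.fromstring('(S (NP (NN dogs)) (VP (VBP bark)))')
for index, st in enumerate(t.subtrees()):
    print(index, st.treeposition(), st.label())
# 0 () S
# 1 (0,) NP
# 2 (0, 0) NN
# 3 (1,) VP
# 4 (1, 0) VBP
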
Example #10
def get_patterns(tree, types):
    """ Types: a set of patterns types to be included in the program. """
    #print(tree)
    if isinstance(tree, str):
        tree = prolog_parse(tree)
        if tree is None:
            return
    tree = ParentedTree.convert(tree)

    # split clauses with "or" into several clauses
    # (select only clauses with proper name)
    clauses = [clause for clause in tree]

    while True:
        for ci, c in enumerate(clauses):
            c1 = split_or(c)
            if len(c1) == 2:
                clauses[ci:ci + 1] = c1
                break
        else:
            break

    # check whether there is a cut in clause
    cuts, cut = [], False
    for clause in clauses:
        cuts.append(cut)
        if has_cut(clause):
            cut = True

    # duplicate clauses: add original and normalized clause
    #clauses = [(clause, "", cuts[i]) for i, clause in enumerate(clauses)] + \
    #          [(normalize(clause), "norm ", cuts[i]) for i, clause in enumerate(clauses)]
    #clauses = [(normalize(clause, full=False), "", cuts[i]) for i, clause in enumerate(clauses)] + \
    #          [(normalize(clause, full=True), "norm ", cuts[i]) for i, clause in enumerate(clauses)]
    #clauses = [(normalize(clause), "", cuts[i]) for i, clause in enumerate(clauses)]

    #clauses = [(clause, "", False) for clause in clauses]
    clauses = [(normalize(clause, full=False), "", cuts[i])
               for i, clause in enumerate(clauses)]

    # get patterns separately for each clause
    for clause, prefix, cut in clauses:
        # collect variable nodes in this clause
        variables = collections.defaultdict(list)
        for node in clause.subtrees():
            if isinstance(node, Tree) and node.label() == 'variable':
                name = node[0].val
                variables[name].append(node)

        if "all" in types or "singleton" in types:
            # yield patterns for singleton variables
            for var, nodes in variables.items():
                if len(nodes) == 1:
                    #yield 'has_singleton', nodes
                    pat = pattern(clause, nodes)
                    if pat:
                        yield prefix + pat, nodes
                        if cut:
                            yield "cut " + prefix + pat, nodes

        if "all" in types or "var_pairs" in types:
            # yield patterns for variable-variable pairs (within a clause)
            for var, nodes in variables.items():
                for selected in combinations(nodes, 2):
                    pat = pattern(clause, selected)
                    if pat:
                        yield prefix + pat, selected
                        if cut:
                            yield "cut " + prefix + pat, selected
        """if "all" in types or "alt_vars" in types:
            # yield patterns for variable-variable + variable-variable
            # pairs/pairs (within a clause)
            combs = []
            for var, nodes in variables.items():
                combs.extend(combinations(nodes, 2))

            for selected in combinations(combs, 2):
                if selected[0][0] == selected[1][0]:
                    continue
                selected = selected[0] + selected[1]
                pat = pattern(clause, selected)
                if pat:
                    yield prefix + pat, selected
                    if cut:
                        yield "cut " + prefix + pat, selected"""

        # yield patterns for variable-literal / literal-literal pairs
        # yield patterns for singleton literals
        # (only within a topmost compound / binop / unop)
        def patterns_with_literals(node):
            if not isinstance(node, Tree):
                return
            if node.label() in {'compound', 'binop', 'unop'}:
                vars = [n for n in node.subtrees() if n.label() == 'variable']
                lits = [n for n in node.subtrees() if n.label() == 'literal']
                names = [
                    n for n in node.leaves() if isinstance(n, Token)
                    and n.type == 'NAME' and n.val == 'nil'
                ]
                lits = lits + names
                for selected in chain(combinations(lits, 1),
                                      combinations(lits, 2),
                                      product(lits, vars)):
                    pat = pattern(clause, selected)
                    if pat:
                        yield prefix + pat, selected
            else:
                for child in node:
                    yield from patterns_with_literals(child)

        if "all" in types or "literal_pairs" in types:
            yield from patterns_with_literals(clause)
        """if "all" in types or "names" in types:
Example #11
def generateTree(sentence):
    # raw_parse returns an iterator of parses; keep the last (usually only) one
    tree = None
    for sub in PARSER.raw_parse(sentence):
        tree = sub
    return ParentedTree.convert(tree)
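
A usage sketch, assuming PARSER is something like NLTK's CoreNLPParser talking to a local CoreNLP server (the original doesn't show how PARSER is constructed):

from nltk.parse.corenlp import CoreNLPParser

PARSER = CoreNLPParser(url='http://localhost:9000')  # assumes a running CoreNLP server
tree = generateTree('The dog barked.')
print(tree.label(), tree.leaves())
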
Example #12
    def extract_constraints_from_sentence(self, sentence):
        self.sentence = sentence  #sentence.split()
        self.tree = self.make_tree_from_sent()
        self.tree = ParentedTree.convert(self.tree)
        #self.time_vec = time_vec

        # TODO: initialization of constraints should be improved
        Constraint = [[0] * 3 for _ in range(100)]
        con_id = -1
        conE_id = -1
        s_val = 0
        e_val = 0
        active_con = []
        active_conE = []
        active_conS = []
        list_to_omit = []
        GO_all = []
        Constraints = []
        i = 1
        j = 1
        GO_list = []
        GO_listE = []
        PickPlaceAction = 0
        GroupActionAction = 0
        stop = 0
        then_active = 0
        #self.poses = poses
        listOVCcur = []
        objects_list = []

        #time_vec_poses = self.extract_time_poses()
        #print ('all_poses_time_vec',self.time_vec_poses)
        #print ('time_vec',self.time_vec)

        # TODO: handle sentences that contain more than just grouped actions
        for objects in self.tree.subtrees(filter=lambda t: t.label() == 'AG'):
            print('adding new grouped action')
            print(objects)

        for objects in self.tree.subtrees(filter=lambda t: t.label() == 'GR'):
            print('adding new general rule')
            print(objects)

        # if the sentence describes storage for objects
        for objects in self.tree.subtrees(filter=lambda t: t.label() == 'HS'):
            print('home storage')

        #extracting constraints and actions from the bag file
        for objects in self.tree.subtrees(
                filter=lambda t: t.label() in ('O', 'REL', 'STOP')):
            i = i + 1
            objects_list.append(objects)
            print(objects)

            if objects.label() == 'O':
                O_ID = j
                j = j + 1
                print('sentence ID', O_ID)

            # to create nested relations, we keep a list of active constraints that are not yet finished (active_con)
            if objects.label() == 'REL':
                if 'Then' in objects.leaves() or 'then' in objects.leaves():
                    print('Then')

            if objects.label() == 'STOP':
                stop = 1
                print('stop')

        return objects_list
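
A toy sketch of the subtrees(filter=...) idiom this example leans on (labels invented for illustration):

from nltk.tree import ParentedTree

t = ParentedTree.fromstring('(S (O (NN cup)) (REL (RB then)) (O (NN plate)))')
for st in t.subtrees(filter=lambda x: x.label() in ('O', 'REL')):
    print(st.label(), st.leaves())
# O ['cup']
# REL ['then']
# O ['plate']
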
    def parent_tree(self):
        return ParentedTree.convert(self)
def main():
    answers = open('coref_key.txt', 'r')
    this_correct = 0
    correct = 0
    total = 0
    prev_sentences = deque()
    for file in FILENAMES:
        this_correct = 0
        this_total = 0
        prev_sentences.clear()
        for tree in treebank.parsed_sents(file):


            tree = ParentedTree.convert(tree)

            for pronoun, np_node in find_pronouns(tree):

                # i = 0
                # for t in list(prev_sentences)[-3:]:
                #     t.pretty_print()
                #     print("-"*25)
                #     i = i + 1
                #     if i == 3: break
                proposed = hobbs_to_string(hobbs(np_node, pronoun.lower(), prev_sentences))
                tree.pretty_print()

                actual = answers.readline()

                if proposed == actual[:-1]:  # strip the trailing newline
                    update_pronoun_results(pronoun, 1)
                    correct += 1
                    this_correct += 1

                update_pronoun_results(pronoun, 0)
                total += 1
                this_total += 1

                print "Pronoun: '" + pronoun + "'   Proposed: '" + proposed + "'   Actual: '" + actual + "'"

                if total: print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"


                print("*"*100)
                print("*"*100)
            prev_sentences.append(tree)
        print("-"*50)
        if this_correct: print(file, ":\tCorrect:", this_correct, "\tTotal:", this_total, "\tPercentage:", this_correct/float(this_total), "\n")
        if total: print("Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n")
        print("-"*50)

    print "Male correct:", PRONOUN_RESULTS['male'], "\tMale total:", PRONOUN_RESULTS['male_total'], "\tPercent correct:", PRONOUN_RESULTS['male_pct']
    print "Female correct:", PRONOUN_RESULTS['female'], "\tFemale total:", PRONOUN_RESULTS['female_total'], "\tPercent correct:", PRONOUN_RESULTS['female_pct']
    print "Neutral correct:", PRONOUN_RESULTS['neutral'], "\tNeutral total:", PRONOUN_RESULTS['neutral_total'], "\tPercent correct:", PRONOUN_RESULTS['neutral_pct']
    print "Plural correct:", PRONOUN_RESULTS['they'], "\tPlural total:", PRONOUN_RESULTS['they_total'], "\tPercent correct:", PRONOUN_RESULTS['they_pct']
    print "Reflexive correct:", PRONOUN_RESULTS['reflexive'], "\tReflexive total:", PRONOUN_RESULTS['reflexive_total'], "\tPercent correct:", PRONOUN_RESULTS['reflexive_pct']
    print "Total correct:", correct, "\tTotal:", total, "\tPercent correct:", correct/float(total)