Example #1
0
def get_features(ptree: nltk.ParentedTree, dtree: List[DepRel], indices, sense, offset):
    """Build one feature dictionary per token of a parsed sentence.

    Args:
        ptree: Constituency parse; its ``pos()`` pairs supply words and tags.
        dtree: Dependency relations. The position of the first relation whose
            ``rel`` equals "root" (case-insensitive) is taken as the main-verb
            position; 0 is used when there is none.
        indices: Container of document-level token indices. A token whose
            ``offset + i`` is contained in it receives the connective sense.
        sense: Sense label; only the part before the first '.' is used.
        offset: Value added to the in-sentence token position to form 'idx'.

    Returns:
        A list of feature dicts, one per token, in sentence order.
    """
    tagged_tokens = ptree.pos()
    mv_position = next(
        (pos for pos, drel in enumerate(dtree) if drel.rel.lower() == 'root'),
        0)
    main_verb = lemmatizer.lemmatize(tagged_tokens[mv_position][0])

    features_sentence = []
    for i, (word, tag) in enumerate(tagged_tokens):
        # Drop the last two path components (leaf and its preterminal) so the
        # path ends at the constituent directly above the POS node.
        tree_pos = ptree.treeposition_spanning_leaves(i, i + 1)[:-2]
        labels = [ptree[tree_pos[:depth + 1]].label()
                  for depth in range(len(tree_pos))]
        labels = ['S' if label == 'SBAR' else label for label in labels]
        # An empty path is passed through unchanged (as an empty list).
        chain = "-".join(get_compressed_chain(labels)) if len(labels) > 0 else labels
        stem = stemmer.stem(word).lower()

        if offset + i in indices:
            conn = sense.split('.')[0]
        else:
            conn = ""

        features_sentence.append({
            'idx': offset + i,
            'BOS': i == 0,
            'word': word.lower(),
            'pos': tag,
            'lemma': lemmatizer.lemmatize(word).lower(),
            'stem': stem,
            'chain': chain,
            'conn': conn,
            'inflection': word[len(stem):],
            'is_main_verb': i == mv_position,
            'main_verb': main_verb.lower()
        })
    return features_sentence
Example #2
0
def get_features(ptree: nltk.ParentedTree, conn_idxs):
    """Extract syntactic and lexical features for a connective candidate.

    Args:
        ptree: Constituency parse of the sentence containing the connective.
        conn_idxs: Leaf indices of the connective tokens; the first and last
            entries delimit the span that is used.

    Returns:
        dict: Feature name -> value. Contains the lower-cased connective
        string, the label of the node spanning it, neighbouring word/POS
        features from ``get_pos_features``, the root-to-node label path (raw
        and compressed), sibling and parent labels, and two flags telling
        whether any non-preterminal subtree is labelled 'VP' or 'T'.
    """
    leave_list = ptree.leaves()
    # Position of the node spanning the connective, minus its last component.
    lca_loc = ptree.treeposition_spanning_leaves(conn_idxs[0], conn_idxs[-1] + 1)[:-1]
    conn_node = ptree[lca_loc]

    self_category = conn_node.label()
    # An empty position means the node is the root, which has no parent.
    parent_category = conn_node.parent().label() if lca_loc else self_category

    left_sibling = get_sibling_label(conn_node, 'left')
    right_sibling = get_sibling_label(conn_node, 'right')

    # height() > 2 excludes preterminal (POS) nodes.
    labels = {n.label() for n in ptree.subtrees(lambda t: t.height() > 2)}
    bool_vp = 'VP' in labels
    bool_trace = 'T' in labels

    c = ' '.join(leave_list[conn_idxs[0]:conn_idxs[-1] + 1]).lower()
    # The second returned element is not used by any feature below.
    prev_word, _, prev_pos, prev_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, -1)
    next_word, _, next_pos, next_pos_conn_pos = get_pos_features(ptree, conn_idxs, c, 1)
    prev_word = lemmatizer.lemmatize(prev_word)
    next_word = lemmatizer.lemmatize(next_word)

    r2l = [ptree[lca_loc[:i + 1]].label() for i in range(len(lca_loc))]
    r2lcomp = get_compressed_chain(r2l)

    # 'prevPOSTag' now carries the previous token's POS, mirroring
    # 'nextPOSTag'; it used to be filled with the second returned element.
    feat = {'connective': c, 'connectivePOS': self_category,
            'prevWord': prev_word, 'prevPOSTag': prev_pos, 'prevPOS+cPOS': prev_pos_conn_pos,
            'nextWord': next_word, 'nextPOSTag': next_pos, 'cPOS+nextPOS': next_pos_conn_pos,
            'root2LeafCompressed': ','.join(r2lcomp), 'root2Leaf': ','.join(r2l),
            'left_sibling': left_sibling, 'right_sibling': right_sibling,
            'parentCategory': parent_category, 'boolVP': bool_vp, 'boolTrace': bool_trace}

    return feat
    def match(self, tree):
        """Match a "When <VBD> <NP> <VP>" question and extract its part.

        The tree must have the shape
        ``(ROOT (SBARQ (_ (WRB when) ...) (SQ (VBD ..) (NP ..) (VP ..) ...)))``.

        Args:
            tree: Parse tree exposing ``label()`` and integer indexing.

        Returns:
            list: A single-element list holding a ``Pattern.Part`` whose
            ``object`` is a copy of the NP and whose ``property`` is a new VP
            built from the WRB, VBD and VP nodes; ``[]`` when the tree does
            not have the expected shape.
        """
        try:
            if tree.label() != 'ROOT':
                return []
            sbarq = tree[0]
            if sbarq.label() != 'SBARQ':
                return []
            wh_word = sbarq[0][0]
            if wh_word.label() != 'WRB' or wh_word[0].lower() != 'when':
                return []
            sq = sbarq[1]
            if sq.label() != 'SQ':
                return []
            verb, subject, predicate = sq[0], sq[1], sq[2]
            if (verb.label() != 'VBD' or subject.label() != 'NP'
                    or predicate.label() != 'VP'):
                return []
        except (IndexError, AttributeError):
            # IndexError: a required child is missing.
            # AttributeError: a leaf string sits where a subtree was expected.
            # Either way the tree simply does not match.
            return []

        part = Pattern.Part()
        # Round-trip through str() to obtain copies detached from the input.
        part.object = ParentedTree.fromstring(str(subject))
        part.property = ParentedTree.fromstring(str(Tree('VP', [
            Tree.fromstring(str(wh_word)),
            Tree.fromstring(str(verb)),
            Tree.fromstring(str(predicate))
        ])))

        return [part]
    def walker(self, parent):
        """Recursively record a part for every "of" preposition under ``parent``.

        When ``parent`` is an ``IN`` node whose leaves are exactly ``["of"]``,
        a ``Pattern.Part`` is appended to ``self._parts``: its ``object`` is a
        copy of the node's right sibling and its ``property`` is a copy of the
        query tree from which the subtree at the position of the node's
        parent has been removed. Every ``ParentedTree`` child is then visited
        the same way.
        """
        is_of_node = parent.label() == 'IN' and parent.leaves() == ["of"]
        if is_of_node:
            enclosing_pos = parent.parent().treeposition()

            new_part = Pattern.Part()
            new_part.object = ParentedTree.fromstring(str(parent.right_sibling()))
            new_part.property = ParentedTree.fromstring(str(self.get_query_tree()))
            # Detach the enclosing phrase from its own parent in the copy.
            doomed = new_part.property[enclosing_pos]
            new_part.property[enclosing_pos[:-1]].remove(doomed)
            self._parts.append(new_part)

        for kid in parent:
            if not isinstance(kid, ParentedTree):
                continue
            self.walker(kid)
Example #5
0
    def walker(self, parent):
        """Walk the tree below ``parent`` and collect parts around "of".

        For an ``IN`` node whose leaves equal ``["of"]`` a new
        ``Pattern.Part`` is stored in ``self._parts``. Its ``object`` is a
        fresh copy of the node's right sibling; its ``property`` is a fresh
        copy of the query tree with the subtree located at the position of
        the node's parent removed. The walk then recurses into each child
        that is a ``ParentedTree``.
        """
        if parent.label() == 'IN' and parent.leaves() == ["of"]:
            phrase_pos = parent.parent().treeposition()
            container_pos = phrase_pos[:-1]

            part = Pattern.Part()
            part.object = ParentedTree.fromstring(str(parent.right_sibling()))
            part.property = ParentedTree.fromstring(str(self.get_query_tree()))
            # Remove the phrase containing "of" from the copied query tree.
            part.property[container_pos].remove(part.property[phrase_pos])
            self._parts.append(part)

        subtrees = (node for node in parent if isinstance(node, ParentedTree))
        for node in subtrees:
            self.walker(node)
Example #6
0
    def prune_tree(cls, tree, begin_index, end_index):
        """
        Build a copy of ``tree`` restricted to the leaves from
        ``begin_index`` to ``end_index``.

        The new tree starts with the root label, reproduces the chain of
        nodes leading down to the leaf at ``begin_index``, and is then
        extended node by node until the parent of the leaf at ``end_index``
        is reached, so leaves outside that range are not copied.

        Args:
            tree: Source tree; must provide ``leaf_treeposition``, ``node``,
                ``parent()`` and ``parent_index()``.
            begin_index: Leaf index of the first leaf to keep.
            end_index: Leaf index of the last leaf to keep.

        Returns:
            A new ``ParentedTree`` containing the pruned structure.

        NOTE(review): ``xrange``, ``unicode``, ``tree.node`` and the
        string-parsing ``ParentedTree('(X)')`` constructor suggest this
        targets Python 2 with an old NLTK release -- confirm before porting.
        """

        # Tree positions (index paths from the root) of the boundary leaves.
        begin_path = tree.leaf_treeposition(begin_index)
        end_path = tree.leaf_treeposition(end_index)

        # Parents of the boundary leaves; current_node is not used below.
        current_node = tree[begin_path[:-1]]
        end_node = tree[end_path[:-1]]

        new_tree = ParentedTree('(' + tree.node + ')')
        ## Initialize new tree
        l = []  # unused
        current_new = new_tree
        current_old = tree
        # Copy the labels along the path to the first kept leaf, creating a
        # chain of single-child nodes in the new tree.
        for i in xrange(len(begin_path) - 1):
            if type(current_old[begin_path[i]]) != str:
                current_new.insert(
                    0,
                    ParentedTree('(' + current_old[begin_path[i]].node + ')'))
                current_new = current_new[0]
                current_old = current_old[begin_path[i]]

        # Advance through the old tree until the parent of the last kept leaf
        # is reached, mirroring every step in the new tree.
        while current_old != end_node:
            if not (type(current_old[0]) == str
                    or type(current_old[0]) == unicode):
                # First child is a subtree: descend into it.
                current_old = current_old[0]
                current_new.insert(0,
                                   ParentedTree('(' + current_old.node + ')'))
                current_new = current_new[0]
            else:
                # First child is a leaf: copy it, then climb while the
                # current node is the last child of its parent.
                current_new.insert(0, current_old[0])
                while len(current_old.parent()
                          ) == current_old.parent_index() + 1:
                    current_old = current_old.parent()
                    current_new = current_new.parent()

                # Step to the next sibling and create its counterpart.
                current_old = current_old.parent()[current_old.parent_index() +
                                                   1]
                current_new.parent().insert(
                    current_new.parent_index() + 1,
                    ParentedTree('(' + current_old.node + ')'))

                current_new = current_new.parent()[current_new.parent_index() +
                                                   1]
        # Copy the first child of the end node (the last kept leaf).
        current_new.insert(0, current_old[0])
        #         print current_new
        return new_tree
Example #7
0
    def __init__(self,
                 id_sentence,
                 basic_dependencies=None,
                 collapsed_dependencies=None,
                 parsetree='',
                 text=''):
        """Create a sentence record from its parse information.

        Args:
            id_sentence: Integer identifier of the sentence.
            basic_dependencies: ``None`` or a list, wrapped in a
                ``DependencyGraph``.
            collapsed_dependencies: ``None`` or a list, wrapped in a
                ``DependencyGraph``.
            parsetree: Raw parse; kept as ``_parsetree`` and also passed to
                ``ParentedTree``.
            text: If truthy it must be a list; it is only validated here.

        Raises:
            AssertionError: If an argument has an unexpected type.
        """
        from nltk import ParentedTree

        # Argument validation (exact type checks, as before).
        assert type(id_sentence) is int, 'Wrong id type'
        assert (basic_dependencies is None
                or type(basic_dependencies) is list), 'Basic dependencies type'
        assert (collapsed_dependencies is None
                or type(collapsed_dependencies) is list), 'Collapsed dependencies type'
        assert not text or type(text) is list, 'Wrong text type'

        # Parse-derived attributes.
        self.id_sentence = id_sentence
        self.basic_dependencies = DependencyGraph(basic_dependencies)
        self.collapsed_dependencies = DependencyGraph(collapsed_dependencies)
        self._parsetree = parsetree
        self.parsetree = ParentedTree(parsetree)

        # Containers and links that start out empty.
        self.words = []
        self.coreference_mentions = []
        self.coreference_representatives = []
        self.next = None
        self.previous = None
        self._connected_sentences = None
Example #8
0
def _join_clause_tokens(words):
    """Join tokens into a sentence string with a capitalised first token."""
    pieces = []
    for i, word in enumerate(words):
        if i == 0:
            # Slicing (instead of indexing) tolerates an empty first token.
            pieces.append(word[:1].title() + word[1:])
        elif word in [".", "!", "?", ",", ";"]:
            pieces.append(word)
        else:
            pieces.append(" " + word)
    text = "".join(pieces)
    # Re-attach clitics that the tokenizer split off.
    text = text.replace(" ’", "’")
    text = text.replace(" n’", "n’")
    return text


def extract_independent_clauses(input_sent, predictor):
    """Split a sentence into clause strings using its constituency parse.

    Args:
        input_sent: Sentence text passed to ``predictor.predict``.
        predictor: Object whose ``predict(sentence=...)`` returns a mapping
            with a bracketed parse under the key "trees".

    Returns:
        list[str]: One string per selected clause node (nodes accepted by
        ``filt_r`` or ``filt_l`` that are not the direct parent of another
        accepted node), or the whole sentence when no node is selected.
    """
    output = predictor.predict(sentence=input_sent)
    tree = ParentedTree.fromstring(output["trees"])
    candidates = list(tree.subtrees(filter=lambda x: filt_r(x) or filt_l(x)))

    # Drop candidates that directly contain another candidate. The parents
    # are collected first so the list is not modified while being iterated
    # (which skipped elements), and identity is used because structurally
    # equal subtrees at different positions compare equal.
    parent_ids = {id(node.parent()) for node in candidates}
    selected = [node for node in candidates if id(node) not in parent_ids]

    sub_sentences = [node.leaves() for node in selected] or [tree.leaves()]
    return [_join_clause_tokens(words) for words in sub_sentences]
Example #9
0
def simplify_tree(ptree: nltk.ParentedTree, collapse_root=False):
    """Collapse unary chains of non-preterminal nodes into joined labels.

    The label of ``ptree`` is overwritten with 'S' (the argument itself is
    modified), the tree is converted to a plain ``nltk.Tree``, and every node
    whose only child is a subtree that itself starts with a subtree is merged
    with that child. Differing labels are joined with '+'.

    Args:
        ptree: Tree to simplify.
        collapse_root: When false and the root has exactly one child, the
            root itself is left alone and processing starts at that child.

    Returns:
        nltk.ParentedTree: The simplified tree.
    """
    ptree._label = 'S'
    tree = nltk.Tree.convert(ptree)

    keep_root = not collapse_root and isinstance(tree, nltk.Tree) and len(tree) == 1
    pending = [tree[0]] if keep_root else [tree]

    # Stack-based depth-first traversal.
    while pending:
        current = pending.pop()
        if not isinstance(current, nltk.Tree):
            continue

        collapsible = (len(current) == 1
                       and isinstance(current[0], nltk.Tree)
                       and isinstance(current[0, 0], nltk.Tree))
        if not collapsible:
            pending.extend(current)
            continue

        own_label = current.label()
        child_label = current[0].label()
        current.set_label(
            own_label if own_label == child_label else own_label + '+' + child_label)
        current[0:] = list(current[0])
        # The node now owns its former grandchildren, so look at it again.
        pending.append(current)

    return nltk.ParentedTree.convert(tree)
Example #10
0
def process_sentence(word_array, results):
    """Parse a tokenised sentence and append one result row per leaf.

    Args:
        word_array: Tokens of the sentence; joined with spaces (newlines
            removed) before being handed to ``parser.raw_parse``.
        results: DataFrame to which the new rows are appended.

    Returns:
        tuple: ``(str(tree), results)`` where ``results`` is a new DataFrame
        containing the previous rows followed by one row per leaf.

    Note:
        Reads the module-level names ``parser``, ``compute_total_trace``,
        ``story`` and ``sentence``.
    """
    txt = ' '.join(word_array).replace('\n', '')
    tree = ParentedTree.convert(next(parser.raw_parse(txt)))
    leaf_values = tree.leaves()

    if len(leaf_values) != len(word_array):
        print('This may not happen')

    rows = []
    # enumerate() yields the true leaf position; list.index() returned the
    # first occurrence and so mislocated repeated tokens.
    for leaf_index, token in enumerate(leaf_values):
        tree_location = tree.leaf_treeposition(leaf_index)
        parent = tree[tree_location[:-1]]
        trace, POS_stanf = compute_total_trace(parent)
        rows.append({
            'trace': trace,
            'POS_stanf': POS_stanf,
            'cd_idx': story,
            'sentence_count': sentence,
            'token_count': leaf_index + 1,
            'token': token
        })

    if rows:
        # One concat instead of per-row DataFrame.append (removed in pandas 2).
        results = pd.concat([results, pd.DataFrame(rows)], ignore_index=True)
    return str(tree), results
Example #11
0
def getSentParses(sentence):
    """Parse a sentence and return the relations extracted from its clauses.

    Returns an empty list when ``sentence`` is not a ``str`` or has at most
    one whitespace-separated token. Otherwise the sentence is parsed with the
    module-level ``parser``, split into sub-sentences, and for each
    sub-sentence the results of ``getPrepParse`` and ``getSVBroadParse`` are
    collected (in that order). A simple coreference pass then replaces the
    second element of a result when it is a pronoun NP by the most recent
    non-pronoun NP seen in that slot.

    NOTE(review): the ``print`` statement below is Python 2 syntax.
    """

    if type(sentence) != str or len(sentence.split()) <= 1: return []

    #Convert sentence into Stanford-parsed tree
    sentence = ParentedTree.convert(list(parser.raw_parse(sentence))[0])

    #Split sentences if they contain multiple full sentences separated by ';', etc.
    sentences = []
    # An S whose first child is also an S: treat each child as a sentence.
    if (sentence[0].label() == 'S') and (sentence[0,0].label() == 'S'):
        for i in range(len(sentence[0])):
            sentences += [sentence[0,i]]
    else:
        for i in range(len(sentence)):
            sentences += [sentence[i]]

    #Obtain desired tuple relations
    parsedSents = []
    for sentence in sentences:
        print "Current subsentence", sentence.leaves()
        parsedSents += [getPrepParse(sentence)]
        parsedSents += [getSVBroadParse(sentence)]

    #Basic stupid coreferencing
    defaultSet = False
    for parsedSent in parsedSents:
        if len(parsedSent) == 0: continue
        # Remember the latest NP in slot 1 that does not start with a pronoun.
        if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() != 'PRP':
            default = parsedSent[1]
            defaultSet = True
        # Replace a pronoun NP in slot 1 by the remembered NP, if any.
        if parsedSent[1].label() == 'NP' and parsedSent[1][0].label() == 'PRP' and defaultSet:
            parsedSent[1] = default

    return parsedSents
Example #12
0
    def match(self, *args, **kwargs):
        """Match a "<WP keyword> ..." question and collect its object part.

        After delegating to ``Pattern.match``, the query tree must look like
        ``(ROOT (SBARQ (WHNP (WP <keyword>) ...) (SQ _ _ ...)))`` where the
        lower-cased WP word equals ``self._keyword`` and SQ has at least two
        children.

        Returns:
            ``self._parts`` with a new part appended (its ``object`` is a
            copy of SQ's second child), or ``[]`` when the tree does not
            match or a required child is missing.
        """
        Pattern.match(self, *args, **kwargs)
        try:
            # The checks run in order and stop at the first mismatch.
            mismatch = (
                self.get_query_tree().label() != "ROOT"
                or self.get_query_tree()[0].label() != "SBARQ"
                or self.get_query_tree()[0][0].label() != "WHNP"
                or self.get_query_tree()[0][0][0].label() != "WP"
                or self.get_query_tree()[0][0][0][0].lower() != self._keyword
                or self.get_query_tree()[0][1].label() != "SQ"
                or len(self.get_query_tree()[0][1]) < 2
            )
            if mismatch:
                return []

            new_part = Pattern.Part()
            subject_text = str(self.get_query_tree()[0][1][1])
            new_part.object = ParentedTree.fromstring(subject_text)
            self._parts.append(new_part)

            return self._parts

        except IndexError:
            return []
Example #13
0
    def run(self, args):
        """Re-serialise parse trees with leaves rewritten by a ``LeafReader``.

        Args:
            args: Mapping with ``args["input"]["text"][0]``,
                ``args["input"]["parse"][0]``, ``args["output"]["parse"][0]``
                and ``args["doc_list"]``; these are handed to
                ``self.get_io_files`` to obtain (text, parse, output) file
                triples.

        For every triple the JSON list of bracketed parses is read, each
        parse is rebuilt with ``LeafReader.read_leaf`` as leaf reader, and
        the single-line string forms are written back as a JSON list.
        """
        input_text = args["input"]["text"][0]
        input_parse = args["input"]["parse"][0]
        output_parse = args["output"]["parse"][0]
        doc_list = args["doc_list"]
        tuples = self.get_io_files([input_text, input_parse, output_parse], doc_list)

        for in_text_file, in_parse_file, out_parse_file in tuples:
            text = self.read_file(in_text_file)
            parses = json.loads(self.read_file(in_parse_file))

            leafreader = LeafReader(text)

            indexed_parses = []
            for parse in parses:
                tree = ParentedTree.fromstring(parse, read_leaf=leafreader.read_leaf)
                # pformat() returns the string; in NLTK 3 pprint() prints it
                # and returns None, which serialised every parse as null.
                indexed_parses.append(tree.pformat(margin=float("inf")))

            # in-json parses
            output = json.dumps(indexed_parses)

            self.write_file(output, out_parse_file)
Example #14
0
    def match(self, *args, **kwargs):
        """Check the query tree for a WP-keyword question and extract a part.

        ``Pattern.match`` is invoked first. The query tree is accepted when
        it has the shape ``(ROOT (SBARQ (WHNP (WP <word>) ...) (SQ ...)))``,
        the lower-cased ``<word>`` equals ``self._keyword``, and SQ has two
        or more children.

        Returns:
            ``self._parts`` after appending a part whose ``object`` is a
            copy of the second child of SQ; ``[]`` if the tree is rejected
            or indexing runs out of children.
        """
        Pattern.match(self, *args, **kwargs)
        try:
            # Evaluated left to right; stops at the first failed condition.
            accepted = (
                self.get_query_tree().label() == "ROOT"
                and self.get_query_tree()[0].label() == "SBARQ"
                and self.get_query_tree()[0][0].label() == "WHNP"
                and self.get_query_tree()[0][0][0].label() == "WP"
                and self.get_query_tree()[0][0][0][0].lower() == self._keyword
                and self.get_query_tree()[0][1].label() == "SQ"
                and len(self.get_query_tree()[0][1]) >= 2
            )
            if not accepted:
                return []

            part = Pattern.Part()
            second_sq_child = self.get_query_tree()[0][1][1]
            part.object = ParentedTree.fromstring(str(second_sq_child))
            self._parts.append(part)
        except IndexError:
            return []
        else:
            return self._parts
Example #15
0
 def ptph(self, rel):
     """Return the "ptp=<path>" feature for a relation.

     The relation's parse tree is converted to a ``ParentedTree`` and
     ``self.find_path`` is called with the words of the first and second
     argument (obtained through ``self.get_words``).
     """
     parented = ParentedTree.convert(rel.parse_tree)
     first_words = self.get_words(rel.get_arg1_tokens())
     second_words = self.get_words(rel.get_arg2_tokens())
     path = self.find_path(parented, first_words, second_words)
     return "ptp={0}".format(path)
Example #16
0
 def view(self, s):
     """Print a bracketed tree string and open it in the NLTK tree viewer.

     Occurrences of ``set([1, 2])`` in ``s`` are first rewritten to
     ``{1, 2}``. Node labels matching ``\\w*?\\[.*?\\]`` are parsed with
     ``FeatStruct``.

     NOTE(review): the ``print`` statement and ``ParentedTree.parse`` look
     like Python 2 / old-NLTK API -- confirm the target environment.
     """
     # Rewrite "set([...])" into "{...}".
     s = re.sub(r"set\(\[([\d, ]*)\]\)", r"{\g<1>}", s)
     print s
     #tree = ParentedTree.parse(s, node_pattern=r"\w*?\[.*?\]", parse_node=buildfeatstruct)
     tree = ParentedTree.parse(s,
                               node_pattern=r"\w*?\[.*?\]",
                               parse_node=FeatStruct)
     tree.draw()
def replace_terminals_with_indices(treestring):
    """Replace each terminal of a bracketed tree by its index in the sentence.

    Args:
        treestring: Tree in bracket notation.

    Returns:
        ParentedTree: The parsed tree in which the i-th leaf (left to right)
        has been replaced by the string ``str(i)``.
    """
    tree = ParentedTree.fromstring(treestring)
    for idx in range(len(tree.leaves())):
        # Assign at the leaf's own position. Writing to child 0 of the
        # parent hit the wrong slot when the leaf was not the first child.
        tree[tree.leaf_treeposition(idx)] = str(idx)
    return tree
Example #18
0
def sentence_to_tree(sentence):
    """
    Turn sentence text into a flat tree of POS-tagged tokens.

    The text is tokenised with ``word_tokenize`` and tagged with ``pos_tag``;
    every (word, tag) pair becomes a child ``(tag word)`` of a root labelled
    'S'.

    Args:
        sentence: text of a sentence
    Return:
        sentence tree
    """
    assert isinstance(sentence, basestring)

    tagged_tokens = pos_tag(word_tokenize(sentence))
    return ParentedTree(
        'S', [ParentedTree(pos, [word]) for word, pos in tagged_tokens])
Example #19
0
    def gerar_no(self, s):
        """Build an NLTK ParentedTree node from the received pair.

        Args:
            s: Indexable pair of strings; ``s[1]`` becomes the node label and
                ``s[0]`` its single leaf.

        Returns:
            ParentedTree: The tree parsed from ``"(<s[1]> <s[0]>)"``.
        """
        t_string = '(' + s[1] + ' ' + s[0] + ')'
        # The list of subtrees that used to be collected here was never
        # read, so it has been dropped.
        return ParentedTree.convert(Tree.fromstring(t_string))
def compute_gender(attributes):
    """ Compute the gender of a mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention, must contain
            values for "type", "head", "head_index" and, if the mention is a
            pronoun, "citation_form". For names, "tokens" and "ner" are also
            read. Note that ``attributes["head"]`` may be overwritten (see
            the review note below).

    Returns:
        str: the gender of the mention -- one of UNKNOWN, MALE, FEMALE,
            NEUTRAL and PLURAL.
    """
    gender = "NEUTRAL"
    head_index = attributes["head_index"]
    gender_data = external_data.GenderData.get_instance()

    # If the head consists of tree nodes rather than strings ...
    if attributes["head"] != [] and type(attributes["head"][0]) == type(
            ParentedTree('DT', ['a'])):
        # NOTE(review): the head is reset to [] *before* it is iterated, so
        # the loop below never runs and the head always ends up empty here.
        # Presumably the old head was meant to be converted to leaves --
        # confirm the intended behaviour before changing it.
        attributes["head"] = []
        for i in itertools.chain.from_iterable(attributes["head"]):
            attributes["head"].append(i.leaves())
    if compute_number(attributes) == "PLURAL":
        gender = "PLURAL"
    elif attributes["type"] == "PRO":
        # Pronouns: decided by the citation form.
        if attributes["citation_form"] == "he":
            gender = "MALE"
        elif attributes["citation_form"] == "she":
            gender = "FEMALE"
        elif attributes["citation_form"] == "it":
            gender = "NEUTRAL"
        elif attributes["citation_form"] in ["you", "we", "they"]:
            gender = "PLURAL"
    elif attributes["type"] == "NAM":
        # Names: honorific first, then NER tag of the head, then lookup.
        if re.match(r"^mr(\.)?$", attributes["tokens"][0].lower()):
            gender = "MALE"
        elif re.match(r"^(miss|ms|mrs)(\.)?$",
                      attributes["tokens"][0].lower()):
            gender = "FEMALE"
        elif not re.match(r"(PERSON|NONE)", attributes["ner"][head_index]):
            gender = "NEUTRAL"
        elif gender_data.look_up(attributes):
            gender = gender_data.look_up(attributes)
    elif attributes["type"] == "NOM":
        # print(attributes["head"][0])
        # print(type(attributes["head"][0]))
        # print(attributes["head"] == [] or type(attributes["head"][0]) != type(u'qwe'))
        # Nominals: only looked up when the head is a non-empty list whose
        # first element is a text string.
        if attributes["head"] == [] or type(
                attributes["head"][0]) != type(u'qwe'):
            pass
        elif __wordnet_lookup_gender(" ".join(attributes["head"])):
            gender = __wordnet_lookup_gender(" ".join(attributes["head"]))
        elif gender_data.look_up(attributes):
            gender = gender_data.look_up(attributes)

    # A still-neutral mention whose semantic class is PERSON is reported as
    # UNKNOWN instead.
    if gender == "NEUTRAL" and compute_semantic_class(attributes) == "PERSON":
        gender = "UNKNOWN"

    return gender
Example #21
0
def generate(sent, synt, tmpls, synpg_model, pg_model, args):
    """Generate one paraphrase of ``sent`` per template.

    Args:
        sent: Source sentence text (BPE-segmented here).
        synt: Bracketed parse of the source sentence.
        tmpls: Batch of templates passed to ``pg_model.generate``; its length
            determines the number of paraphrases.
        synpg_model: Model producing output token ids from sentence and
            parse ids.
        pg_model: Model producing full parses from tag ids and templates.
        args: Namespace providing ``max_sent_len``, ``max_synt_len`` and
            ``temp``.

    Returns:
        list[str]: The decoded paraphrases, one per template.

    Note:
        Uses the module-level ``dictionary``, ``bpe``, ``getleaf``,
        ``synt2str`` and ``reverse_bpe`` and moves tensors to CUDA.
    """
    with torch.no_grad():
        num_tmpls = len(tmpls)

        # convert syntax to tag sequence
        # np.int64 instead of np.long: the alias was removed in NumPy 1.24.
        tagss = np.zeros((num_tmpls, args.max_sent_len), dtype=np.int64)
        tags_ = getleaf(ParentedTree.fromstring(synt))
        tags_ = [
            dictionary.word2idx[f"<{w}>"] for w in tags_
            if f"<{w}>" in dictionary.word2idx
        ]
        tagss[:, :len(tags_)] = tags_[:args.max_sent_len]

        tagss = torch.from_numpy(tagss).cuda()

        # generate parses from tag sequence and templates
        parse_idxs = pg_model.generate(tagss,
                                       tmpls,
                                       args.max_synt_len,
                                       temp=args.temp)

        # add <sos> and remove tokens after <eos>
        synts = np.zeros((num_tmpls, args.max_synt_len + 2), dtype=np.int64)
        synts[:, 0] = 1

        eos_idx = dictionary.word2idx["<eos>"]
        for i in range(num_tmpls):
            parse_idx = parse_idxs[i].cpu().numpy()
            eos_pos = np.where(parse_idx == eos_idx)[0]
            # Without <eos> keep the whole sequence. This used to reference
            # an undefined name `idx` and raised NameError.
            eos_pos = eos_pos[0] + 1 if len(eos_pos) > 0 else len(parse_idx)
            synts[i, 1:eos_pos + 1] = parse_idx[:eos_pos]

        synts = torch.from_numpy(synts).cuda()

        # bpe segment and convert sentence to tensor
        sents = np.zeros((num_tmpls, args.max_sent_len), dtype=np.int64)
        unk_idx = dictionary.word2idx["<unk>"]
        sent_ = [
            dictionary.word2idx.get(w, unk_idx)
            for w in bpe.segment(sent).split()
        ]
        sents[:, :len(sent_)] = sent_[:args.max_sent_len]
        sents = torch.from_numpy(sents).cuda()

        # generate paraphrases from sentence and generated parses
        output_idxs = synpg_model.generate(sents,
                                           synts,
                                           args.max_sent_len,
                                           temp=args.temp)
        output_idxs = output_idxs.cpu().numpy()

        paraphrases = [
            reverse_bpe(synt2str(output_idxs[i], dictionary).split())
            for i in range(num_tmpls)
        ]

        return paraphrases
Example #22
0
def generate(model, data, loader, dictionary, bpe, args):
    """Generate outputs for every batch and write them to text files.

    Three files are written into ``args.output_dir``: ``target_sents.txt``
    (entries of ``data[1]``), ``target_synts.txt`` (entries of ``data[3]``)
    and ``outputs.txt`` (decoded model output), one line per example.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        data: Indexable collection; items 0, 1 and 3 are indexed with the
            batch index array.
        loader: Iterable of index tensors.
        dictionary: Vocabulary exposing ``word2idx``.
        bpe: BPE segmenter exposing ``segment``.
        args: Namespace providing ``output_dir``, ``max_sent_len``,
            ``max_synt_len``, ``sample`` and ``temp``.
    """
    model.eval()
    unk_idx = dictionary.word2idx["<unk>"]
    sos_idx = dictionary.word2idx["<sos>"]
    eos_idx = dictionary.word2idx["<eos>"]
    with open(os.path.join(args.output_dir, "target_sents.txt"), "w") as fp1, \
         open(os.path.join(args.output_dir, "target_synts.txt"), "w") as fp2, \
         open(os.path.join(args.output_dir, "outputs.txt"), "w") as fp3:
        with torch.no_grad():
            for data_idxs in tqdm(loader, total=len(loader)):
                data_idxs = data_idxs.numpy()

                sents_ = data[0][data_idxs]  # sents1
                targs_ = data[1][data_idxs]  # sents2
                synts_ = data[3][data_idxs]  # synts2

                batch_size = len(sents_)
                # np.int64 instead of np.long (alias removed in NumPy 1.24).
                sents = np.zeros((batch_size, args.max_sent_len),
                                 dtype=np.int64)
                synts = np.zeros((batch_size, args.max_synt_len + 2),
                                 dtype=np.int64)

                for i in range(batch_size):
                    sent_ = [
                        dictionary.word2idx.get(w, unk_idx)
                        for w in bpe.segment(sents_[i]).split()
                    ]
                    sents[i, :len(sent_)] = sent_

                    synt_ = deleaf(ParentedTree.fromstring(synts_[i]))
                    synt_ = [
                        dictionary.word2idx[f"<{w}>"] for w in synt_
                        if f"<{w}>" in dictionary.word2idx
                    ]
                    synt_ = [sos_idx] + synt_ + [eos_idx]
                    synts[i, :len(synt_)] = synt_

                sents = torch.from_numpy(sents).cuda()
                synts = torch.from_numpy(synts).cuda()

                idxs = model.generate(sents,
                                      synts,
                                      sents.size(1),
                                      sample=args.sample,
                                      temp=args.temp)

                for idx, targ, synt_ in zip(idxs.cpu().numpy(), targs_,
                                            synts_):
                    fp1.write(targ + '\n')
                    fp2.write(synt_ + '\n')
                    fp3.write(
                        reverse_bpe(synt2str(idx, dictionary).split()) + '\n')
Example #23
0
def evaluate(model, data, loader, criterion, dictionary, bpe, args):
    """Compute the mean per-batch loss of ``model`` over ``loader``.

    Args:
        model: Model called as ``model(sents, synts, targs)``.
        data: Indexable collection; ``data[0]`` holds raw sentences and
            ``data[1]`` raw bracketed parses.
        loader: Iterable of index tensors; its length is the divisor.
        criterion: Loss called on flattened outputs and targets.
        dictionary: Vocabulary exposing ``word2idx``.
        bpe: BPE segmenter exposing ``segment``.
        args: Namespace providing ``max_sent_len`` and ``max_synt_len``.

    Returns:
        float: Sum of the batch losses divided by ``len(loader)``.
    """
    model.eval()
    total_loss = 0.0
    max_it = len(loader)
    unk_idx = dictionary.word2idx["<unk>"]
    sos_idx = dictionary.word2idx["<sos>"]
    eos_idx = dictionary.word2idx["<eos>"]
    with torch.no_grad():
        for data_idxs in loader:
            data_idxs = np.sort(data_idxs.numpy())

            # get batch of raw sentences and raw syntax
            sents_ = data[0][data_idxs]
            synts_ = data[1][data_idxs]

            batch_size = len(sents_)

            # initialize tensors
            # np.int64 instead of np.long (alias removed in NumPy 1.24).
            sents = np.zeros((batch_size, args.max_sent_len), dtype=np.int64)    # words without position
            synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.int64)  # syntax
            targs = np.zeros((batch_size, args.max_sent_len+2), dtype=np.int64)  # target output

            for i in range(batch_size):

                # bpe segment and convert to tensor
                sent_ = [dictionary.word2idx.get(w, unk_idx)
                         for w in bpe.segment(sents_[i]).split()]
                sents[i, :len(sent_)] = sent_

                # add <sos> and <eos> for target output
                targ_ = [sos_idx] + sent_ + [eos_idx]
                targs[i, :len(targ_)] = targ_

                # parse syntax and convert to tensor
                synt_ = deleaf(ParentedTree.fromstring(synts_[i]))
                synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_
                         if f"<{w}>" in dictionary.word2idx]
                synt_ = [sos_idx] + synt_ + [eos_idx]
                synts[i, :len(synt_)] = synt_

            sents = torch.from_numpy(sents).cuda()
            synts = torch.from_numpy(synts).cuda()
            targs = torch.from_numpy(targs).cuda()

            # forward
            outputs = model(sents, synts, targs)

            # calculate loss (targets are shifted to drop the leading <sos>)
            targs_ = targs[:, 1:].contiguous().view(-1)
            outputs_ = outputs.contiguous().view(-1, outputs.size(-1))
            loss = criterion(outputs_, targs_)

            total_loss += loss.item()

    return total_loss / max_it
Example #24
0
def generate(epoch, eit, model, data, loader, dictionary, bpe, args, max_it=10):
    """Write sample generations for the first ``max_it`` batches to a file.

    The file ``sents_valid_epoch<EE>_it<IIIIII>.txt`` in ``args.output_dir``
    receives, per example, the syntax (without its first token), the input
    sentence, the generated output and a ``--`` separator line.

    Args:
        epoch: Epoch number used in the file name.
        eit: Iteration number used in the file name.
        model: Model exposing ``eval()`` and ``generate(...)``.
        data: Indexable collection; ``data[0]`` holds raw sentences and
            ``data[1]`` raw bracketed parses.
        loader: Iterable of index tensors.
        dictionary: Vocabulary exposing ``word2idx``.
        bpe: BPE segmenter exposing ``segment``.
        args: Namespace providing ``output_dir``, ``max_sent_len``,
            ``max_synt_len`` and ``temp``.
        max_it: Number of batches to process.
    """
    model.eval()
    unk_idx = dictionary.word2idx["<unk>"]
    sos_idx = dictionary.word2idx["<sos>"]
    eos_idx = dictionary.word2idx["<eos>"]
    with open(os.path.join(args.output_dir, "sents_valid_epoch{:02d}_it{:06d}.txt".format(epoch, eit)), "w") as fp:
        with torch.no_grad():
            for it, data_idxs in enumerate(loader):
                if it >= max_it:
                    break

                data_idxs = np.sort(data_idxs.numpy())

                # get batch of raw sentences and raw syntax
                sents_ = data[0][data_idxs]
                synts_ = data[1][data_idxs]

                batch_size = len(sents_)

                # initialize tensors
                # np.int64 instead of np.long (alias removed in NumPy 1.24).
                # The target tensor that used to be built here was never
                # passed to the model and has been dropped.
                sents = np.zeros((batch_size, args.max_sent_len), dtype=np.int64)    # words without position
                synts = np.zeros((batch_size, args.max_synt_len+2), dtype=np.int64)  # syntax

                for i in range(batch_size):

                    # bpe segment and convert to tensor
                    sent_ = [dictionary.word2idx.get(w, unk_idx)
                             for w in bpe.segment(sents_[i]).split()]
                    sents[i, :len(sent_)] = sent_

                    # parse syntax and convert to tensor
                    synt_ = deleaf(ParentedTree.fromstring(synts_[i]))
                    synt_ = [dictionary.word2idx[f"<{w}>"] for w in synt_
                             if f"<{w}>" in dictionary.word2idx]
                    synt_ = [sos_idx] + synt_ + [eos_idx]
                    synts[i, :len(synt_)] = synt_

                sents = torch.from_numpy(sents).cuda()
                synts = torch.from_numpy(synts).cuda()

                # generate
                idxs = model.generate(sents, synts, sents.size(1), temp=args.temp)

                # write output
                for sent, idx, synt in zip(sents.cpu().numpy(), idxs.cpu().numpy(), synts.cpu().numpy()):
                    fp.write(synt2str(synt[1:], dictionary)+'\n')
                    fp.write(sent2str(sent, dictionary)+'\n')
                    fp.write(synt2str(idx, dictionary)+'\n')
                    fp.write("--\n")
Example #25
0
    def traverse(graph, node):
        """Recursively convert the graph below ``node`` into a ParentedTree.

        Args:
            graph: Mapping from node id to a record with the keys
                "children", "ellipsed_parents", "terminal", "tag" and
                "text".
            node: Id of the node to convert.

        Returns:
            ParentedTree: Labelled with the node's tag. Children that list
            ``node`` among their ellipsed parents are left out; terminal
            children become ``(tag text)`` nodes, all others are converted
            recursively.
        """
        child_ids = [int(c) for c in graph[node]["children"]]
        subtrees = []
        for child_id in child_ids:
            elided_for = [int(p) for p in graph[child_id]["ellipsed_parents"]]
            if node in elided_for:
                # The child is only implicit under this parent: skip it.
                continue
            if graph[child_id]["terminal"] == "yes":
                leaf = ParentedTree(graph[child_id]["tag"],
                                    [graph[child_id]["text"]])
                subtrees.append(leaf)
            else:
                subtrees.append(traverse(graph, child_id))

        return ParentedTree(graph[node]["tag"], subtrees)
Example #26
0
def process_tree(tree_str, label):
    """Parse a tree and extract its relation, or return None on ValueError.

    Args:
        tree_str: Object whose string form is a bracketed tree.
        label: Label forwarded to ``get_relation``.

    Returns:
        The value of ``get_relation(tree, label)``, or ``None`` when parsing
        or relation extraction raises ``ValueError``.
    """
    try:
        parsed = ParentedTree.fromstring(str(tree_str))
        return get_relation(parsed, label)
    except ValueError:
        # Failures of this kind are deliberately ignored.
        return None
Example #27
0
def get_features(ptree: nltk.ParentedTree, conn_idxs: List[int]):
    """Return one feature dict per token of ``ptree``.

    Each dict holds the token's sentence-initial flag ('BOS'), surface form,
    POS tag, lemma, stem and whether its position is listed in
    ``conn_idxs`` ('conn').
    """
    return [
        {
            'BOS': position == 0,
            'word': token,
            'pos': pos_tag,
            'lemma': lemmatizer.lemmatize(token),
            'stem': stemmer.stem(token),
            'conn': position in conn_idxs
        }
        for position, (token, pos_tag) in enumerate(ptree.pos())
    ]
Example #28
0
 def __init__(self, filename, postagged='./data/postagged-files',
              parsed='./data/parsed-files',
              dependency='./data/dep-files'):
     """Load the tagged, parsed and dependency views of one document.

     Args:
         filename: Base name of the document (without extension).
         postagged: Directory containing ``<filename>.tag``.
         parsed: Directory containing ``<filename>.parse``.
         dependency: Directory containing ``<filename>.parse.dep``.

     Raises:
         AssertionError: If the numbers of tagged and parsed sentences
             differ.
     """
     self.filename = filename
     postagged_file = os.path.join(postagged, filename + '.tag')
     parsed_file = os.path.join(parsed, filename + '.parse')
     dep_file = os.path.join(dependency, filename + '.parse.dep')

     # Context managers close every handle; previously the files stayed
     # open until garbage collection.
     with open(postagged_file) as handle:
         self.tagged_sents = [x.strip() for x in handle if x.strip()]
     with open(parsed_file) as handle:
         self.parsed_sents = [ParentedTree.fromstring(x)
                              for x in handle if x.strip()]
     with open(dep_file) as handle:
         # Dependency trees are separated by blank lines.
         dep_blocks = handle.read().strip().split('\n\n')
     self.dep_sents = [DepTree.fromstring(x) for x in dep_blocks if x.strip()]
     assert len(self.tagged_sents) == len(self.parsed_sents)
Example #29
0
def parse2edus(parse):
    """
    Split a constituency parse tree into EDUs.
    :param parse: constituency parse tree as bracket-format text, e.g. '( (IP (NP (PN 我)) (VP (VV 爱) (NP (NR 北京) (NR 天安门)))))'
    :return: generator of structure.tree.EDU
    """
    segmenter = get_pipeline().segmenter
    tree = ParentedTree.fromstring(parse)
    # Height-2 subtrees sit directly above a leaf; '-NONE-' nodes are left out.
    preterminals = list(
        tree.subtrees(lambda t: t.height() == 2 and t.label() != '-NONE-'))
    # The sentence text is the concatenation of the remaining leaves.
    text = ''.join(node[0] for node in preterminals)
    return segmenter.cut_edu(Sentence((0, len(text)), text, parse=tree))
Example #30
0
def parse_parented_tree(tree_string):
    """Build a ``ParentedTree`` from a bracketed constituent parse.

    The plain constructor is tried first; if it rejects the string with a
    ``TypeError``, ``ParentedTree.fromstring`` is used instead.

    Args:
        tree_string (str): A constituent parse tree in bracket notation

    Returns:
        nltk.ParentedTree: A parse tree corresponding to the parse tree string.
    """
    try:
        tree = ParentedTree(tree_string)
    except TypeError:
        tree = ParentedTree.fromstring(tree_string)
    return tree
Example #31
0
def read_file(file_):
    """Parse every line of a UTF-8 text file into a ``ParentedTree``.

    The file is read completely and closed before any line is parsed.

    :param file_: path of the file to read
    :return: list of trees, one per line, in file order
    """
    with open(file_, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    return [ParentedTree.fromstring(raw) for raw in raw_lines]
Example #32
0
def norm_negation(node):
    """Rewrite negation compounds into ``'unop'`` nodes, in place.

    Walks ``node`` recursively. A child qualifies when it is a
    ``ParentedTree`` labelled ``'compound'`` whose first child is labelled
    ``'functor'`` and whose functor token has ``val`` equal to ``'\\\\+'`` or
    ``'not'``. Such a child is replaced in ``node`` by a new ``'unop'`` tree
    holding a ``NOT`` token and the first argument of the original compound.

    Nothing is returned; anything that is not a ``Tree`` is ignored.
    """
    if not isinstance(node, Tree):
        return
    for i, ni in enumerate(node):
        # is it a negation functor?
        if isinstance(ni, ParentedTree) and ni.label() == 'compound' and \
                ni[0].label() == 'functor' and ni[0][0].val in ['\\+','not']:
            # take first argument
            first = ni[1][0]
            # Clear the parent link before `first` is placed under the new
            # tree built below.
            # NOTE(review): presumably needed because ParentedTree refuses a
            # child that already has a parent - confirm against nltk.
            if isinstance(first, ParentedTree):
                first._parent = None
            # create a new tree
            # The replacement is stored both in the parent (node[i]) and in
            # the loop variable, so the recursion below descends into the new
            # 'unop' node rather than the old compound.
            ni = node[i] = ParentedTree(
                'unop', [Token('NOT', '\\+', ni[0][0].pos), first])
        norm_negation(ni)
Example #33
0
def template2tensor(templates, max_tmpl_len, dictionary):
    """Encode bracketed templates as a padded tensor of vocabulary indices.

    Each template is parsed into a tree, passed through ``tree2tmpl``, and
    its string form is split into tokens (brackets become separate tokens).
    A token ``w`` is kept only when ``"<w>"`` is a key of
    ``dictionary.word2idx``. Every row is wrapped in ``<sos>`` / ``<eos>``
    and right-padded with zeros.

    Args:
        templates: Sequence of bracketed template strings.
        max_tmpl_len: Maximum template length; rows have ``max_tmpl_len + 2``
            columns to leave room for the two boundary markers.
        dictionary: Object exposing a ``word2idx`` mapping.

    Returns:
        The index matrix as a tensor moved to the GPU with ``.cuda()``.
    """
    # np.int64 replaces the ``np.long`` alias: the alias is absent from
    # NumPy 1.24-1.26 and its width is platform-dependent where it exists,
    # whereas int64 is always converted to a 64-bit integer tensor.
    tmpls = np.zeros((len(templates), max_tmpl_len + 2), dtype=np.int64)
    for i, tp in enumerate(templates):
        tmpl_ = ParentedTree.fromstring(tp)
        tree2tmpl(tmpl_, 1, 2)
        # Pad brackets with spaces so they split off as their own tokens.
        tmpl_ = str(tmpl_).replace(")", " )").replace("(", "( ").split(" ")
        tmpl_ = [
            dictionary.word2idx[f"<{w}>"] for w in tmpl_
            if f"<{w}>" in dictionary.word2idx
        ]
        tmpl_ = [dictionary.word2idx["<sos>"]
                 ] + tmpl_ + [dictionary.word2idx["<eos>"]]
        tmpls[i, :len(tmpl_)] = tmpl_

    tmpls = torch.from_numpy(tmpls).cuda()

    return tmpls
Example #34
0
def get_features(relation: Relation, ptree: nltk.ParentedTree):
    """Collect connective features for one relation.

    :param relation: relation whose ``conn.tokens`` carry ``surface`` and
        ``local_idx`` attributes
    :param ptree: parse tree of the sentence containing the connective
    :return: dict with the connective's surface string, the label of the node
        located by ``lca``, the lemmatized item taken from the leaf before
        the connective (``"NONE"`` when the connective starts at index 0),
        and the value of ``get_connective_sentence_position``
    """
    surface = ' '.join(tok.surface for tok in relation.conn.tokens)
    positions = [tok.local_idx for tok in relation.conn.tokens]

    category = ptree[lca(ptree, positions)].label()

    if positions[0] != 0:
        # Element 0 of the leaf that directly precedes the connective.
        previous = lemmatizer.lemmatize(ptree.leaves()[positions[0] - 1][0])
    else:
        previous = "NONE"

    return {
        'Connective': surface,
        'ConnectivePOS': category,
        'ConnectivePrev': previous,
        'connectivePosition': get_connective_sentence_position(positions, ptree),
    }
Example #35
0
def terms_inference(sentences, terms_trie):
    """
    Infer term occurrences in (tokenized and tagged) sentences using a trie
    of terms, and return the resulting sentence trees.

    Args:
        sentences: shallow-parsed text
        terms_trie: trie of terms
    Return:
        list of shallow parse trees with inferred terms,
        dictionary mapping each term to the list of its TERM nodes
    """
    trees = []
    positions = defaultdict(list)

    for sentence in sentences:
        root = ParentedTree('S', [])
        cursor = 0

        while cursor < len(sentence):
            label, length = _longest_matching_term(sentence, cursor, terms_trie)

            if not length > 0:
                # No term starts at the current position: emit a plain token.
                _append_word_token(root, sentence[cursor])
                cursor += 1
                continue

            # A term was found: group its tokens under one TERM node.
            node = ParentedTree('TERM', [])
            term = name_to_term(label)
            node.term = term
            positions[term].append(node)

            for token in sentence[cursor:cursor + length]:
                _append_word_token(node, token)
            root.append(node)

            cursor += length

        trees.append(root)

    return trees, positions
# Build a one-word tree for every single-character word and remember where it
# lands in NewForest.
for single_char_word in SingleCharWord:

  # A word without an updated tag vector aborts the whole pass.
  if single_char_word not in UpdatedVec:
    tag_set=Word2Tag[single_char_word]
    print('Fail!')
    break

  tag_set=UpdatedVec[single_char_word]
  tag_str=set2str(tag_set)

  # Single-level tree: the '_u'-suffixed tag string directly over the word
  # (the two-level '_u' over '_b' variant is not used here).
  tree_str=''.join(['( ', tag_str, '_u ', single_char_word, ' )'])
  tree=ParentedTree(tree_str)

  # The tree's position in NewForest doubles as its id.
  index=len(NewForest)
  NewForest.append(tree)
  Word2treeID[single_char_word]=index

print('done! Such trees have been appended to NewForest, and word2treeId mapping has been stored in Word2treeID hashtable.')





#--------------------------->>> The following is the part that differs from 4_mini_tree_seq_gen.py  <<<--------------

#
Example #37
0
 def test_read_normal_tree(self):
     """Parse a plain bracketed tree through ``LeafReader`` and print it."""
     reader = LeafReader('I have a book.')
     bracketed = '(S1 (S (S (NP (PRP I)) (VP (VBP have) (NP (DT a) (NN book)))) (. .)))'
     parsed = ParentedTree.fromstring(bracketed, read_leaf=reader.read_leaf)
     # An infinite margin keeps the printed tree on a single line.
     print(parsed.pprint(margin=float("inf")))
Example #38
0
 def test_read_indexed_tree(self):
     """Parse a tree whose leaves are ``word|n|m`` triples and print it."""
     reader = IndexedLeafReader()
     bracketed = '(S1 (S (S (NP (PRP I|0|1)) (VP (VBP have|2|6) (NP (DT a|3|4) (NN book|9|13)))) (. .|13|14)))'
     parsed = ParentedTree.fromstring(bracketed, read_leaf=reader.read_leaf)
     # An infinite margin keeps the printed tree on a single line.
     print(parsed.pprint(margin=float("inf")))
Example #39
0
 def view(self, s):
     """Draw the feature-structure tree encoded in ``s``.

     Every ``set([...])`` fragment holding only digits, commas and spaces is
     rewritten to ``{...}``, the normalised string is echoed to stdout, and
     the tree is parsed with ``FeatStruct`` node labels and shown with
     ``draw()``.
     """
     s = re.sub(r"set\(\[([\d, ]*)\]\)", r"{\g<1>}", s)
     # A parenthesised single-argument print gives the same output on
     # Python 2 and is also valid Python 3 syntax.
     print(s)
     tree = ParentedTree.parse(s, node_pattern=r"\w*?\[.*?\]", parse_node=FeatStruct)
     tree.draw()
Example #40
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
from nltk import ParentedTree

# Path of the file holding the parses, separated by blank lines.
inputfile = sys.argv[1]

# Read everything up front and close the handle straight away instead of
# leaving it open for the lifetime of the script.
with open(inputfile, "r") as infile:
    parses = infile.read().split("\n\n")

for parse in parses:
    # Trailing blank lines in the input leave an empty chunk after the split;
    # there is no tree to build from it.
    if not parse.strip():
        continue
    t = ParentedTree(parse)
    # Emit each tree on one line, with no node separator and unquoted leaves.
    print(t._pprint_flat(nodesep="", parens="()", quotes=False))
Example #41
0
#
# 1st pass processing to  1. update the tree node label to set2str({possible tag associated with the leaves/strings}), i.e. concatenation of sorted possible tags for the string
#                         2. update InduceLeftNode and InduceRightNode hashtables.
#

count=0
print('\n>>1st pass of process the trees')

# Report progress about every tenth of the forest. The interval is clamped to
# at least 1 so that a forest with fewer than ten trees no longer triggers a
# modulo-by-zero.
progress_step=max(1, len(Forest)//10)

for tree in Forest:

  count +=1
  if count%progress_step==0:
      # Whole-number percentage; slicing the float's string form printed
      # "10" for 100% and "6." for 6.67%.
      print('progress------->', count*100//len(Forest), '% finished')

  # Relabel a fresh ParentedTree rebuilt from the printed form of the tree.
  new_tree=ParentedTree(tree.pprint())

  for subtree in new_tree.subtrees():  #update current tree

    string=''.join(subtree.leaves())

    if  string in Vec:  #leaves/string in the record

      # Only the subscript of the old label is carried over.
      tag, subscript= decompose_tag(subtree.node)

      tag_vec_str=set2str(Vec[string]) #get the tag-set of the node according to the leaves and convert it to str

      subtree.node=tag_vec_str+'_'+subscript  #update the node with the new_tag
                   dest="parse",
                   default="data/projectsyndicate/projectsyndicate.truecased.de.20.parse",
                   help="german parses parse persentence")
    opt.add_option("-m", dest="map", default="data/projectsyndicate/de-negra.map")
    opt.add_option("-s", dest="word2unipos", default="data/projectsyndicate/de.pos")
    (options, _) = opt.parse_args()
    word2pos = {}
    f = codecs.open(options.parse, 'r', 'utf-8')
    posmap = {}
    for line in codecs.open(options.map, 'r', 'utf-8').readlines():
        de_pos, uni_pos = line.split()
        posmap[de_pos] = uni_pos

    with f:
        for line in f:
            t = ParentedTree.fromstring(line.strip())
            for pos_token in t.subtrees(lambda t: t.height() == 2):
                pos_token = str(pos_token)[1:-1]
                (pos, token) = pos_token.split()
                token = token.encode('utf-8')
                s = word2pos.get(token, set())
                s.add(posmap[pos])
                word2pos[token] = s
    f.close()
    w = codecs.open(options.word2unipos, 'w', 'utf-8')
    for token, set_pos in word2pos.items():
        print token, ':', ','.join(set_pos)
        w.write(token.encode('utf-8') + '\t' + ','.join(set_pos) + '\n')
    w.flush()
    w.close()
def find_pronouns(tree):
    """Collect the pronouns found among the leaves of ``tree``.

    String children whose lower-cased form is in ``PRONOUNS`` are reported;
    ``ParentedTree`` children are searched recursively.

    Returns a list of ``(lower-cased pronoun, None)`` pairs in traversal
    order; the second slot is always ``None`` here.
    """
    found = []
    for child in tree:
        if type(child) in [unicode, str]:
            lowered = child.lower()
            if lowered in PRONOUNS:
                found.append((lowered, None))
        elif isinstance(child, ParentedTree):
            found.extend(find_pronouns(child))

    return found

# Tally pronoun statistics per treebank file. `stats` holds the counters of
# the file being processed, `files` collects one snapshot per file, and
# `total` is the running pronoun count across all files.
total = 0
for file in treebank.fileids():
    stats['name'] = file
    for tree in treebank.parsed_sents(file):
        tree = ParentedTree.convert(tree)
        for pronoun, np_node in find_pronouns(tree):
            if pronoun in gendered:
                stats['gendered'] += 1
            if pronoun in itits:
                stats['itits'] += 1
            stats['total'] += 1
            total += 1
            # Recomputed after every pronoun; stats['total'] is at least 1 here.
            stats['pct_gendered'] = stats['gendered']/float(stats['total'])
    # Python 2 print statement: file name followed by the running total.
    print file, total


    # Keep a snapshot of this file's counters, then zero every key (including
    # 'name', which is reassigned at the top of the next iteration).
    files.append(stats.copy())
    stats = dict.fromkeys(stats, 0)

Example #44
0
# Build a one-word tree for every single-character word and remember where it
# lands in NewForest.
for single_char_word in SingleCharWord:

  # A word without an updated tag vector aborts the whole pass.
  if single_char_word not in UpdatedVec:
    print('Fail!')
    break

  tag_set=UpdatedVec[single_char_word]
  tag_str=set2str(tag_set)

  # Single-level tree: the '_u'-suffixed tag string directly over the word.
  tree_str=''.join(['( ', tag_str, '_u ', single_char_word, ' )'])
  tree=ParentedTree(tree_str)

  # The tree's position in NewForest doubles as its id.
  index=len(NewForest)
  NewForest.append(tree)
  Word2treeID[single_char_word]=index

print('done! Such trees have been appended to NewForest, and word2treeId mapping has been stored in Word2treeID hashtable.')







# Build a two-level tree for every single-character word and remember where
# it lands in NewForest.
for single_char_word in SingleCharWord:

  # A word without an updated tag vector aborts the whole pass.
  if single_char_word not in UpdatedVec:
    tag_set=Word2Tag[single_char_word]
    print('Fail!')
    break

  tag_set=UpdatedVec[single_char_word]
  tag_str=set2str(tag_set)

  # Two-level tree: an '_l' node over a '_b' node over the word; the '_u'
  # suffix is not used in this variant.
  tree_str=''.join([' (   ', tag_str, '_l ', ' ( ', tag_str, '_b ', single_char_word, ' ) ) '])
  tree=ParentedTree(tree_str)

  # The tree's position in NewForest doubles as its id.
  index=len(NewForest)
  NewForest.append(tree)
  Word2treeID[single_char_word]=index

print('done! Such trees have been appended to NewForest, and word2treeId mapping has been stored in Word2treeID hashtable.')





#--------------------------->>> The following is the part that differs from 4_mini_tree_seq_gen.py  <<<--------------

#
Example #46
0
def getHead(syntac_sen):
	"""Return the head word of the parse stored in ``syntac_sen.text``.

	The text is parsed into a ParentedTree and the search starts at the
	tree's first child. It descends one step per loop iteration, guided by
	``head_trace_rule`` and three special-case rules, until the current node
	has height 2; the first leaf of that node is returned.
	"""
	t = ParentedTree(syntac_sen.text)


	target = t[0]

	while target.height() != 2:
		### non-trivial rules: no.1 
		# In an SBARQ, the first wh-phrase child with more than one child
		# becomes the target directly, bypassing the rule table.
		flag = 0
		parent = target
		if target.node == "SBARQ":
			for ts in target:
				if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"] and len(ts) > 1:
					
					target = ts
					flag = 1
					break	
		###
		if not flag:
			# Try the rules for this label in order; the first lookup that
			# returns something other than "" wins.
			# NOTE(review): a label missing from head_trace_rule raises
			# KeyError, and newTarget stays unbound (or stale from an earlier
			# iteration) if the rule list is empty or no rule[0] matches -
			# confirm the table rules this out.
			rules = head_trace_rule[target.node]
			#rules = head_trace_rule.get(target.node, [])
			for rule in rules:
				if rule[0] == "L":
					newTarget = LookByL(target, rule[1:])
				elif rule[0] == "R":
					newTarget = LookByR(target, rule[1:])
				elif rule[0] == "LBP":
					newTarget = LookByLBP(target, rule[1:])
				elif rule[0] == "RBP":
					newTarget = LookByRBP(target, rule[1:])
				if newTarget != "":
					break
			# No rule produced a head: fall back to the first child.
			if newTarget == "":
				target = target[0]
			else:
				target = newTarget
			#print target
			#print target.height()
		
		### non-trivial rules: no.2:
		# After the wh-phrase shortcut, look for "NN_<i> POS_" or
		# "NNS_<i> POS_" in the string from getLeafPOS; on a match the leaf
		# at index <i> is wrapped in a fresh one-word tree and becomes the
		# target.
		if flag:
			leafPos = getLeafPOS(target)
			m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
			if m != None:
				lvs = target.leaves()
				print m.groups()
				target = ParentedTree("("+m.group(1)+" "+lvs[int(m.group(2))]+")")

		### non-trivial rules: no.3
		# When the candidate head is one of the listed generic nouns, a PP is
		# looked for among the children of `parent` (the node this iteration
		# started from) and then at parent's right sibling.
		
		if target.height() == 2 and target.leaves()[0] in ["name", "kind", "type", "genre", "group", "part"]:
			print parent
			for k in parent:
				if k.node == "PP":
					target = k
					break
			# NOTE(review): the loop below tests pr.node, not p.node, so the
			# loop variable is unused; pr may also be None when parent has no
			# right sibling - confirm intent.
			pr = parent.right_sibling()
			for p in pr:
				if pr.node == "PP":
					target = pr
					break
				
	return target.leaves()[0]
Example #47
0
    def match(self, tree):
        """Store a private copy of ``tree`` as the query tree.

        The copy is rebuilt from the tree's string form and kept in
        ``self._query_tree``.

        Raises:
            AttributeError: if ``tree`` is not a ``ParentedTree``.
        """
        if isinstance(tree, ParentedTree):
            self._query_tree = ParentedTree.fromstring(str(tree))
        else:
            raise AttributeError
def main():
    """Score ``hobbs`` pronoun resolution against the key in coref_key.txt.

    For every file in FILENAMES and every parsed sentence in it, each pronoun
    returned by ``find_pronouns`` is resolved with ``hobbs`` and compared
    with the next line of the answer key. Per-pronoun, per-file and overall
    counts are printed along the way (Python 2 print statements).
    """
    # NOTE(review): this handle is never closed in the visible code.
    answers = open('coref_key.txt', 'r')
    this_correct = 0
    correct = 0
    total = 0
    # Earlier sentences, passed to hobbs(); cleared at the start of each file.
    prev_sentences = deque()
    for file in FILENAMES:
        this_correct = 0
        this_total = 0
        prev_sentences.clear()
        for tree in treebank.parsed_sents(file):


            tree = ParentedTree.convert(tree)

            for pronoun, np_node in find_pronouns(tree):

                # i = 0
                # for t in list(prev_sentences)[-3:]:
                #     t.pretty_print()
                #     print("-"*25)
                #     i = i + 1
                #     if i == 3: break
                proposed = hobbs_to_string(hobbs(np_node, pronoun.lower(), prev_sentences))
                tree.pretty_print()

                # One key line is consumed per pronoun, in traversal order.
                actual = answers.readline()

                # The last character of the key line (presumably the newline)
                # is dropped before comparing.
                if  proposed == actual[:-1]:
                    update_pronoun_results(pronoun, 1)
                    correct += 1
                    this_correct += 1

                # Called for every pronoun, whether or not it matched.
                # NOTE(review): presumably this bumps the per-category totals
                # - confirm against update_pronoun_results.
                update_pronoun_results(pronoun, 0)
                total += 1
                this_total += 1

                print "Pronoun: '" + pronoun + "'   Proposed: '" + proposed + "'   Actual: '" + actual + "'"

                if total: print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"


                print("*"*100)
                print("*"*100)
            # The sentence only joins the history after its own pronouns
            # have been resolved.
            prev_sentences.append(tree)
        print("-"*50)
        # The per-file summary is skipped when the file had no correct answers.
        if this_correct: print file,":\tCorrect:", this_correct, "\tTotal:", this_total, "\tPercentage:", this_correct/float(this_total), "\n"
        if total: print "Overall:\tCorrect:", correct, "\tTotal:", total, "\tPercentage:", correct/float(total), "\n"
        print("-"*50)

    # Per-category results read from PRONOUN_RESULTS.
    print "Male correct:", PRONOUN_RESULTS['male'], "\tMale total:", PRONOUN_RESULTS['male_total'], "\tPercent correct:", PRONOUN_RESULTS['male_pct']
    print "Female correct:", PRONOUN_RESULTS['female'], "\tFemale total:", PRONOUN_RESULTS['female_total'], "\tPercent correct:", PRONOUN_RESULTS['female_pct']
    print "Neutral correct:", PRONOUN_RESULTS['neutral'], "\tNeutral total:", PRONOUN_RESULTS['neutral_total'], "\tPercent correct:", PRONOUN_RESULTS['neutral_pct']
    print "Plural correct:", PRONOUN_RESULTS['they'], "\tPlural total:", PRONOUN_RESULTS['they_total'], "\tPercent correct:", PRONOUN_RESULTS['they_pct']
    print "Reflexive correct:", PRONOUN_RESULTS['reflexive'], "\tReflexive total:", PRONOUN_RESULTS['reflexive_total'], "\tPercent correct:", PRONOUN_RESULTS['reflexive_pct']
    # NOTE(review): unlike the guarded prints above, this divides by total
    # unconditionally and fails if no pronoun was processed.
    print "Total correct:", correct, "\tTotal:", total, "\tPercent correct:", correct/float(total)