def create_nltk_tree(sentence):
    tree_with_entities = Tree('S', [])
    raw_sentence = [token[0] for token in sentence]
    pos_tagged_sentence = ner_pipeline.part_of_speech_tagging(raw_sentence)

    current_sub_tree = None

    for index, token in enumerate(sentence):
        ne_tag = token[1]
        pos_tuple = pos_tagged_sentence[index]

        if ne_tag[0] == 'O':
            # Close any open entity subtree before appending a plain token.
            if current_sub_tree:
                tree_with_entities.append(current_sub_tree)
                current_sub_tree = None
            tree_with_entities.append(pos_tuple)
        else:
            if current_sub_tree:
                current_sub_tree.append(pos_tuple)
            else:
                current_sub_tree = Tree(ne_tag[2:], [pos_tuple])

    # Flush a trailing entity if the sentence ends inside one.
    if current_sub_tree:
        tree_with_entities.append(current_sub_tree)

    return tree_with_entities
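A minimal way to exercise this function, assuming `ner_pipeline.part_of_speech_tagging` returns one (word, POS) tuple per input token; the pipeline below is a hypothetical stand-in, not the project's real one:

from nltk import Tree

class FakePipeline:
    # Hypothetical stand-in for the external NER pipeline used above.
    def part_of_speech_tagging(self, words):
        return [(w, 'NNP' if w[0].isupper() else 'NN') for w in words]

ner_pipeline = FakePipeline()

# Tokens paired with BIO entity tags, as the function expects.
tagged = [('Alice', 'B-PER'), ('visited', 'O'), ('Berlin', 'B-LOC')]
print(create_nltk_tree(tagged))
# roughly: (S (PER Alice/NNP) visited/NN (LOC Berlin/NNP))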
Example #2
def _muc_read_text(s, top_node):
    # The tokenizer sometimes splits within coref tags.
    def __fix_tokenization(sents):
        for index in range(len(sents)):
            offset = 1
            while sents[index].count('<COREF') != sents[index].count('</COREF>'):
                sents[index] += ' '
                sents[index] += sents[index + offset]
                sents[index + offset] = ''
                offset += 1
        # Drop the sentences that were emptied by the merge above.
        return [sent for sent in sents if sent]

    if s:
        tree = Tree(top_node, [])
        if _MUC6_PARA_RE.match(s):
            for para in _MUC6_PARA_RE.findall(s):
                if para and para[0] and para[0].strip():
                    tree.append(Tree('P', []))
                    for sent in _MUC6_SENT_RE.findall(para[0]):
                        words = _MUC6_SENT_RE.match(sent[0]).group('sent').strip()
                        # There are empty sentences <s></s> in the MUC6 corpus.
                        if words:
                            tree[-1].append(_muc_read_words(words, 'S'))
        elif _MUC7_PARA_RE.match(s):
            for para in _MUC7_PARA_SPLIT_RE.split(s):
                if para and para.strip():
                    tree.append(Tree('P', []))
                    for sent in __fix_tokenization(_SENT_TOKENIZER.tokenize(para)):
                        tree[-1].append(_muc_read_words(sent, 'S'))
        return tree
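The inner __fix_tokenization pass merges any sentence whose <COREF ...> tags were left unbalanced by the tokenizer with the sentences that follow it. A standalone illustration of that behavior, with a made-up input:

def fix_tokenization(sents):
    # Same logic as __fix_tokenization above, outside the closure.
    for index in range(len(sents)):
        offset = 1
        while sents[index].count('<COREF') != sents[index].count('</COREF>'):
            sents[index] += ' ' + sents[index + offset]
            sents[index + offset] = ''
            offset += 1
    return [sent for sent in sents if sent]

print(fix_tokenization(['A <COREF ID="1">split', 'mention</COREF> here.', 'Next.']))
# ['A <COREF ID="1">split mention</COREF> here.', 'Next.']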
Example #3
def sentences_to_tree(paragraph, tree_with_entities=None):
    # Avoid a mutable default argument: a shared default Tree would keep
    # accumulating tokens across calls.
    if tree_with_entities is None:
        tree_with_entities = Tree('S', [])
    skip_count = 0

    for sentence in paragraph:
        for index, token in enumerate(sentence):
            if skip_count > 0:
                skip_count -= 1
                continue

            if 'annotation' in token:
                annotation = token['annotation']
                if annotation['label'] == 'NAE':
                    logging.info(
                        'nltk_tree_converter.sentences_to_tree: skipping NAE label'
                    )
                    continue

                length = annotation['length']
                sub_tree = Tree(annotation['label'], [token['term']])

                if length > 1:
                    skip_count = length - 1
                    for next_index in range((index + 1), (index + length)):
                        word = sentence[next_index]['term']
                        sub_tree.append(word)

                tree_with_entities.append(sub_tree)

            else:
                tree_with_entities.append(token['term'])

    return tree_with_entities
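A small end-to-end check, assuming each token is a dict with a 'term' and an optional 'annotation' carrying the entity 'label' and its 'length' in tokens (the shape implied by the code above):

from nltk import Tree

paragraph = [[
    {'term': 'Barack', 'annotation': {'label': 'PER', 'length': 2}},
    {'term': 'Obama'},
    {'term': 'spoke'},
]]
print(sentences_to_tree(paragraph))
# (S (PER Barack Obama) spoke)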
Example #5
def pas_to_tree(x):
    if isinstance(x, tuple):  # has children
        node = Tree(x[0], [])
        for child in x[1]:
            childnode = pas_to_tree(child)
            node.append(childnode)
    else:
        node = Tree(x, [])
    return node
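The function turns nested (label, children) tuples into an nltk.Tree, with non-tuple leaves becoming childless trees. For example:

from nltk import Tree

pas = ('S', [('NP', ['she']), ('VP', ['ran'])])
print(pas_to_tree(pas))
# (S (NP (she )) (VP (ran )))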
Example #6
def add_top_to_tree(treebank_file):
    root_set = set()
    with open(treebank_file, "r") as f:
        for sentence in f:
            t = Tree.fromstring(sentence, remove_empty_top_bracketing=False)
            # Wrap each parsed sentence in a TOP node before printing.
            top_node = Tree("TOP", [])
            top_node.append(t)
            print(NewTree.flat_print(top_node))
Example #7
    def _construct_node_from_actions(self,
                                     current_node: Tree,
                                     remaining_actions: List[List[str]],
                                     add_var_function: bool) -> List[List[str]]:
        """
        Given a current node in the logical form tree, and a list of actions in an action sequence,
        this method fills in the children of the current node from the action sequence, then
        returns whatever actions are left.

        For example, we could get a node with type ``c``, and an action sequence that begins with
        ``c -> [<r,c>, r]``.  This method will add two children to the input node, consuming
        actions from the action sequence for nodes of type ``<r,c>`` (and all of its children,
        recursively) and ``r`` (and all of its children, recursively).  This method assumes that
        action sequences are produced `depth-first`, so all actions for the subtree under ``<r,c>``
        appear before actions for the subtree under ``r``.  If there are any actions in the action
        sequence after the ``<r,c>`` and ``r`` subtrees have terminated in leaf nodes, they will be
        returned.
        """
        if not remaining_actions:
            logger.error("No actions left to construct current node: %s", current_node)
            raise ParsingError("Incomplete action sequence")
        left_side, right_side = remaining_actions.pop(0)
        if left_side != current_node.label():
            logger.error("Current node: %s", current_node)
            logger.error("Next action: %s -> %s", left_side, right_side)
            logger.error("Remaining actions were: %s", remaining_actions)
            raise ParsingError("Current node does not match next action")
        if right_side[0] == '[':
            # This is a non-terminal expansion, with more than one child node.
            for child_type in right_side[1:-1].split(', '):
                if child_type.startswith("'lambda"):
                    # We need to special-case the handling of lambda here, because it's handled a
                    # bit weirdly in the action sequence.  This is stripping off the single quotes
                    # around something like `'lambda x'`.
                    child_type = child_type[1:-1]
                child_node = Tree(child_type, [])
                current_node.append(child_node)  # you add a child to an nltk.Tree with `append`
                if not self.is_terminal(child_type):
                    remaining_actions = self._construct_node_from_actions(child_node,
                                                                          remaining_actions,
                                                                          add_var_function)
        elif self.is_terminal(right_side):
            # The current node is a pre-terminal; we'll add a single terminal child.  We need to
            # check first for whether we need to add a (var _) around the terminal node, though.
            if add_var_function and right_side in self._lambda_variables:
                right_side = f"(var {right_side})"
            if add_var_function and right_side == 'var':
                raise ParsingError('add_var_function was true, but action sequence already had var')
            current_node.append(Tree(right_side, []))  # you add a child to an nltk.Tree with `append`
        else:
            # The only way this can happen is if you have a unary non-terminal production rule.
            # That is almost certainly not what you want with this kind of grammar, so we'll crash.
            # If you really do want this, open a PR with a valid use case.
            raise ParsingError(f"Found a unary production rule: {left_side} -> {right_side}. "
                               "Are you sure you want a unary production rule in your grammar?")
        return remaining_actions
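Stripped of the class machinery (is_terminal, lambda handling, error reporting), the core technique in this method is a depth-first recursive descent over the action list. A self-contained sketch of just that idea; the names here are illustrative, not AllenNLP's API:

from nltk import Tree

def build_tree(node, actions):
    # Consume the action expanding `node`, then recurse into each child
    # before touching the next sibling (depth-first, like the method above).
    left_side, right_side = actions.pop(0)
    assert left_side == node.label()
    if right_side.startswith('['):
        for child_type in right_side[1:-1].split(', '):
            child = Tree(child_type, [])
            node.append(child)
            build_tree(child, actions)
    else:
        node.append(Tree(right_side, []))  # pre-terminal: one terminal child
    return actions

root = Tree('S', [])
build_tree(root, [['S', '[NP, VP]'], ['NP', 'dogs'], ['VP', 'bark']])
print(root)  # (S (NP (dogs )) (VP (bark )))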
Example #8
    def _append(self, node: nltk.Tree, children):
        add_to_stack = []
        for child in children:
            if nltk.grammar.is_nonterminal(child):
                new_node = nltk.Tree(child, [])
                node.append(new_node)
                add_to_stack.append(new_node)
            else:
                node.append(child)
        if add_to_stack:
            self.stack.extend(add_to_stack[::-1])
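The reversal at the end is the key design choice: with a LIFO stack, pushing children in reverse order means the leftmost child is popped first, so the tree is expanded left to right. A two-line illustration:

from nltk.grammar import Nonterminal

stack = []
stack.extend([Nonterminal('NP'), Nonterminal('VP')][::-1])
assert stack.pop() == Nonterminal('NP')  # leftmost child comes off first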
Example #9
def reduce_nps(sentence):
    """
    Reduce any subtree that contains only a single child by replacing it
    with that child.
    """
    res = Tree('S', [])
    for child in sentence:
        if isinstance(child, Tree) and len(child) == 1:
            # Hoist the lone child in place of its wrapper tree.
            res.append(child[0])
            continue
        res.append(child)
    return res
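One level of wrapping is removed per call; for instance:

from nltk import Tree

sent = Tree('S', [Tree('NP', [Tree('NP', [('dogs', 'NNS')])]), ('bark', 'VBP')])
print(reduce_nps(sent))
# (S (NP dogs/NNS) bark/VBP)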
Example #11
    def _construct_node_from_actions(
            self, current_node: Tree,
            remaining_actions: List[List[str]]) -> List[List[str]]:
        """
        Given a current node in the logical form tree, and a list of actions in an action sequence,
        this method fills in the children of the current node from the action sequence, then
        returns whatever actions are left.

        For example, we could get a node with type ``c``, and an action sequence that begins with
        ``c -> [<r,c>, r]``.  This method will add two children to the input node, consuming
        actions from the action sequence for nodes of type ``<r,c>`` (and all of its children,
        recursively) and ``r`` (and all of its children, recursively).  This method assumes that
        action sequences are produced `depth-first`, so all actions for the subtree under ``<r,c>``
        appear before actions for the subtree under ``r``.  If there are any actions in the action
        sequence after the ``<r,c>`` and ``r`` subtrees have terminated in leaf nodes, they will be
        returned.
        """
        if not remaining_actions:
            logger.error("No actions left to construct current node: %s",
                         current_node)
            raise ParsingError("Incomplete action sequence")
        left_side, right_side = remaining_actions.pop(0)
        if left_side != current_node.label():
            logger.error("Current node: %s", current_node)
            logger.error("Next action: %s -> %s", left_side, right_side)
            logger.error("Remaining actions were: %s", remaining_actions)
            raise ParsingError("Current node does not match next action")
        if right_side[0] == '[':
            # This is a non-terminal expansion, with more than one child node.
            for child_type in right_side[1:-1].split(', '):
                child_node = Tree(child_type, [])
                current_node.append(child_node)  # you add a child to an nltk.Tree with `append`
                # For now, we assume that all children in a list like this are non-terminals, so we
                # recurse on them.  I'm pretty sure that will always be true for the way our
                # grammar induction works.  We can revisit this later if we need to.
                remaining_actions = self._construct_node_from_actions(
                    child_node, remaining_actions)
        else:
            # The current node is a pre-terminal; we'll add a single terminal child.  By
            # construction, the right-hand side of our production rules are only ever terminal
            # productions or lists of non-terminals.
            current_node.append(Tree(right_side, []))  # you add a child to an nltk.Tree with `append`
        return remaining_actions
Example #13
    def _construct_node_from_actions(
            self, current_node: Tree, remaining_actions: List[List[str]],
            add_var_function: bool) -> List[List[str]]:
        """
        Given a current node in the logical form tree, and a list of actions in an action sequence,
        this method fills in the children of the current node from the action sequence, then
        returns whatever actions are left.

        For example, we could get a node with type ``c``, and an action sequence that begins with
        ``c -> [<r,c>, r]``.  This method will add two children to the input node, consuming
        actions from the action sequence for nodes of type ``<r,c>`` (and all of its children,
        recursively) and ``r`` (and all of its children, recursively).  This method assumes that
        action sequences are produced `depth-first`, so all actions for the subtree under ``<r,c>``
        appear before actions for the subtree under ``r``.  If there are any actions in the action
        sequence after the ``<r,c>`` and ``r`` subtrees have terminated in leaf nodes, they will be
        returned.
        """
        if not remaining_actions:
            logger.error("No actions left to construct current node: %s",
                         current_node)
            raise ParsingError("Incomplete action sequence")
        left_side, right_side = remaining_actions.pop(0)
        if left_side != current_node.label():
            mismatch = True
            multi_match_mapping = {
                str(key): [str(value) for value in values]
                for key, values in self.get_multi_match_mapping().items()
            }
            current_label = current_node.label()
            if (current_label in multi_match_mapping
                    and left_side in multi_match_mapping[current_label]):
                mismatch = False
            if mismatch:
                logger.error("Current node: %s", current_node)
                logger.error("Next action: %s -> %s", left_side, right_side)
                logger.error("Remaining actions were: %s", remaining_actions)
                raise ParsingError("Current node does not match next action")
        if right_side[0] == '[':
            # This is a non-terminal expansion, with more than one child node.
            for child_type in right_side[1:-1].split(', '):
                if child_type.startswith("'lambda"):
                    # We need to special-case the handling of lambda here, because it's handled a
                    # bit weirdly in the action sequence.  This is stripping off the single quotes
                    # around something like `'lambda x'`.
                    child_type = child_type[1:-1]
                child_node = Tree(child_type, [])
                current_node.append(child_node)  # you add a child to an nltk.Tree with `append`
                if not self.is_terminal(child_type):
                    remaining_actions = self._construct_node_from_actions(
                        child_node, remaining_actions, add_var_function)
        elif self.is_terminal(right_side):
            # The current node is a pre-terminal; we'll add a single terminal child.  We need to
            # check first for whether we need to add a (var _) around the terminal node, though.
            if add_var_function and right_side in self._lambda_variables:
                right_side = f"(var {right_side})"
            if add_var_function and right_side == 'var':
                raise ParsingError(
                    'add_var_function was true, but action sequence already had var'
                )
            current_node.append(Tree(right_side, []))  # you add a child to an nltk.Tree with `append`
        else:
            # The only way this can happen is if you have a unary non-terminal production rule.
            # That is almost certainly not what you want with this kind of grammar, so we'll crash.
            # If you really do want this, open a PR with a valid use case.
            raise ParsingError(
                f"Found a unary production rule: {left_side} -> {right_side}. "
                "Are you sure you want a unary production rule in your grammar?"
            )
        return remaining_actions
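The difference from the variant in Example #7 is the multi-match tolerance above: a node whose label is a multi-match type may be rewritten by an action whose left side is any of the types it maps to. The check itself, in isolation (the mapping here is hypothetical):

multi_match_mapping = {'#PH0': ['<r,c>', '<c,r>']}  # hypothetical mapping
current_label, left_side = '#PH0', '<r,c>'
mismatch = not (current_label in multi_match_mapping
                and left_side in multi_match_mapping[current_label])
print(mismatch)  # False: the action is accepted despite the label mismatch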
Example #14
def conlltags2tree(sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False):
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith('B-'):
            # Start a new chunk subtree.
            if isinstance(word, Tree):
                tree.append(Tree(chunktag[2:], [word]))
            else:
                tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree) == 0 or not isinstance(tree[-1], Tree)
                    or tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # An I- tag with no matching open chunk: treat as B-*
                    if isinstance(word, Tree):
                        tree.append(Tree(chunktag[2:], [word]))
                    else:
                        tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                # Continue the chunk opened by the previous B-/I- tag.
                if isinstance(word, Tree):
                    tree[-1].append(word)
                else:
                    tree[-1].append((word, postag))
        elif chunktag == 'O':
            if isinstance(word, Tree):
                tree.append(word)
            else:
                tree.append((word, postag))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return tree
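A quick check on a small IOB-tagged sentence, assuming the standard nltk Tree import:

from nltk import Tree

sent = [('the', 'DT', 'B-NP'), ('dog', 'NN', 'I-NP'), ('barked', 'VBD', 'O')]
print(conlltags2tree(sent))
# (S (NP the/DT dog/NN) barked/VBD)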
Example #15
    def _maxent_calculation(self):
        TAGGER_PCL = settings.ABS_PATH("maxent_tagger.pcl")
        print "Calculating Precision/Recall using Custom trained MaxEnt, 80/20 dataset split," " ordered by id from DB."
        true_pos_total = 0
        false_pos_total = 0
        correct_total = 0
        _end = "_end_"

        for line, is_valid in self.Model.judged_data.items():
            ngram, article = line.split(",")
            self.article_rel_dict[article][int(is_valid)].add(ngram)

        def make_trie(ngrams):
            """
            Make trie out of set of ngrams
            """
            root = {}
            for ngram in ngrams:
                current_dict = root
                for word in ngram.split():
                    current_dict = current_dict.setdefault(word, {})
                current_dict = current_dict.setdefault(_end, _end)
            return root

        def in_trie(index, sentence_tagged, trie):
            """Return the longest tagged n-gram starting at `index` found in the trie."""
            result = []
            while True:
                end = _end in trie
                if index >= len(sentence_tagged):
                    # Reached the end of the sentence while still inside the trie.
                    if not end:
                        result = []
                    break
                word = sentence_tagged[index][0]
                norm_word = nlp.Stemmer.stem_wordnet(word)
                trie = trie.get(norm_word)
                if trie:
                    result.append(sentence_tagged[index])
                    index += 1
                else:
                    if not end:
                        result = []
                    break
            return result

        # generating training file
        train = []
        test_sentences_tagged = defaultdict(list)
        print "Generating training/test data..."
        queryset = Article.objects.filter(cluster_id=self.cluster_id).order_by("id")
        queryset_len = len(queryset)

        # train data of the form [[((word1, POS1), tag1), ((word2, POS2), tag2), ... ], sentence2, ...]
        for article_index, article in enumerate(queryset):
            # skip train generation if tagger exists
            if os.path.exists(TAGGER_PCL) and article_index / queryset_len <= 0.8:
                continue
            correct_ngrams_set = self.article_rel_dict[str(article)][1]
            identified_correct = set()
            correct_ngrams = make_trie(correct_ngrams_set)
            for sentence in nltk.sent_tokenize(article.text):
                sentence_tagged = nltk.pos_tag(nltk.regexp_tokenize(sentence, nlp.Stemmer.TOKENIZE_REGEXP))
                sent_tree = Tree("S", [])
                # identify ngrams in the sentence
                i = 0
                while i < len(sentence_tagged):
                    result = in_trie(i, sentence_tagged, correct_ngrams)
                    if result:
                        sent_tree.append(Tree("CON", result))
                        identified_correct.add(nlp.Stemmer.stem_wordnet(" ".join(next(zip(*result)))))
                        i += len(result)
                    else:
                        sent_tree.append(sentence_tagged[i])
                        i += 1
                if article_index / queryset_len <= 0.8:
                    train.append(sent_tree)
                else:
                    test_sentences_tagged[str(article)].append(sentence_tagged)
            diff = correct_ngrams_set.difference(identified_correct)
            if diff:
                # TODO: list of correct n-gram that we did not find for some reason
                # ideally should be empty
                print(diff)
                print(article)
                print()

        print "Finished data generation"

        if os.path.exists(TAGGER_PCL):
            print("Pickled tagger exists. Reading it...")
            with open(TAGGER_PCL, "rb") as tagger_file:
                tagger = pickle.load(tagger_file)
        else:
            print("Training tagger on 80% of data...")
            tagger = NEChunkParser(train)
            print("Finished training tagger")
            print("Pickling tagger for later use...")
            with open(TAGGER_PCL, "wb") as tagger_file:
                pickle.dump(tagger, tagger_file)

        print "Calculating precision..."
        for article, sentences in test_sentences_tagged.iteritems():
            print article
            results = [tagger.parse(sentence) for sentence in sentences]
            ne_set = set()
            for result in results:
                for tree in result.subtrees():
                    if tree.label() != "S" and len(tree) > 1:
                        ne_set.add(nlp.Stemmer.stem_wordnet(" ".join(next(zip(*tree))).lower()))
            correct_objects = self.article_rel_dict[str(article)][1]
            incorrect_objects = self.article_rel_dict[str(article)][0]
            true_pos = [x for x in ne_set if x in correct_objects]
            false_pos = [x for x in ne_set if x in incorrect_objects]
            true_pos_total += len(true_pos)
            false_pos_total += len(false_pos)
            correct_total += len(correct_objects)

            unjudged_objects = [x for x in ne_set if x not in incorrect_objects and x not in correct_objects]
            print "WARN: Unjudged objects:", unjudged_objects

        precision = true_pos_total / (true_pos_total + false_pos_total)
        recall = true_pos_total / correct_total
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 measure:", 2 * (precision * recall) / (precision + recall))