Beispiel #1
0
    def identify_emerging_concepts(self,
                                   sent: Span,
                                   section: Section,
                                   graph: ConceptGraph,
                                   rule_based=True):
        """Identify concepts in a given sentence that are likely to be emerging concepts.

        Only tokens whose dependency label is ``ROOT`` are inspected. Depending on the lemma of the root, candidate
        concept tokens are picked as follows:

        * ``be``: the left children whose label ends in ``subj``, provided the root has an ``attr`` child on its
          right;
        * ``define``: the left children whose label ends in ``subjpass``, falling back to the right children
          labelled ``dobj`` when there is no passive subject;
        * ``call``: the right children labelled ``oprd``.

        The subtrees of the candidates are joined into a single phrase (minus a leading determiner and any
        whitespace-only tokens). The phrase is recorded as an emerging concept if its Zipf frequency is below
        ``self.emerging_concept_frequency_cutoff``.

        :param sent: A spaCy span representing a sentence in a document.
        :param section: The section the sentence appears in.
        :param graph: The concept graph to record the emerging concepts in.
        :param rule_based: Flag indicating whether or not to use the rule-based classifier. When false the method
                           does nothing.
        """
        if not rule_based:
            return

        for root in sent:
            if root.dep_ != 'ROOT':
                continue

            candidates = []

            if root.lemma_ == 'be':
                if any(right.dep_ == 'attr' for right in root.rights):
                    candidates = [
                        left for left in root.lefts
                        if left.dep_.endswith('subj')
                    ]
            elif root.lemma_ == 'define':
                candidates = [
                    left for left in root.lefts
                    if left.dep_.endswith('subjpass')
                ]

                # Building a list never raises StopIteration, so the fallback to the direct object has to be
                # triggered explicitly when no passive subject was found.
                if not candidates:
                    candidates = [
                        right for right in root.rights if right.dep_ == 'dobj'
                    ]
            elif root.lemma_ == 'call':
                candidates = [
                    right for right in root.rights if right.dep_ == 'oprd'
                ]

            tokens = []

            for candidate in candidates:
                tokens += candidate.subtree

            if not tokens:
                continue

            # Drop a leading determiner, e.g. 'the' in 'the widget'.
            if tokens[0].tag_ == 'DT':
                tokens = tokens[1:]

            words = [tok.text for tok in tokens if tok.text.strip()]
            node = Node(' '.join(words))

            if node != '' and zipf_frequency(
                    node, 'en') < self.emerging_concept_frequency_cutoff:
                if node not in graph.nodes:
                    graph.add_node(node, section)

                graph.emerging_concepts.add(node)
Beispiel #2
0
    def add_implicit_references(self, pos_tags: List[Tuple[str, str]],
                                section: Section, graph: ConceptGraph):
        """Add the nodes and edges implied by a POS tagged phrase to a graph.

        Every ``(implicit_entity, context)`` pair produced by `permutations()` results in one node for the
        implicit entity and one `ImplicitReference` edge between the context and that entity.

        :param pos_tags: A phrase as a list of token, tag pairs.
        :param section: The section that the phrase appears in.
        :param graph: The graph to add the derived nodes and edges to.
        """
        derived_pairs = self.permutations(pos_tags)

        for entity, referent in derived_pairs:
            graph.add_node(entity, section)
            graph.add_edge(referent, entity, ImplicitReference)
def evaluate_random(a_priori_concepts, backward_references, basename,
                    emerging_concepts, filename, forward_references,
                    output_dir, parsers, random_trials):
    """Evaluate randomly labelled graphs as a baseline for each parser.

    For every parser the document is parsed once. Then, for each trial, every node is assigned to either the
    a priori or the emerging concept set with equal probability, the edges are rebuilt as plain `DirectedEdge`
    instances (keeping style and frequency), re-marked, and the graph is passed to `evaluate()`.

    :param a_priori_concepts: Passed through to `evaluate()`.
    :param backward_references: Passed through to `evaluate()`.
    :param basename: Indexable whose first element is used in the output file name
                     (presumably the result of `os.path.splitext` - confirm against the caller).
    :param emerging_concepts: Passed through to `evaluate()`.
    :param filename: The document to parse.
    :param forward_references: Passed through to `evaluate()`.
    :param output_dir: Prefix for the CSV paths; no separator is inserted, so it should end with one. When falsy
                       no files are written.
    :param parsers: The parser instances to evaluate.
    :param random_trials: Number of random labellings to evaluate per parser.
    """
    for parser in parsers:
        parser_name = parser.__class__.__name__

        graph = ConceptGraph(parser)
        graph.parse(filename)

        for trial in range(random_trials):
            # Mark concepts
            graph.a_priori_concepts = set()
            graph.emerging_concepts = set()

            for node in graph.nodes:
                if random.uniform(0, 1) < 0.5:
                    graph.a_priori_concepts.add(node)
                else:
                    graph.emerging_concepts.add(node)

            # Redo edges
            edges = graph.edges.copy()

            for edge in edges:
                new_edge = DirectedEdge(edge.tail, edge.head)
                new_edge.style = edge.style
                new_edge.frequency = edge.frequency

                graph.set_edge(new_edge)

            graph.mark_edges()

            # Evaluate graph
            df = evaluate(graph, a_priori_concepts, emerging_concepts,
                          forward_references, backward_references)

            if output_dir:
                path = f'{output_dir}{parser_name}-{basename[0]}-random_{trial + 1}.csv'

                # newline='' lets the CSV writer control line endings; without it rows end in '\r\r\n' on
                # platforms that translate newlines.
                with open(path, 'w', newline='') as f:
                    df.to_csv(f)

            print(f'\rTrial {trial + 1} of {random_trials} for {parser_name}',
                  end='')

        print()
def evaluate_deterministic(a_priori_concepts, backward_references, basename,
                           emerging_concepts, filename, forward_references,
                           output_dir, parsers):
    """Evaluate the graph produced by each parser against the given annotations.

    For every parser the document is parsed into a fresh `ConceptGraph`, the graph is passed to `evaluate()`,
    and the result is printed and optionally saved as CSV.

    :param a_priori_concepts: Passed through to `evaluate()`.
    :param backward_references: Passed through to `evaluate()`.
    :param basename: Indexable whose first element is used in the output file name
                     (presumably the result of `os.path.splitext` - confirm against the caller).
    :param emerging_concepts: Passed through to `evaluate()`.
    :param filename: The document to parse.
    :param forward_references: Passed through to `evaluate()`.
    :param output_dir: Prefix for the CSV paths; no separator is inserted, so it should end with one. When falsy
                       no files are written.
    :param parsers: The parser instances to evaluate.
    """
    for parser in parsers:
        parser_name = parser.__class__.__name__

        graph = ConceptGraph(parser)
        graph.parse(filename)

        df = evaluate(graph, a_priori_concepts, emerging_concepts,
                      forward_references, backward_references)

        if output_dir:
            path = f'{output_dir}{parser_name}-{basename[0]}.csv'

            # newline='' lets the CSV writer control line endings; without it rows end in '\r\r\n' on
            # platforms that translate newlines.
            with open(path, 'w', newline='') as f:
                df.to_csv(f)

            print(f'Saved results for {parser_name} to {path}')

        print(f'Results for: {parser_name}')
        print(df)
        print()
Beispiel #5
0
    def add_gerund_phrase(self, subject: Node, section: Section,
                          sentence: Span, graph: ConceptGraph):
        """Add gerund (verb) phrases to the graph.

        For gerunds without an object, just the verb is added to the graph.
        For all other gerunds, the object is added to the graph and the edge between the subject and object is annotated
        with the S-form of the gerund. For example, 'Tom likes cake.' yields the nodes 'Tom' and 'cake' connected by
        an edge annotated with 'likes'.

        Only the first right child whose dependency label contains 'obj' is considered. When `self.annotate_edges`
        is false, a gerund that has such an object contributes nothing to the graph.

        The S-form is built from the lemma with the regular spelling rules: sibilant and '-o' endings take 'es'
        ('fix' -> 'fixes', 'go' -> 'goes'), consonant + 'y' becomes 'ies' ('carry' -> 'carries'), everything else
        (including vowel + 'y', 'play' -> 'plays') takes 's'. Irregular verbs are not special-cased.

        :param subject: The subject of the sentence the gerund was found in.
        :param section: The section the gerund (and its sentence) was found in.
        :param sentence: The sentence the gerund was found in.
        :param graph: The graph to add the gerund phrase to.
        """
        for gerund in filter(lambda token: token.tag_ == 'VBG', sentence):
            verb = Node(gerund.text)

            # TODO: Add edge between gerund and object
            # TODO: Remove redundant edge between subject and object since that relation is represented
            #  through the path subject -> verb -> object.
            # TODO: Refactor verbal phrase stuff such that we instead have
            #  subject -- verb (S-form) --> object
            #  including is_a and has_a relations.
            for right in gerund.rights:
                if 'obj' not in right.dep_:
                    continue

                if self.annotate_edges:
                    object_ = Node(right.text)
                    graph.add_node(object_, section)

                    the_edge = graph.add_edge(subject, object_)
                    lemma = gerund.lemma_

                    if lemma.endswith(('s', 'sh', 'ch', 'x', 'z', 'o')):
                        the_edge.label = lemma + 'es'
                    elif (lemma.endswith('y') and len(lemma) > 1
                          and lemma[-2].lower() not in 'aeiou'):
                        # Only consonant + 'y' changes to 'ies'; 'play' must stay 'plays'.
                        the_edge.label = lemma[:-1] + 'ies'
                    else:
                        the_edge.label = lemma + 's'

                # Stop at the first object, whether or not an edge was added for it.
                break
            else:
                # No object found: the gerund itself becomes a node.
                graph.add_node(verb, section)
                graph.add_edge(subject, verb)
Beispiel #6
0
def main(file,
         n_samples=-1,
         disable_implicit_references=False,
         disable_reference_marking=False,
         disable_edge_annotation=False,
         disable_summary=False,
         disable_graph_rendering=False):
    """Run an experiment testing how ordering of sections affects the scoring of conceptual density for a given
    document.

    NOTE: This is very slow for documents with any more than 7 sections due to the O(n!) time complexity of checking
    each permutation of section ordering.

    :param file: The document to parse.
    :param n_samples: -1 to evaluate every permutation of the sections, or a positive number of random
                      permutations to evaluate instead.
    :param disable_implicit_references: Passed (negated) to the `XMLParser` constructor.
    :param disable_reference_marking: Passed (negated) to `ConceptGraph` as `mark_references`.
    :param disable_edge_annotation: Passed (negated) to the `XMLParser` constructor.
    :param disable_summary: Skip printing the graph summary.
    :param disable_graph_rendering: Skip rendering the graph.
    :raises AssertionError: If `n_samples` is neither -1 nor a positive integer.
    """
    # Checked up front so that a bad argument fails before the document is parsed and rendered.
    assert n_samples == -1 or n_samples > 0, 'Parameter `n-samples` must be -1 or a positive integer.'

    graph = ConceptGraph(parser=XMLParser(not disable_edge_annotation,
                                          not disable_implicit_references),
                         mark_references=not disable_reference_marking)
    graph.parse(file)

    if not disable_summary:
        graph.print_summary()

    print('Original Section Ordering: %s' % graph.sections)
    print('Score on Original Ordering: %.2f' % graph.score())

    if not disable_graph_rendering:
        graph.render()

    scores = []
    permutations = []

    if n_samples > 0:
        for i in range(n_samples):
            permutation = np.random.permutation(graph.sections)

            evaluate_permutation(graph, i, permutation.tolist(), permutations,
                                 scores)
    else:
        for i, permutation in enumerate(itertools.permutations(
                graph.sections)):
            evaluate_permutation(graph, i, permutation, permutations, scores)

    print('\nDone.')

    min_score = min(scores)
    max_score = max(scores)
    spread = max_score - min_score

    # The sample standard deviation needs at least two data points (e.g. `n_samples=1` yields only one) and
    # the ratio is undefined for a maximum of zero; report NaN rather than crash after all the work is done.
    std_dev = statistics.stdev(scores) if len(scores) > 1 else float('nan')
    diff_ratio = spread / max_score if max_score else float('nan')

    if len(scores) > 10:
        print('scores: %s...' % ['%.2f' % score for score in scores[:10]])
    else:
        print('scores: %s' % ['%.2f' % score for score in scores])

    print('min: %.2f - ordering: %s' %
          (min_score, permutations[scores.index(min_score)]))
    print('max: %.2f - ordering: %s' %
          (max_score, permutations[scores.index(max_score)]))
    print('Mean: %.2f - Std. Dev.: %.2f' %
          (sum(scores) / len(scores), std_dev))
    print('Max Absolute Difference: %.2f - Max Diff. Ratio: %.2f' %
          (spread, diff_ratio))
Beispiel #7
0
def main(file,
         parser_type='default',
         disable_coreference_resolution=False,
         disable_implicit_references=False,
         disable_edge_annotation=False,
         disable_reference_marking=False,
         disable_summary=False,
         disable_graph_rendering=False,
         debug_mode=False):
    """Parse a text document and produce a score relating to conceptual density.

    Exits with status 1 if the file cannot be parsed as XML and with status 2 if it cannot be found.

    :param file: The document to parse.
    :param parser_type: One of 'openie', 'corenlp', 'ensemble' or 'default'. Any other value falls back to the
                        default parser (`XMLParser`) with a warning.
    :param disable_coreference_resolution: Passed (negated) as the third parser constructor argument.
    :param disable_implicit_references: Passed (negated) as the second parser constructor argument.
    :param disable_edge_annotation: Passed (negated) as the first parser constructor argument.
    :param disable_reference_marking: Passed (negated) to `ConceptGraph` as `mark_references`.
    :param disable_summary: Skip printing the graph summary.
    :param disable_graph_rendering: Skip rendering the graph.
    :param debug_mode: Dump the graph's reference and concept sets to stderr.
    """

    if parser_type == 'openie':
        parser_type = OpenIEParser
    elif parser_type == 'corenlp':
        parser_type = CoreNLPParser
    elif parser_type == 'ensemble':
        parser_type = EnsembleParser
    else:
        if parser_type != 'default':
            warnings.warn(
                'Unrecognised parser type \'%s\' - using default parser.' %
                parser_type)

        parser_type = XMLParser

    parser = parser_type(not disable_edge_annotation,
                         not disable_implicit_references,
                         not disable_coreference_resolution)
    graph = ConceptGraph(parser=parser,
                         mark_references=not disable_reference_marking)

    start = datetime.now()

    try:
        graph.parse(file)
    except ElementTree.ParseError as e:
        print('Could not parse the file. \n%s.' % e.msg.capitalize(),
              file=sys.stderr)
        # `sys.exit` rather than the `exit` helper, which only exists when the `site` module is loaded.
        sys.exit(1)
    except FileNotFoundError as e:
        # Errors go to stderr, same as the parse error above.
        print('Could not open the file. \n%s' % e, file=sys.stderr)
        sys.exit(2)

    delta = datetime.now() - start

    print('Document parsed in: %s' % delta)

    if not disable_summary:
        graph.print_summary()

    print('Score: %.2f' % graph.score())

    if not disable_graph_rendering:
        graph.render()

    if debug_mode:
        sep = '#' + '-' * 78 + '#'
        print(sep, file=sys.stderr)
        print('DEBUG OUTPUT', file=sys.stderr)
        print(sep, file=sys.stderr)
        print('Forward References:', graph.forward_references, file=sys.stderr)
        print('Backward References:',
              graph.backward_references,
              file=sys.stderr)
        print('A priori Concepts:', graph.a_priori_concepts, file=sys.stderr)
        print('Emerging Concepts:', graph.emerging_concepts, file=sys.stderr)
        print(sep, file=sys.stderr)
Beispiel #8
0
    def parse(self, filename: str, graph: ConceptGraph):
        """Parse a file and build up a graph structure from constituency parse trees.

        The file is read as XML; each `section` element under the root is expected to have a `title` and a `text`
        child. Sections titled 'references' (case-insensitive) are skipped. The lower-cased text is split into
        sentences, tokens tagged `RB` are dropped, and what remains is sent to `self.client` for annotation. Each
        (subject, verb, object) triple extracted from the returned parse trees is added to the graph with
        determiners removed, followed by the implicit references of the subject and object.

        :param filename: The file to parse.
        :param graph: The graph instance to add the nodes and edges to.
        """
        tree = ElementTree.parse(filename)
        root = tree.getroot()

        nlp_ = spacy.load('en')

        if self.resolve_coreferences:
            neuralcoref.add_to_pipe(nlp_)

            def nlp(text: str):
                # Run the pipeline twice: once to resolve coreferences, once on the resolved text.
                # noinspection PyProtectedMember
                return nlp_(nlp_(text)._.coref_resolved)
        else:
            nlp = nlp_

        for section in root.findall('section'):
            section_title = section.find('title').text
            section_title = section_title.lower()

            if section_title == 'references':
                continue

            # An empty <text/> element has `.text` set to None; treat it as a section without sentences.
            section_text = (section.find('text').text or '').lower()

            for sent in nltk.sent_tokenize(section_text):
                sent = nlp(sent.strip())
                sent = nlp(' '.join(tok.text for tok in sent
                                    if tok.tag_ not in {'RB'}))

                self.identify_emerging_concepts(sent, section_title, graph)

                # Nothing is left to annotate when the sentence consisted solely of dropped tokens.
                if not sent.text.strip():
                    continue

                annotation = self.client.annotate(sent.text)

                for sentence in annotation['sentences']:
                    parse_tree = nltk.Tree.fromstring(sentence['parse'])
                    # parse_tree.pretty_print()

                    for subject, verb, object_ in self.parse_the_parse_tree(
                            parse_tree):
                        subject_tags = [
                            token_tag for token_tag in nltk.pos_tag(subject)
                            if token_tag[1] not in {'DET', 'DT'}
                        ]
                        object_tags = [
                            token_tag for token_tag in nltk.pos_tag(object_)
                            if token_tag[1] not in {'DET', 'DT'}
                        ]

                        subject = ' '.join(
                            [token for token, tag in subject_tags])
                        object_ = ' '.join(
                            [token for token, tag in object_tags])

                        graph.add_relation(Node(subject),
                                           Relation(' '.join(verb)),
                                           Node(object_),
                                           Section(section_title))

                        self.add_implicit_references(subject_tags,
                                                     Section(section_title),
                                                     graph)
                        self.add_implicit_references(object_tags,
                                                     Section(section_title),
                                                     graph)
Beispiel #9
0
    def parse(self, filename: str, graph: ConceptGraph):
        """Build up a graph from the OpenIE triples found in an XML document.

        Each `section` element under the root is read for its `title` and `text` children; sections titled
        'references' (case-insensitive) are skipped. Every sentence of the lower-cased text has its `RB` tagged
        tokens removed and, unless nothing remains, is sent to `self.client` for annotation. Triples accepted by
        `filter_triple()` are added to the graph, together with the implicit references of their subject and
        object.

        :param filename: The file to parse.
        :param graph: The graph instance to add the nodes and edges to.
        """
        document = ElementTree.parse(filename).getroot()

        pipeline = spacy.load('en')

        if self.resolve_coreferences:
            neuralcoref.add_to_pipe(pipeline)

            def nlp(text: str):
                # noinspection PyProtectedMember
                return pipeline(pipeline(text)._.coref_resolved)
        else:
            nlp = pipeline

        def tag_without_determiners(phrase):
            return self.strip_determiners(
                nltk.pos_tag(nltk.word_tokenize(phrase)))

        for section in document.findall('section'):
            section_title = section.find('title').text.lower()

            if section_title == 'references':
                continue

            span = nlp(section.find('text').text.lower())
            # self.chunk(span)

            for sent in span.sents:
                kept_words = [
                    tok.text for tok in sent if tok.tag_ not in {'RB'}
                ]
                s = nlp(' '.join(kept_words))
                self.identify_emerging_concepts(s, section_title, graph)

                sentence_text = s.text.strip()

                if not sentence_text:
                    continue

                annotation = self.client.annotate(sentence_text)

                for sentence in annotation['sentences']:
                    for triple in sentence['openie']:
                        subject = triple['subject']
                        relation = triple['relation']
                        object_ = triple['object']

                        if not self.filter_triple(subject, relation, object_):
                            continue

                        # NOTE(review): the relation is added here with the raw strings and again below with
                        # determiners stripped - confirm that both additions are intended.
                        graph.add_relation(subject, relation, object_,
                                           section_title)

                        subject_tags = tag_without_determiners(subject)
                        relation_tags = tag_without_determiners(relation)
                        object_tags = tag_without_determiners(object_)

                        subject = ' '.join(
                            [word for word, _ in subject_tags])
                        relation = ' '.join(
                            [word for word, _ in relation_tags])
                        object_ = ' '.join([word for word, _ in object_tags])

                        graph.add_relation(Node(subject), Relation(relation),
                                           Node(object_),
                                           Section(section_title))

                        self.add_implicit_references(subject_tags,
                                                     Section(section_title),
                                                     graph)
                        self.add_implicit_references(object_tags,
                                                     Section(section_title),
                                                     graph)
Beispiel #10
0
    def parse(self, filename: str, graph: ConceptGraph):
        """Build up a graph from the noun phrases found in an XML document.

        Each `section` element under the root is read for its `title` and `text` children; sections titled
        'references' (case-insensitive) are skipped. For every sentence of the lower-cased text, the subject is
        added as a node, gerund phrases and emerging concepts are recorded, and each `NP` subtree produced by
        `self.chunker` is added as a node with an edge to the subject.

        :param filename: The file to parse.
        :param graph: The graph instance to add the nodes and edges to.
        """
        document = ElementTree.parse(filename).getroot()

        pipeline = spacy.load('en')

        if self.resolve_coreferences:
            neuralcoref.add_to_pipe(pipeline)

            def nlp(text: str):
                # noinspection PyProtectedMember
                return pipeline(pipeline(text)._.coref_resolved)
        else:
            nlp = pipeline

        def is_noun_phrase(subtree):
            return subtree.label() == 'NP'

        for section in document.findall('section'):
            section_title = section.find('title').text.lower()

            if section_title == 'references':
                continue

            span = nlp(section.find('text').text.lower())

            for sent in span.sents:
                # TODO: Use CoreNLP parse tree
                tagged_sentence = self.get_tagged(sent)
                parse_tree = self.chunker.parse(
                    nltk.Tree('S', children=tagged_sentence))

                # TODO: Does the sentence need to be noun chunked?
                # The sentence subject anchors every edge added below.
                subject = Node(self.get_subject(sent))

                graph.add_node(subject, section_title)
                self.add_gerund_phrase(subject, section_title, sent, graph)
                self.identify_emerging_concepts(sent, section_title, graph)

                # Every remaining noun phrase becomes a node linked to the subject.
                for noun_phrase in parse_tree.subtrees(is_noun_phrase):
                    leaves = noun_phrase.leaves()

                    # Drop a leading determiner.
                    if leaves[0][1] == 'DT':
                        leaves = leaves[1:]

                    entity = Node(' '.join(word for word, _ in leaves))

                    graph.add_node(entity, section_title)
                    graph.add_edge(subject, entity)

                    if self.implicit_references:
                        self.add_implicit_references(leaves, section_title,
                                                     graph)
Beispiel #11
0
from qcd.concept_graph import ConceptGraph
from qcd.xml_parser import XMLParser

if __name__ == '__main__':
    # Render the four variants of the sections-only bread graph: with and without reference marking, first
    # without and then with implicit references. The same parser instance is reused throughout.
    parser = XMLParser(annotate_edges=False, implicit_references=False)
    filename = 'bread.xml'

    for mark_references, output_name in (
            (False, 'bread_graph-sections_only-simple'),
            (True, 'bread_graph-sections_only-reference_marking')):
        graph = ConceptGraph(parser=parser, mark_references=mark_references)
        graph.parse(filename)
        graph.render(output_name, view=False)

    parser.implicit_references = True

    for mark_references, output_name in (
            (False, 'bread_graph-sections_only-implicit_references'),
            (True, 'bread_graph-sections_only')):
        graph = ConceptGraph(parser=parser, mark_references=mark_references)
        graph.parse(filename)
        graph.render(output_name, view=False)
from qcd.concept_graph import ConceptGraph
from qcd.xml_parser import XMLParser

if __name__ == '__main__':
    # Render the four variants of the concept-marked bread graph: with and without reference marking, first
    # without and then with implicit references. The same parser instance is reused throughout.
    filename = 'bread-concepts_marked.xml'

    parser = XMLParser(implicit_references=False)

    for mark_references, output_name in (
            (False, 'bread_graph-simple'),
            (True, 'bread_graph-reference_marking')):
        graph = ConceptGraph(parser, mark_references=mark_references)
        graph.parse(filename)
        graph.render(filename=output_name, view=False)

    parser.implicit_references = True

    for mark_references, output_name in (
            (False, 'bread_graph-implicit_references'),
            (True, 'bread_graph')):
        graph = ConceptGraph(parser, mark_references=mark_references)
        graph.parse(filename)
        graph.render(filename=output_name, view=False)