Example 1
0
 def gen_rel_graph(tagged):
     entity_fragments = entity_extraction.extract_entities(tagged)
     edges = entity_extraction.generate_edges(entity_fragments)
     non_parsed_graph = {
         'tokens': [t for t, _, _ in tagged],
         'edgeSet': edges
     }
     parsed_graph = relparser.classify_graph_relations(
         [non_parsed_graph])
     return parsed_graph
Example 2
0
def construct_relations_graph(input_text):
    """Tag *input_text*, extract entities and edges, and classify relations.

    :param input_text: raw text to analyse
    :return: the single parsed graph (dict) produced by the relation parser
    """
    logger.debug("Tagging: {}".format(input_text))
    tagged = get_tagged_from_server(input_text)
    logger.debug("Tagged: {}".format(tagged))
    logger.debug("Extract entities")
    fragments = entity_extraction.extract_entities(tagged)
    edge_set = entity_extraction.generate_edges(fragments)
    tokens = [token for token, _, _ in tagged]
    unparsed = {'tokens': tokens, 'edgeSet': edge_set}
    # The classifier takes a list of graphs; unwrap the single result.
    return relparser.classify_graph_relations([unparsed])[0]
Example 3
0
def test_load_relationparser():
    """Smoke-test RelParser: classify relations for a fixed tagged sentence.

    Expects the three edges of the "Star Wars VII ... J. J. Abrams" sentence
    to be labelled P800 (notable work), P136 (genre), P136.
    """
    relparser = parser.RelParser("model_ContextWeighted", models_foldes="../trainedmodels/")
    tagged = [('Star', 'O', 'NNP'), ('Wars', 'O', 'NNP'), ('VII', 'O', 'NNP'), ('is', 'O', 'VBZ'), ('an', 'O', 'DT'),
              ('American', 'MISC', 'JJ'), ('space', 'O', 'NN'), ('opera', 'O', 'NN'), ('epic', 'O', 'NN'),
              ('film', 'O', 'NN'), ('directed', 'O', 'VBN'), ('by', 'O', 'IN'), ('J.', 'PERSON', 'NNP'),
              ('J.', 'PERSON', 'NNP'), ('Abrams', 'PERSON', 'NNP'), ('.', 'O', '.')]
    entity_fragments = entity_extraction.extract_entities(tagged)
    edges = entity_extraction.generate_edges(entity_fragments)
    non_parsed_graph = {'tokens': [t for t, _, _ in tagged],
                        'edgeSet': edges}
    # classify_graph_relations takes a LIST of graphs and returns a list;
    # the original passed a bare dict, inconsistent with every other call
    # site (which uses [graph] and indexes [0]).
    graph_with_relations = relparser.classify_graph_relations([non_parsed_graph])[0]
    assert [e['kbID'] for e in graph_with_relations['edgeSet']] == ['P800', 'P136', 'P136']
def graphs_for_evaluation(graphs, graphs_tagged):
    """Expand each graph into one single-edge graph per edge, padded with
    negative edges, for relation-extraction evaluation.

    :param graphs: graphs with 'tokens' and 'edgeSet' keys
    :param graphs_tagged: tagged token triples, aligned index-wise with *graphs*
    :return: list of per-edge evaluation graphs
    """
    evaluation_set = []
    progress = tqdm.tqdm(graphs, ascii=True, ncols=100)
    for idx, graph in enumerate(progress):
        for edge in graph["edgeSet"]:
            candidate = {"edgeSet": [edge], "tokens": graph['tokens']}
            # Vertex candidates: every extracted entity span plus the
            # current edge's own endpoints.
            spans = [
                span
                for span, _ in entity_extraction.extract_entities(graphs_tagged[idx])
            ]
            spans += [edge['left'], edge['right']]
            candidate['vertexSet'] = [{'tokenpositions': span} for span in spans]
            candidate['edgeSet'].extend(get_negative_edges(candidate, limit=6))
            evaluation_set.append(candidate)
    return evaluation_set
    def __init__(self, file=None, string=None, url='http://localhost:9000'):
        """
        NLP main handler based on Stanford CoreNLP and emnlp2017-relation-extraction and Baidu API
        The input should be either string or file.
        :param file: path to a UTF-8 text file containing the passage
        :param string: the passage itself; exactly one of *file*/*string* must be given
        :param url: address of a running Stanford CoreNLP server
        :raises Exception: if neither *file* nor *string* is provided
        """
        # run Stanford CoreNLP server at localhost:9000
        # java -mx8g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 100000
        self.corenlp = StanfordCoreNLP(url)
        # Per-sentence tagging request: tokens with POS and NER, JSON output.
        self.corenlp_properties = {
            'annotators': 'tokenize, pos, ner',
            'outputFormat': 'json'
        }

        # get input sentences and the whole passage
        if file is not None:
            # Join file lines into one passage string (newlines kept inside lines).
            with codecs.open(file, 'r', encoding='utf-8') as infile:
                self.original_passage = ' '.join(
                    [line for line in infile.readlines()])
        elif string is not None:
            self.original_passage = string
        else:
            raise Exception("NLP main handler input required!")
        self.original_sentences = nltk.sent_tokenize(self.original_passage)

        # paraphrase the passage, then re-split it into sentences
        self.paraphraser = Paraphraser()
        self.paraphrased_passage = self.paraphraser.paraphrase_passage(
            string=self.original_passage)
        self.paraphrased_sentences = nltk.sent_tokenize(
            self.paraphrased_passage)

        # set keras model params: point the relation model at local GloVe
        # embeddings before constructing the parser (must happen first).
        keras_models.model_params[
            'wordembeddings'] = "resources/embeddings/glove/glove.6B.50d.txt"
        self.relparser = RelParser("model_ContextWeighted",
                                   models_foldes="trainedmodels/")

        # tag coreference by CoreNLP, for both original and paraphrased text
        self.original_tagged_coreference = self.corenlp.annotate(
            self.original_passage,
            properties={
                'timeout': '60000',
                'annotators': 'coref',
                'outputFormat': 'json'
            })
        self.paraphrased_tagged_coreference = self.corenlp.annotate(
            self.paraphrased_passage,
            properties={
                'timeout': '60000',
                'annotators': 'coref',
                'outputFormat': 'json'
            })

        # tag input sentences by CoreNLP; keep (token, NER, POS) triples only.
        # NOTE: each sentence was produced by sent_tokenize, so only the
        # first CoreNLP sentence of each response is used.
        self.taggeds_sentences = []
        for sentence in self.paraphrased_sentences:
            corenlp_output = \
                self.corenlp.annotate(sentence, properties=self.corenlp_properties).get("sentences", [])[0]
            # TODO: optimazation, DO NOT require for unnecessary data.
            self.taggeds_sentences.append([(t['originalText'], t['ner'],
                                            t['pos'])
                                           for t in corenlp_output['tokens']])

        # extract entity fragments, one list per tagged sentence
        self.entity_fragments = []
        for tagged_plain_sentence in self.taggeds_sentences:
            self.entity_fragments.append(
                entity_extraction.extract_entities(tagged_plain_sentence))

        # extract candidate edges from each sentence's entity fragments
        self.edges = []
        for entity_fragment in self.entity_fragments:
            self.edges.append(
                entity_extraction.generate_edges(entity_fragment))

        # construct non_parsed_graphs ({'tokens', 'edgeSet'}) for relation extraction
        self.non_parsed_graphs = []
        for tagged, edges in zip(self.taggeds_sentences, self.edges):
            self.non_parsed_graphs.append({
                'tokens': [t for t, _, _ in tagged],
                'edgeSet': edges
            })

        # do relation extraction
        # NOTE(review): other call sites pass a one-element LIST to
        # classify_graph_relations; here a bare dict is passed — confirm
        # the expected argument type.
        self.parsed_graphs = []
        for non_parsed_graph in self.non_parsed_graphs:
            self.parsed_graphs.append(
                self.relparser.classify_graph_relations(non_parsed_graph))

        # get valid relations
        self.valid_relations = self.get_valid_relations_from_parsed_graphs(
            self.parsed_graphs)

        # build NER complete set
        self.NER_tags = self.build_ner_tagged_set(self.taggeds_sentences)