Code example #1
File: readgen.py  Project: vaig-2019/core
class ReadGen(QuestionGenerator):
    '''Reading question generator.'''
    def __init__(self, crawler):
        #add more models required for initializing the generator here
        super(ReadGen, self).__init__()
        self.crawler = crawler
        self.dependparser = DependencyParser()
        self.pos = POS()
        self.ner = NER()
        self.sen_tokenizer = SentenceTokenizer()
        self.paraphrase = Paraphrase(self.pos, self.dependparser)

    def fit(self):
        '''Fit or reload prediction model.'''
        pass

    def predict(self, request_body):
        '''Predict a list of questions for the document.

        Args:
            - request_body: a dict containing either the key 'text' or 'url',
              plus any other information needed for the request

        Returns:
            - error: an error code if generation fails, otherwise None
            - questions: a list of question instances as defined in common/question.py
        '''
        if 'text' in request_body:
            text = request_body['text']
            return None, self.transform(text)
        elif 'url' in request_body:
            url = request_body['url']
            text = self.crawler.get_text(url)
            return None, self.transform(text)
        else:
            return Errors.INVALID_REQUEST, None

    def transform(self, text):
        sentences = self.sen_tokenizer.transform(text)
        print(sentences)
        docgraph = DocumentGraph(text)
        res = []
        for i in range(len(sentences)):
            entities = []

            print('process root sentence')
            print('get paraphrase...')
            paraphrases = self.paraphrase.transform(sentences[i])
            _res = {'generate_question': [], 'synonyms': []}
            print('generate questions...')
            for para in paraphrases['sentence_paraphrases']:
                print(para)
                _res['generate_question'].append({
                    'text': para,
                    'question': generate_question(para, self.dependparser,
                                                  self.pos, self.ner, entities)
                })
            # _res['synonyms'] = paraphrases['word_paraphrases']
            res.append(_res)

            print('process name entity sentences')
            print('replace all entity...')
            for e in docgraph.get_sentence_entities(i):
                if e.refer is None:
                    continue
                #follow the coreference chain down to its root referent
                while e.refer.refer is not None:
                    e.refer = e.refer.refer
                sentences[i] = sentences[i].replace(e.text, e.refer.text, 1)
                entities.append(e.refer.text)
            print('get paraphrase...')
            paraphrases = self.paraphrase.transform(sentences[i])
            _res = {'generate_question': [], 'synonyms': []}
            print('generate questions...')
            for para in paraphrases['sentence_paraphrases']:
                _res['generate_question'].append({
                    'text': para,
                    'question': generate_question(para, self.dependparser,
                                                  self.pos, self.ner, entities)
                })
            # _res['synonyms'] = paraphrases['word_paraphrases']
            res.append(_res)
        return res
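
The entity-replacement loop in transform() walks each entity's coreference chain down to its root referent before substituting the referent text into the sentence. A minimal, self-contained sketch of that step (using a throwaway Mention class purely for illustration, not the project's entity objects):

# Illustration only: Mention is a stand-in for the project's entity objects.
class Mention:
    def __init__(self, text, refer=None):
        self.text = text
        self.refer = refer

def resolve_and_replace(sentence, mentions):
    entities = []
    for m in mentions:
        if m.refer is None:
            continue
        # follow the coreference chain until the root referent is reached
        while m.refer.refer is not None:
            m.refer = m.refer.refer
        sentence = sentence.replace(m.text, m.refer.text, 1)
        entities.append(m.refer.text)
    return sentence, entities

alice = Mention('Alice')
she = Mention('She', refer=Mention('her', refer=alice))
print(resolve_and_replace('She moved to Paris.', [she]))
# -> ('Alice moved to Paris.', ['Alice'])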
Code example #2
class ReadingGenerator(object):
    def __init__(self):
        self.short_answer = ShortAnswerGenerator()
        self.multi_choice = MultiChoiceGenerator()
        self.yes_no = YesNoGenerator()
        self.pos = POS()
        self.ner = NER()
        self.dependparser = DependencyParser()
        self.sentence_tokenizer = SentenceTokenizer()
        self.word_tokenizer = WordTokenizer()
        self.coreference = Coreference()

    def preprocess_sentence(self, raw_sentences):
        """
            $2  -> 2 dollars
            2%  -> 2 percents
            £10 -> 10 euro
            n't -> not
        """
        res = []
        for sentence in raw_sentences:
            words = sentence.split(' ')
            for i in range(len(words)):
                # use raw-string patterns with a (\d+) group so group(1)
                # captures the whole number, not just its last digit
                if re.match(r'\$(\d+)', words[i]) is not None:
                    words[i] = re.match(r'\$(\d+)', words[i]).group(1) + ' dollar'
                if re.match(r'£(\d+)', words[i]) is not None:
                    words[i] = re.match(r'£(\d+)', words[i]).group(1) + ' euro'
                if re.match(r'(\d+)%', words[i]) is not None:
                    words[i] = re.match(r'(\d+)%', words[i]).group(1) + ' percent'
            res.append(' '.join(words))
        return res


    def prepare_ners(self, sentences):
        doc_ners = {
            'LOCATION': [],
            'PERSON': [],
            'ORGANIZATION': [],
            'COUNTRY': [],
            'TIME': [],
            'DATE': [],
            'NATIONALITY': [],
        }
        for sentence in sentences:
            raw_pos = self.pos.transform(sentence)
            if (len(raw_pos) == 0):
                continue
            raw_ner = self.ner.transform(sentence)
            words, xpos = get_pos_conll(raw_pos)
            ners = get_ner_conll(raw_ner)
            i = 0
            while i < len(words):
                if ners[i] in doc_ners.keys():
                    ner = ners[i]
                    word = ''
                    while (i < len(words)) and (ners[i] == ner):
                        word += ' ' + words[i]
                        i += 1
                    doc_ners[ner].append(word[1:])
                else:
                    i += 1
        return doc_ners


    def prepare_raw_sentences(self, raw_sentences):
        raw_sentences_data = []
        for i, sentence in enumerate(raw_sentences):
            # process nlp tasks
            raw_pos = self.pos.transform(sentence)
            if (len(raw_pos) == 0):
                continue
            raw_dp = self.dependparser.predict({'text' : sentence})
            raw_ner = self.ner.transform(sentence)
            words, xpos = get_pos_conll(raw_pos)
            ners = get_ner_conll(raw_ner)
            dependency_trees = get_dp_conll(raw_dp, words, xpos, ners)
            # add to sentences data
            if len(dependency_trees) == 0:
                continue
            sentence_data = dependency_trees[0]
            raw_sentences_data.append((i, sentence_data))
        return raw_sentences_data


    def prepare_paraphrase_sentences(self, raw_sentences):
        paraphrase_sentences = []
        for i, sentence in raw_sentences:
            words = sentence['words']
            xpos = sentence['xpos']
            heads = sentence['heads']
            labels = sentence['labels']
            ners = sentence['ners']
            for paraphrase_sentence in generate_paraphrase(words, xpos, heads, labels, ners):
                # paraphrase_sentences.append((paraphrase_sentence, i))
                # process nlp tasks
                raw_pos = self.pos.transform(paraphrase_sentence)
                if (len(raw_pos) == 0):
                    continue
                raw_dp = self.dependparser.predict({'text' : paraphrase_sentence})
                raw_ner = self.ner.transform(paraphrase_sentence)
                words, xpos = get_pos_conll(raw_pos)
                ners = get_ner_conll(raw_ner)
                dependency_trees = get_dp_conll(raw_dp, words, xpos, ners)
                # add to sentences data
                if len(dependency_trees) == 0:
                    continue
                sentence_data = dependency_trees[0]
                paraphrase_sentences.append((i, sentence_data))

        return paraphrase_sentences   


    def replace_coreference(self, document):
        """
            replace coref for PRP only
        """
        coref_sentences = []
        # get tokens
        sent_tokens = []
        sentences = self.sentence_tokenizer.transform(document)
        sentences = self.preprocess_sentence(sentences)
        for sentence in sentences:
            sent_tokens.append(self.word_tokenizer.transform(sentence))
        # get replace list
        rp_list = {}
        for i in range(len(sentences)):
            rp_list[str(i)] = {
                'rp_idxs'  : [],
                'rp_words' : []
            }
        _, corefers = self.coreference.transform(document)
        for coref in corefers:
            sent_id = coref.mention.sent_id
            if (coref.mention.end - coref.mention.start == 1) and (sent_tokens[sent_id][coref.mention.start].lower() in ['he', 'she', 'it', 'they']):
                target = " ".join(sent_tokens[coref.refer.sent_id][coref.refer.start:coref.refer.end])
                rp_list[str(sent_id)]['rp_idxs'].append(coref.mention.start)
                rp_list[str(sent_id)]['rp_words'].append(target)
        # gen coref sentences
        for i in range(len(sentences)):
            j = 0
            sentence = ""
            while j < len(sent_tokens[i]):
                if j in rp_list[str(i)]['rp_idxs']:
                    sentence += ' ' + rp_list[str(i)]['rp_words'][rp_list[str(i)]['rp_idxs'].index(j)]
                else:
                    sentence += ' ' + sent_tokens[i][j]
                j += 1
            sentence = sentence[1:]
            coref_sentences.append(sentence)
            # print(sentence)
        return coref_sentences
        
 
    def get_questions(self, document):
        """
            data: {
                raw_sentences = [],
                paraphrase_sentences = [],
                summerized_sentences = [],
            }
        """ 
        # prepare data
        data = {}
        raw_sentences = self.sentence_tokenizer.transform(document)
        coref_sentences = self.replace_coreference(document)
        data["ners"] = self.prepare_ners(coref_sentences)
        data["raw_sentences"] = self.prepare_raw_sentences(coref_sentences)
        data["paraphrase_sentences"] = self.prepare_paraphrase_sentences(data["raw_sentences"])
        # print(data["paraphrase_sentences"])
        # # generate questions
        questions = []
        questions += self.short_answer.get_questions(data)
        questions += self.multi_choice.get_questions(data)
        questions += self.yes_no.get_questions(data)
        # return 
        return questions
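
preprocess_sentence normalizes currency and percent tokens with regular expressions. A standalone sketch of that normalization, runnable on its own, with the digit group written as (\d+) so the whole number is captured:

import re

# Standalone sketch of the token normalization done in preprocess_sentence.
# (\d+) captures the whole number rather than only its last digit.
PATTERNS = [
    (re.compile(r'\$(\d+)'), r'\1 dollar'),
    (re.compile(r'£(\d+)'), r'\1 euro'),
    (re.compile(r'(\d+)%'), r'\1 percent'),
]

def normalize(sentence):
    words = sentence.split(' ')
    for i, word in enumerate(words):
        for pattern, template in PATTERNS:
            match = pattern.match(word)
            if match is not None:
                words[i] = match.expand(template)
    return ' '.join(words)

print(normalize('The fee is $200 or 15% of the £1000 deposit'))
# -> 'The fee is 200 dollar or 15 percent of the 1000 euro deposit'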
        
        
Code example #3
File: docgraph.py  Project: vaig-2019/core
class DocumentGraph(object):
    '''Knowledge-graph representation of a document, used for reading question generation.'''
    def __init__(self, text):
        self.sentoken = SentenceTokenizer()
        self.wordtoken = WordTokenizer()
        self.ner = NER()
        self.pos = POS()
        self.depend = DependencyParser()
        self.corefer = Coreference()

        #initializing...
        print('Initializing...')
        self.text = text
        self.sent_tokens, self.sents, self.sent_token_toid, self.tokenid_to_sentid = self._build_token_id_mapping(
            text)
        self.kg, self.entities = self._build_knowledge_graph()

    def _build_token_id_mapping(self, text):
        print('Building index mapping...')
        sents = self.sentoken.transform(text)
        sent_tokens = []
        sent_token_toid = defaultdict(dict)
        tokenid_to_sentid = {}
        id_count = -1
        for sent_id, s in enumerate(sents):
            tokens = self.wordtoken.transform(s)
            sent_tokens.append(tokens)
            for token_id, token in enumerate(tokens):
                id_count += 1
                sent_token_toid[sent_id][token_id] = id_count
                tokenid_to_sentid[id_count] = (sent_id, token_id)

        return sent_tokens, sents, sent_token_toid, tokenid_to_sentid

    def _get_token_id(self, sent_id, token_id):
        if token_id in self.sent_token_toid[sent_id]:
            return self.sent_token_toid[sent_id][token_id]
        else:
            raise ValueError('Sentence {} does not have token id {}'.format(
                sent_id, token_id))

    def _build_knowledge_graph(self):
        kg = KnowledgeGraph([
            'text', 'sent_id', 'token_id', 'pos', 'ner', 'begin_text',
            'end_text'
        ], [
            'string', 'short', 'short', 'string', 'string', 'string', 'string'
        ], ['dependency', 'connect_text', 'relation'],
                            ['string', 'string', 'string'])
        #begin_text and end_text hold text at the beginning or end of a sentence that does not belong to any entity or to their relations.
        #connect_text is the text connecting two entities

        print('======building coreference knowledge graph...')
        coref_kg = self._build_coreference_knowledge_graph()

        print('======Add sentences to knowledge graph...')
        sent_entities = {}
        for i, s in enumerate(self.sent_tokens):
            s_entities = self._add_sentence_to_knowledge_graph(i, s, kg)
            sent_entities[i] = s_entities

        print('======Combine DocumentKG and CoreferenceKG...')
        self._combine_dockg_and_corefkg(kg, coref_kg)
        return kg, sent_entities

    def _combine_dockg_and_corefkg(self, kg, coref_kg):
        for e in coref_kg.get_edges_by_property('relation', 'coreference'):
            mention = e[0]
            mention_sent_id = coref_kg.get_vertex_property_value(
                mention, 'psent')
            mention_token_id = coref_kg.get_vertex_property_value(
                mention, 'pstart')
            #mention_token_end_id = coref_kg.get_vertex_property_value(mention, 'pend')

            refer = e[1]
            refer_sent_id = coref_kg.get_vertex_property_value(refer, 'psent')
            refer_token_id = coref_kg.get_vertex_property_value(
                refer, 'pstart')
            #refer_token_end_id = coref_kg.get_vertex_property_value(mention, 'pend')

            v_start_id = self._get_token_id(mention_sent_id, mention_token_id)
            v_end_id = self._get_token_id(refer_sent_id, refer_token_id)
            kg.add_new_edge_with_properties(v_start_id, v_end_id,
                                            [('relation', 'coreference')])

    def _build_coreference_knowledge_graph(self):
        kg = KnowledgeGraph(
            ['text', 'psent', 'pstart', 'pend', 'position'],
            ['string', 'short', 'short', 'short', 'vector<short>'],
            ['relation'], ['string'])

        print('======Build coreference knowledge graph...')
        _, corefers = self.corefer.transform(self.text)

        for refer in corefers:
            refer_text = self._get_text_from_coref(refer.refer)
            mention_text = self._get_text_from_coref(refer.mention)
            print(mention_text + '->' + refer_text +
                  '-> add a vertex: text: ' + refer_text +
                  ', position: {}'.format(refer.refer.to_list()))
            v_refer = self._get_refer_by_position(*refer.refer.to_list(), kg)
            if len(v_refer) > 0:
                print('Same refer to', refer.refer.to_list())
                v_refer = v_refer[0]
            else:
                prefer = refer.refer
                v_refer = kg.add_new_vertex_with_properties([
                    ('text', refer_text), ('psent', prefer.sent_id),
                    ('pstart', prefer.start), ('pend', prefer.end),
                    ('position', prefer.to_list())
                ])
            mention = refer.mention
            v_mention = kg.add_new_vertex_with_properties([
                ('text', mention_text), ('psent', mention.sent_id),
                ('pstart', mention.start), ('pend', mention.end),
                ('position', mention.to_list())
            ])
            kg.add_new_edge_with_properties(v_mention, v_refer,
                                            [('relation', 'coreference')])
        return kg

    def _is_added_to_knowledge_graph(self, sent_id, start, end, kg):
        return len(
            kg.get_vertices_by_property('position',
                                        [[sent_id, start, end]])) > 0

    def _get_refer_by_position(self, sent_id, start, end, kg):
        return kg.get_vertices_by_property('position', [[sent_id, start, end]])

    def _get_text_from_coref(self, coref):
        return self._get_sub_text(coref.sent_id, coref.start, coref.end)

    def _get_sub_text(self, sent_id, start, end):
        #TODO recheck correctness and speed this up by caching some data
        return ' '.join(self.sent_tokens[sent_id][start:end])

    def _add_sentence_to_knowledge_graph(self, sent_id, tokens, kg):
        #TODO continue here; leverage NLP analyses to extend the knowledge graph
        #dcoref = corefers.to_dict_sent_coref()
        sent = ' '.join(tokens)
        print('=======Process: ', sent)
        skg = self._present_sentence_as_knowledge_graph(sent)
        skg, entities = self._build_entities_relations_graph_from_sentence_graph(
            skg, tokens)
        self._join_sentence_graph_to_doc_graph(sent_id, tokens, skg, kg)

        #map entities to new vertices
        sent_entities = []
        for entity in entities:
            v0 = self._get_token_id(sent_id, int(entity[0]))
            new_e = Entity([kg.get_vertex(v0)])
            for v in entity[1:]:
                new_id = self._get_token_id(sent_id, int(v))
                new_e.append(kg.get_vertex(new_id))
            sent_entities.append(new_e)
        print('#entities in sentence: ', len(sent_entities))
        return sent_entities

    def _join_sentence_graph_to_doc_graph(self, sent_id, tokens, skg, kg):

        #Copy vertices
        v_pnames = list(skg.vertex_pnames)
        v_pnames.append('sent_id')
        for token_id, token in enumerate(tokens):
            pvalues = skg.get_vetex_properties_values(skg.get_vertex(token_id),
                                                      skg.vertex_pnames)
            #v_pnames = skg.vertex_pnames
            #v_pnames.append('sent_id')
            pvalues.append(sent_id)
            new_v = kg.add_new_vertex_with_properties(zip(v_pnames, pvalues))
            v_id = self._get_token_id(sent_id, token_id)
            assert v_id == int(
                new_v), 'Index mismatch between document and graph'

        #copy edge
        e_pnames = list(skg.edge_pnames)
        for e in skg.get_edges():
            pvalues = skg.get_edge_properties_values(e, skg.edge_pnames)
            new_e = kg.add_new_edge_with_properties(int(e[0]), int(e[1]),
                                                    zip(e_pnames, pvalues))

        print('Doc graph infos: ', kg.get_description())

    def _present_sentence_as_knowledge_graph(self, sent):
        pos = self.pos.transform(sent)
        dp = self.depend.transform(sent)
        ner = self.ner.transform(sent)

        skg = KnowledgeGraph(
            ['text', 'token_id', 'pos', 'ner', 'begin_text', 'end_text'],
            ['string', 'short', 'string', 'string', 'string', 'string'],
            ['dependency', 'connect_text'], ['string', 'string'])
        for token_id, ptag in enumerate(pos[0]):
            assert len(ner[0]) == len(
                pos[0]), 'POS and NER using different tokens??'
            skg.add_new_vertex_with_properties([('text', ptag.word),
                                                ('token_id', token_id),
                                                ('pos', ptag.pos_tag),
                                                ('ner',
                                                 ner[0][token_id].ner_tag)])

        for d in dp[0]:
            skg.add_new_edge_with_properties(d.root_index, d.target_index,
                                             [('dependency', d.label)])

        return skg

    def _build_entities_relations_graph_from_sentence_graph(self, skg, tokens):
        entities = self._get_entities_from_sentence_graph(skg)

        entities = sorted(entities, key=lambda e: int(e[0]))
        print('Entities text: ', self._get_entities_text(skg, entities))

        #add connect_text edges: the text between adjacent entities found
        for pi, e in enumerate(entities[1:]):
            pe = entities[pi]
            start_id = int(pe[-1]) + 1
            end_id = int(e[0])
            ctext = ' '.join(tokens[start_id:end_id])
            if ctext != '':
                skg.add_new_edge_with_properties(int(pe[0]), int(e[0]),
                                                 [('connect_text', ctext)])
                print(pi, pi + 1, ctext)

        #add leftover text before the first / after the last entity into the graph
        first_entity_vertex = entities[0][0]
        if int(first_entity_vertex) > 0:
            begin_text = ' '.join(tokens[0:int(first_entity_vertex)])
            skg.add_vertex_property(first_entity_vertex, 'begin_text',
                                    begin_text)
            print('begin_text:', begin_text)
        last_entity_vertex = entities[-1][-1]
        if int(last_entity_vertex) < len(tokens) - 1:
            end_text = ' '.join(tokens[int(last_entity_vertex) + 1:])
            if end_text.strip() != '.':
                skg.add_vertex_property(last_entity_vertex, 'end_text',
                                        end_text)
                print('end_text:', end_text)

        return skg, entities

    def _get_entities_from_sentence_graph(self, skg):
        entities = []
        used_vertices = []
        #Add NER types DATE, TIME, MONEY, LOCATION as entities
        ners = skg.get_vertices_by_property(
            'ner', ['DATE', 'TIME', 'MONEY', 'LOCATION'])

        entities.extend(
            self._group_adjacent_vertices_same_property_to_entity(
                ners, skg, 'ner'))
        used_vertices.extend(ners)

        #compound as entities
        compound_edges = skg.get_edges_by_property('dependency', 'compound')
        for start_v, end_v in compound_edges:
            if start_v in used_vertices:
                continue
            #expanding to get full compound including the determiner
            entity_vertices = skg.expand_by_relation_types(
                start_v, ['compound', 'det'])
            entity_vertices = sorted(entity_vertices, key=lambda v: int(v))
            entities.append(Entity(entity_vertices))
            used_vertices.extend(entity_vertices)

        #NN, NNP, NNS, PRP as entities
        nouns = skg.get_vertices_by_property('pos',
                                             ['NN', 'NNP', 'NNS', 'PRP'])
        for v in nouns:
            if v in used_vertices:
                continue
            entity_vertices = skg.expand_by_relation_types(
                v, ['det', 'amod', 'nummod', 'nmod:of', 'nmod:poss'],
                infinite_deep=True)
            entity_vertices = sorted(entity_vertices, key=lambda v: int(v))
            entities.append(Entity(entity_vertices))
            used_vertices.extend(entity_vertices)

        return self._filter_sub_entities(entities)

    def _filter_sub_entities(self, entities):
        entities = sorted(entities, key=lambda e: int(e[-1]))
        ret_entities = []
        for i, e in enumerate(entities):
            is_sub_entity = False
            for check in entities[i + 1:]:
                if e != check and set(e) < set(check):
                    is_sub_entity = True
                    break
            if not is_sub_entity:
                ret_entities.append(e)
        return ret_entities

    def _get_entities_text(self, kg, entities):
        entities_text = []
        for e in entities:
            entities_text.append(' '.join(
                kg.get_vertices_property_value(e, 'text')))
        return entities_text

    def _group_adjacent_vertices_same_property_to_entity(
            self, vertices, kg, pname):
        if len(vertices) == 0:
            return []

        vertices = sorted(vertices, key=lambda v: int(v))
        entities = []

        last_entity = Entity([vertices[0]])
        entities.append(last_entity)

        last_pvalue = kg.get_vertex_property_value(vertices[0], pname)
        for v in vertices[1:]:
            v_pvalue = kg.get_vertex_property_value(v, pname)
            if v_pvalue == last_pvalue:
                last_entity.append(v)
            else:
                last_pvalue = v_pvalue
                last_entity = Entity([v])
                entities.append(last_entity)
        return entities

    def get_sentence_entities(self, sent_id):
        entities = self.entities[sent_id]
        for e in entities:
            e.text = self.get_entity_text(e)
            e.refer = self.get_entity_refer(e)
        return entities

    def get_entity_text(self, entity):
        return ' '.join(self.kg.get_vertices_property_value(entity, 'text'))

    def get_entity_refer(self, entity):
        for v in entity:
            for e in v.out_edges():
                relation = self.kg.get_edge_property_value(e, 'relation')
                if relation == 'coreference':
                    refer_entity = self.get_entity_at_token_id(int(e.target()))
                    if refer_entity is not None:
                        refer_entity.refer = self.get_entity_refer(
                            refer_entity)
                        refer_entity.text = self.get_entity_text(refer_entity)
                    return refer_entity
        return None

    def get_entity_at_token_id(self, token_id):
        sent_id, sent_token_id = self.tokenid_to_sentid[token_id]
        for e in self.entities[sent_id]:
            if int(e[-1]) >= token_id:
                return e
        return None
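
Both prepare_ners in example #2 and _group_adjacent_vertices_same_property_to_entity above merge runs of adjacent tokens that carry the same tag into one multi-word entity. A self-contained sketch of that grouping over plain (token, tag) lists:

# Self-contained sketch: group adjacent tokens with the same NER tag into
# multi-word entities, mirroring the grouping logic used above.
def group_entities(tokens, tags, keep=('PERSON', 'LOCATION', 'ORGANIZATION', 'DATE')):
    entities = []
    i = 0
    while i < len(tokens):
        if tags[i] in keep:
            tag = tags[i]
            words = []
            while i < len(tokens) and tags[i] == tag:
                words.append(tokens[i])
                i += 1
            entities.append((tag, ' '.join(words)))
        else:
            i += 1
    return entities

tokens = ['Barack', 'Obama', 'visited', 'New', 'York', 'in', '2015', '.']
tags = ['PERSON', 'PERSON', 'O', 'LOCATION', 'LOCATION', 'O', 'DATE', 'O']
print(group_entities(tokens, tags))
# -> [('PERSON', 'Barack Obama'), ('LOCATION', 'New York'), ('DATE', '2015')]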
Code example #4
class QuestionGenerator(object):
    '''Reading question generator.'''
    def __init__(self):
        #add more models required for initializing the generator here
        self.dependparser = DependencyParser()
        self.pos = POS()
        self.ner = NER()
        self.sen_tokenizer = SentenceTokenizer()
        self.paraphrase = Paraphrase(self.pos, self.dependparser)
        self.coreference = Coreference()

    def gen_QA(self, sentences):
        res = []
        for i in range(len(sentences)):
            entities = []
            print('process root sentence')
            print('get paraphrase...')
            # paraphrases = self.paraphrase.transform(sentences[i])
            # print('generate questions...')
            # for para in paraphrases['sentence_paraphrases']:
            for para in [sentences[i]]:
                res += generate_question_QA(para, self.dependparser, self.pos,
                                            self.ner, entities)
        res_question = ""
        res_answer = ""
        res_rank = 0
        for r in res:
            rank = r["rank"]
            question = r["question"]
            answer = r["answer"]
            if (rank > res_rank) or ((rank == res_rank) and
                                     (len(question) > len(res_question))):
                res_question = question
                res_rank = rank
                res_answer = answer
        return (res_question, res_answer)

    def gen_4FF(self, sentences):
        """
        choose word of type
        Noun: 
        Verb:
        Adj :
        """
        res = generate_question_4FF(sentences, self.dependparser, self.pos,
                                    self.ner)
        if res[0] is None:
            return res[1]
        elif res[1] is None:
            return res[0]
        return res[random.choice([0, 1])]

    def gen_FF(self, sentences):
        """
        choose word of type
        Noun: 
        Verb:
        Adj :
        """
        res = generate_question_FF(sentences, self.dependparser, self.pos,
                                   self.ner)
        idxs = []
        for i in range(len(res)):
            if res[i] is not None:
                idxs.append(i)
        if len(idxs) == 0:
            return None
        return res[random.choice(idxs)]

    def transform(self, text):
        """
        TODO
            process name entity sentences
            replace all entity
        """
        sentences = self.sen_tokenizer.transform(text)
        res = []
        print("generate QA question")
        QA_question = self.gen_QA(sentences)
        res.append(("QA", QA_question))
        print("generate 4FF question")
        FF4_question = self.gen_4FF(sentences)
        res.append(("FF4", FF4_question))
        print("generate FF question")
        FF_question = self.gen_FF(sentences)
        res.append(("FF", FF_question))
        return res
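
gen_QA keeps the candidate with the highest rank and breaks ties by the longer question text. The same selection, written as a small self-contained sketch with max() over a (rank, length) key:

# Self-contained sketch of the candidate selection in gen_QA: the highest
# rank wins, ties are broken by the longer question text (earlier candidate
# wins an exact tie, as in the loop above).
def pick_best(candidates):
    if not candidates:
        return ('', '')
    best = max(candidates, key=lambda c: (c['rank'], len(c['question'])))
    return (best['question'], best['answer'])

candidates = [
    {'rank': 1, 'question': 'Who moved to Paris?', 'answer': 'Alice'},
    {'rank': 2, 'question': 'When did Alice move to Paris?', 'answer': '2010'},
    {'rank': 2, 'question': 'Where did Alice move?', 'answer': 'Paris'},
]
print(pick_best(candidates))
# -> ('When did Alice move to Paris?', '2010')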