Esempio n. 1
0
    def read_items(self):
        """Load questions.tsv.gz into a dict of TextItems keyed by metadata id.

        Each row yields a '<id>_title' and '<id>_body' item, plus an
        '<id>_answer' item when an answer column is present. Text is
        ASCII-folded via unidecode and optionally lowercased.
        """
        items = dict()
        tsv_path = self.file_path('questions.tsv.gz')
        for row in self.read_tsv(tsv_path, is_gzip=True):
            question_id = row[0]
            title = unidecode(row[1])
            if len(row) <= 2:
                # No body column: fall back to the title text.
                self.logger.info('Question has no body! {}'.format(question_id))
                body = title
            else:
                body = unidecode(row[2])

            answer = None
            if len(row) > 3:
                answer = unidecode(row[3]).strip()
            duplicates = [] if len(row) <= 4 else row[4].split(',')

            if self.lowercased:
                title = title.lower()
                body = body.lower()
                if answer:
                    answer = answer.lower()

            title_item = TextItem(title)
            title_item.metadata['id'] = '{}_title'.format(question_id)
            title_item.metadata['duplicates'] = duplicates
            items[title_item.metadata['id']] = title_item

            body_item = TextItem(body)
            body_item.metadata['id'] = '{}_body'.format(question_id)
            items[body_item.metadata['id']] = body_item

            if answer:
                answer_item = TextItem(answer)
                answer_item.metadata['id'] = '{}_answer'.format(question_id)
                items[answer_item.metadata['id']] = answer_item

        return items
Esempio n. 2
0
def _get_text_item(text, id):
    """Wrap whitespace-tokenized *text* in a single-sentence TextItem tagged with *id*."""
    tokens = [Token(word) for word in text.split()]
    sentence = Sentence(' '.join(tok.text for tok in tokens), tokens)
    item = TextItem(sentence.text, [sentence])
    item.metadata['id'] = id
    return item
Esempio n. 3
0
    def read(self):
        """Build the full archive: passage answers from document_passages.json
        plus the train/dev/test splits, returned as an Archive."""
        answers = OrderedDict()
        passages_file = path.join(self.archive_path, 'document_passages.json')
        with open(passages_file, 'r') as f:
            documents = json.loads(f.read())
        for document_id, passages in documents.items():
            for passage_id, passage_text in passages.items():
                text = passage_text.lower() if self.lowercased else passage_text
                answer_ti = TextItem(text)
                answer_ti.metadata['id'] = 'answer-{}-{}'.format(document_id, passage_id)
                answers['{}_{}'.format(document_id, passage_id)] = answer_ti

        train = self.read_split('train', answers)
        valid = self.read_split('dev', answers)
        test = self.read_split('test', answers)

        all_qa = train.qa + valid.qa + test.qa
        questions = [qa.question for qa in all_qa]
        combined_answers = train.answers + valid.answers + test.answers

        return Archive(self.name, train, valid, [test], questions, combined_answers)
Esempio n. 4
0
    def read_split(self, name, answers):
        """Read one split TSV ('train'/'dev'/'test') and return its Data.

        For each question, the candidate pool is every passage of the
        question's document (shuffled); the ground truth is the subset
        listed in the relevant-passages column.
        """
        datapoints = []
        split_answers = []

        split_file = path.join(self.archive_path, '{}.tsv'.format(name))
        with open(split_file, 'r') as f:
            next(f)  # skip the header row
            for raw_line in f:
                qid, question, doc_id, _, relevant_passages = raw_line.strip().split('\t')

                question_ti = TextItem(question.lower() if self.lowercased else question)
                question_ti.metadata['id'] = 'question-{}'.format(qid)

                doc_prefix = '{}_'.format(doc_id)
                pool = [a for (k, a) in answers.items() if k.startswith(doc_prefix)]
                np.random.shuffle(pool)
                ground_truth = []
                for a_id in relevant_passages.split(','):
                    ground_truth.append(answers[doc_id + '_' + a_id])

                datapoints.append(QAPool(question_ti, pool, ground_truth))
                split_answers += pool

        return Data('wikipassageqa / {}'.format(name), datapoints, split_answers)
Esempio n. 5
0
def get_text_item(text):
    """Convert raw text into a tokenized, single-sentence TextItem.

    Lowercases the text first when the global config requests it.

    :param text: raw input string
    :return: a TextItem holding one Sentence of word-tokenized text
    """
    if config['data']['lowercased']:
        text = text.lower()
    tokens = [Token(word) for word in word_tokenize(text)]
    sentence = Sentence(' '.join(tok.text for tok in tokens), tokens)
    return TextItem(sentence.text, [sentence])
    def read_items(self):
        """Read questions.tsv.gz into a dict of TextItems keyed by metadata id.

        Each question row yields a '<id>_title' and a '<id>_body' item; rows
        without a body column reuse the title text as the body. Text is
        ASCII-folded via unidecode and optionally lowercased.

        :return: dict mapping '<id>_title' / '<id>_body' -> TextItem
        """
        questions_path = self.file_path('questions.tsv.gz')
        items = dict()
        for line in self.read_tsv(questions_path, is_gzip=True):
            # Renamed from `id` to avoid shadowing the builtin.
            question_id = line[0]
            title = unidecode(line[1])
            if len(line) > 2:
                body = unidecode(line[2])
            else:
                self.logger.info('Question has no body! {}'.format(question_id))
                body = title

            if self.lowercased:
                title = title.lower()
                body = body.lower()

            ti = TextItem(title)
            ti.metadata['id'] = '{}_title'.format(question_id)
            items[ti.metadata['id']] = ti
            ti = TextItem(body)
            ti.metadata['id'] = '{}_body'.format(question_id)
            items[ti.metadata['id']] = ti

        return items
Esempio n. 7
0
def _get_text_item(text, id):
    """Return a plain (untokenized) TextItem for *text* with its metadata id set to *id*."""
    item = TextItem(text)
    item.metadata['id'] = id
    return item
Esempio n. 8
0
def convert_input(tokens_all, vocab):
    """Keep only tokens present in *vocab* and wrap them in a single TextItem."""
    kept = [Token(tok) for tok in tokens_all if tok in vocab]
    joined_text = ' '.join(t.text for t in kept)
    return TextItem(joined_text, kept)
def _get_text_item(text, id):
    """Tokenize *text* on whitespace and return it as a one-sentence TextItem tagged *id*."""
    toks = [Token(piece) for piece in text.split()]
    joined = ' '.join(t.text for t in toks)
    sent = Sentence(joined, toks)
    ti = TextItem(sent.text, [sent])
    ti.metadata['id'] = id
    return ti