def read_items(self):
    questions_path = self.file_path('questions.tsv.gz')
    items = dict()
    for line in self.read_tsv(questions_path, is_gzip=True):
        id = line[0]
        title = unidecode(line[1])
        if len(line) > 2:
            body = unidecode(line[2])
        else:
            self.logger.info('Question has no body! {}'.format(id))
            body = title  # fall back to the title when the body column is missing
        answer = unidecode(line[3]).strip() if len(line) > 3 else None
        duplicates = line[4].split(',') if len(line) > 4 else []
        if self.lowercased:
            title = title.lower()
            body = body.lower()
            answer = answer.lower() if answer else None

        # Title, body, and (optional) answer become separate TextItems,
        # keyed by '<id>_title', '<id>_body', and '<id>_answer'.
        ti = TextItem(title)
        ti.metadata['id'] = '{}_title'.format(id)
        ti.metadata['duplicates'] = duplicates
        items[ti.metadata['id']] = ti

        ti = TextItem(body)
        ti.metadata['id'] = '{}_body'.format(id)
        items[ti.metadata['id']] = ti

        if answer:
            ti = TextItem(answer)
            ti.metadata['id'] = '{}_answer'.format(id)
            items[ti.metadata['id']] = ti
    return items
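# Inferred layout of questions.tsv.gz, based only on the indexing above
# (the column names are assumptions, not taken from any dataset docs):
#   0: question id    1: title    2: body (optional)
#   3: answer (optional)    4: comma-separated duplicate ids (optional)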
def _get_text_item(text, id):
    question_tokens = [Token(t) for t in text.split()]
    question_sentence = Sentence(' '.join([t.text for t in question_tokens]), question_tokens)
    ti = TextItem(question_sentence.text, [question_sentence])
    ti.metadata['id'] = id
    return ti
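# The readers here only assume a small surface on Token, Sentence, and
# TextItem. A minimal sketch of those classes, reconstructed from how they
# are used in this file (the real definitions live elsewhere in the repo,
# so treat these fields as assumptions):
class Token:
    def __init__(self, text):
        self.text = text

class Sentence:
    def __init__(self, text, tokens):
        self.text = text
        self.tokens = tokens

class TextItem:
    def __init__(self, text, sentences=None):
        self.text = text
        self.sentences = sentences or []
        self.metadata = {}  # readers stash ids, duplicate lists, etc. here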
def read(self):
    # Index every passage by '<document_id>_<passage_id>' so that the splits
    # can build per-document candidate pools.
    answers = OrderedDict()
    with open(path.join(self.archive_path, 'document_passages.json'), 'r') as f:
        for document_id, passages in json.load(f).items():
            for passage_id, passage_text in passages.items():
                answer_ti = TextItem(passage_text.lower() if self.lowercased else passage_text)
                answer_ti.metadata['id'] = 'answer-{}-{}'.format(document_id, passage_id)
                answers['{}_{}'.format(document_id, passage_id)] = answer_ti

    train = self.read_split('train', answers)
    valid = self.read_split('dev', answers)
    test = self.read_split('test', answers)

    questions = [qa.question for qa in (train.qa + valid.qa + test.qa)]
    answers = train.answers + valid.answers + test.answers
    return Archive(self.name, train, valid, [test], questions, answers)
def read_split(self, name, answers):
    datapoints = []
    split_answers = []
    with open(path.join(self.archive_path, '{}.tsv'.format(name)), 'r') as f:
        next(f)  # skip the header row
        for line in f:
            qid, question, doc_id, _, relevant_passages = line.strip().split('\t')
            question_ti = TextItem(question.lower() if self.lowercased else question)
            question_ti.metadata['id'] = 'question-{}'.format(qid)
            # candidate pool = all passages of the question's document, shuffled
            pool = [a for (k, a) in answers.items() if k.startswith('{}_'.format(doc_id))]
            np.random.shuffle(pool)
            ground_truth = [answers[doc_id + '_' + a_id] for a_id in relevant_passages.split(',')]
            datapoints.append(QAPool(question_ti, pool, ground_truth))
            split_answers += pool
    return Data('wikipassageqa / {}'.format(name), datapoints, split_answers)
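# A sketch of the expected TSV shape, inferred from the unpacking above
# (the header names and sample row are illustrative, not from the dataset):
#
#   QID    Question               DocumentID    (unused)    RelevantPassages
#   42     Why is the sky blue?   7             ...         0,3
#
# For that row, the candidate pool is every passage keyed '7_<pid>' and the
# ground truth is [answers['7_0'], answers['7_3']].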
def get_text_item(text):
    """Converts a text into a tokenized text item

    :param text: raw input string
    :return: a TextItem wrapping a single tokenized Sentence
    """
    if config['data']['lowercased']:
        text = text.lower()
    question_tokens = [Token(t) for t in word_tokenize(text)]
    question_sentence = Sentence(' '.join([t.text for t in question_tokens]), question_tokens)
    return TextItem(question_sentence.text, [question_sentence])
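# get_text_item relies on two module-level names not defined in this
# snippet: word_tokenize (NLTK's tokenizer, which needs the 'punkt' models
# downloaded) and a global config dict. A hedged usage sketch, with an
# illustrative config shape:
#
#   from nltk import word_tokenize   # plus a one-time nltk.download('punkt')
#   config = {'data': {'lowercased': True}}
#   ti = get_text_item('How do I flush the DNS cache?')
#   # ti.text == 'how do i flush the dns cache ?'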
def read_items(self):
    questions_path = self.file_path('questions.tsv.gz')
    items = dict()
    for line in self.read_tsv(questions_path, is_gzip=True):
        id = line[0]
        title = unidecode(line[1])
        if len(line) > 2:
            body = unidecode(line[2])
        else:
            self.logger.info('Question has no body! {}'.format(id))
            body = title
        if self.lowercased:
            title = title.lower()
            body = body.lower()

        # text = '{} {}'.format(title, body)
        # tokens = [Token(t) for t in text.split()]
        # ti = TextItem(text, tokens)
        # ti.metadata['id'] = id
        # items[id] = ti

        ti = TextItem(title)
        ti.metadata['id'] = '{}_title'.format(id)
        items[ti.metadata['id']] = ti

        ti = TextItem(body)
        ti.metadata['id'] = '{}_body'.format(id)
        items[ti.metadata['id']] = ti
    return items
def _get_text_item(text, id):
    ti = TextItem(text)
    ti.metadata['id'] = id
    return ti
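# Unlike the earlier _get_text_item, this variant stores the raw text and
# leaves tokenization to downstream components. Usage sketch (the id value
# is illustrative):
#
#   ti = _get_text_item('Why is the sky blue?', '42_title')
#   ti.metadata['id']  # -> '42_title'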
def convert_input(tokens_all, vocab):
    # keep only in-vocabulary tokens and wrap them as Token objects
    tokens_all = [Token(token) for token in tokens_all if token in vocab]
    sent_ti = TextItem(' '.join(t.text for t in tokens_all), tokens_all)
    return sent_ti
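# Hedged usage sketch (the vocab contents are illustrative); note that
# out-of-vocabulary tokens are silently dropped rather than mapped to an
# <unk> symbol:
#
#   vocab = {'what', 'is', 'gravity'}
#   ti = convert_input(['what', 'is', 'quantum', 'gravity'], vocab)
#   ti.text  # -> 'what is gravity'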