def preprocess(self, docs: List[Document], evidence, name='train'):
    out = []
    for doc in docs:
        for para_ix, para in enumerate(doc.paragraphs):
            for q in para.questions:
                if q.answer:
                    ans = q.answer.answer_spans
                else:
                    ans = np.zeros((0, 2), dtype=np.int32)

                if self.text_process:
                    text, ans, inv = self.text_process.encode_paragraph(
                        q.words, [flatten_iterable(para.text)],
                        para.paragraph_num == 0, ans, para.spans)
                    new_para = SquadParagraphWithAnswers(
                        text, ans, doc.doc_id, para_ix, para.original_text, inv)
                else:
                    new_para = SquadParagraphWithAnswers(
                        flatten_iterable(para.text), ans, doc.doc_id, para_ix,
                        para.original_text, para.spans)

                if name == "train":
                    out.append(WeightedMultiParagraphQuestion(
                        q.question_id, q.words, q.answer.answer_text, [new_para], q.weight))
                else:
                    out.append(MultiParagraphQuestion(
                        q.question_id, q.words, q.answer.answer_text, [new_para]))
    return out
def ranked_questions(self, docs: List[Document]) -> List[MultiParagraphQuestion]:
    out = []
    for doc in docs:
        scores = self.rank(
            flatten_iterable([q.words for q in x.questions] for x in doc.paragraphs),
            [x.text for x in doc.paragraphs])
        q_ix = 0
        for para_ix, para in enumerate(doc.paragraphs):
            for q in para.questions:
                para_scores = scores[q_ix]
                para_ranks = np.argsort(para_scores)
                selection = [i for i in para_ranks[:self.n_to_select]]

                if self.force_answer and para_ix not in selection:
                    selection[-1] = para_ix

                selected_paras = []
                for ix in selection:
                    # if ix == para_ix:
                    if ix == para_ix and q.answer:
                        ans = q.answer.answer_spans
                    else:
                        ans = np.zeros((0, 2), dtype=np.int32)
                    p = doc.paragraphs[ix]
                    if self.text_process:
                        text, ans, inv = self.text_process.encode_paragraph(
                            q.words, [flatten_iterable(p.text)],
                            p.paragraph_num == 0, ans, p.spans)
                        selected_paras.append(SquadParagraphWithAnswers(
                            text, ans, doc.doc_id, ix, p.original_text, inv))
                    else:
                        selected_paras.append(SquadParagraphWithAnswers(
                            flatten_iterable(p.text), ans, doc.doc_id, ix,
                            p.original_text, p.spans))

                out.append(MultiParagraphQuestion(
                    q.question_id, q.words, q.answer.answer_text, selected_paras))
                q_ix += 1
    return out
def show_web_paragraphs():
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    for q, d in points:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        doc = corpus.evidence.get_document(d.doc_id)
        doc = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.dists(q.question, doc)
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue
        print(" ".join(q.question))
        print(q.answer.all_answers)
        for i, (para, dist) in enumerate(ranked[0:2]):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, i, dist))
            if len(para.answer_spans) == 0:
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for i, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[i] = bcolors.ERROR + text[i] + bcolors.ENDC
            print(" ".join(text))
        input()
def extract_paragraph(text: List[List[List[str]]], start, end) -> List[List[str]]:
    out = []
    on_token = 0
    on_para = []
    for para in text:
        for sent in para:
            # Sanity check: the tokens collected so far should match how far
            # past `start` we have advanced
            expected_len = max(on_token - start, 0)
            if (sum(len(s) for s in out) + len(on_para)) != expected_len:
                raise ValueError()
            if on_token + len(sent) <= start:
                on_token += len(sent)
                continue
            if (on_token + len(sent)) >= end:
                on_para += sent[:end - on_token]
                out.append(on_para)
                if len(flatten_iterable(out)) != end - start:
                    raise ValueError(len(flatten_iterable(out)), end - start)
                return out
            if on_token + len(sent) < start:
                pass  # unreachable: the `<= start` case was already handled by the `continue` above
            on_para += sent
            on_token += len(sent)
        if len(on_para) > 0:
            out.append(on_para)
            on_para = []
    out.append(on_para)
    if len(flatten_iterable(out)) != end - start:
        raise ValueError(len(flatten_iterable(out)), end - start)
    return out
def split(self, doc: List[List[List[str]]]):
    out = []
    on_token = 0
    for para in doc:
        flattened_para = flatten_iterable(para)
        end = on_token + len(flattened_para)
        out.append(ExtractedParagraph([flattened_para], on_token, end))
        on_token = end
    return out
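# A minimal illustration (hypothetical data, not from any corpus) of the nested
# layout the splitters in this file operate on: a document is a list of
# paragraphs, each paragraph a list of sentences, and each sentence a list of
# word tokens, i.e. List[List[List[str]]].
demo_doc = [
    [["One", "fish", "two", "fish", "."], ["Red", "fish", "blue", "fish", "."]],
    [["Just", "one", "sentence", "."]],
]
# flatten_iterable(demo_doc[0]) yields the 10 tokens of the first paragraph,
# and flatten_iterable(flatten_iterable(demo_doc)) yields all 14 tokens.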
def test_splitter(splitter: DocumentSplitter, n_sample, n_answer_spans, seed=None):
    rng = np.random.RandomState(seed)
    corpus = TriviaQaEvidenceCorpusTxt()
    docs = sorted(corpus.list_documents())
    rng.shuffle(docs)
    max_tokens = splitter.max_tokens
    read_n = splitter.reads_first_n

    for doc in docs[:n_sample]:
        text = corpus.get_document(doc, read_n)

        # Build some random fake answer spans, recorded as document-level token offsets
        fake_answers = []
        offset = 0
        for para in text:
            flattened = flatten_iterable(para)
            fake_answer_starts = np.random.choice(
                len(flattened), min(len(flattened) // 2, np.random.randint(5)), replace=False)
            max_answer_lens = np.minimum(len(flattened) - fake_answer_starts, 30)
            fake_answer_ends = fake_answer_starts + np.floor(
                rng.uniform() * max_answer_lens).astype(np.int32)
            fake_answers.append(np.concatenate([
                np.expand_dims(fake_answer_starts, 1),
                np.expand_dims(fake_answer_ends, 1)
            ], axis=1) + offset)
            offset += len(flattened)

        fake_answers = np.concatenate(fake_answers, axis=0)
        flattened = flatten_iterable(flatten_iterable(text))
        answer_strs = set(tuple(flattened[s:e + 1]) for s, e in fake_answers)

        paragraphs = splitter.split_annotated(text, fake_answers)

        for para in paragraphs:
            text = flatten_iterable(para.text)
            if max_tokens is not None and len(text) > max_tokens:
                raise ValueError("Paragraph has len %d, but max tokens was %d" %
                                 (len(text), max_tokens))
            start, end = para.start, para.end
            if text != flattened[start:end]:
                raise ValueError("Paragraph is missing text, given bounds were %d-%d" %
                                 (start, end))
            for s, e in para.answer_spans:
                if tuple(text[s:e + 1]) not in answer_strs:
                    print(s, e)
                    raise ValueError("Incorrect answer for paragraph %d-%d (%s)" %
                                     (start, end, " ".join(text[s:e + 1])))
def run_evaluators(self, sess: tf.Session, dataset: Dataset, name, n_sample=None,
                   feed_dict=None) -> Evaluation:
    all_tensors_needed = list(set(flatten_iterable(x.values() for x in self.tensors_needed)))

    tensors = {x: [] for x in all_tensors_needed}

    if n_sample is None:
        batches, n_batches = dataset.get_epoch(), len(dataset)
    else:
        batches, n_batches = dataset.get_samples(n_sample)

    data_used = []
    for batch in tqdm(batches, total=n_batches, desc=name, ncols=80):
        feed_dict = self.model.encode(batch, is_train=False)
        output = sess.run(all_tensors_needed, feed_dict=feed_dict)
        data_used += batch
        for i in range(len(all_tensors_needed)):
            tensors[all_tensors_needed[i]].append(output[i])

    # flatten the input
    for k in all_tensors_needed:
        v = tensors[k]
        if len(k.shape) == 0:
            v = np.array(v)  # List of scalars
        elif any(x is None for x in k.shape.as_list()):
            # Variable sized tensors, so convert to flat python-list
            v = flatten_iterable(v)
        else:
            v = np.concatenate(v, axis=0)  # concat along the batch dim
        tensors[k] = v

    percent_filtered = dataset.percent_filtered()
    if percent_filtered is None:
        true_len = len(data_used)
    else:
        true_len = len(data_used) / (1 - percent_filtered)

    combined = None
    for ev, needed in zip(self.evaluators, self.tensors_needed):
        args = {k: tensors[v] for k, v in needed.items()}
        evaluation = ev.evaluate(data_used, true_len, **args)
        if evaluation is None:
            raise ValueError(ev)
        if combined is None:
            combined = evaluation
        else:
            combined.add(evaluation)

    return combined
def split(self, doc: List[List[List[str]]]) -> List[ExtractedParagraph]:
    words = flatten_iterable(flatten_iterable(doc))
    on_word = 0
    out = []
    while True:
        # Pick a random chunk of 1-6 tokens
        end_word = on_word + np.random.randint(1, 7)
        if end_word > len(words):
            out.append(ExtractedParagraph([words[on_word:]], on_word, len(words)))
            return out
        out.append(ExtractedParagraph([words[on_word:end_word]], on_word, end_word))
        on_word = end_word
def read_input_data(model):
    data = []
    vocab = set()
    tokenizer = NltkAndPunctTokenizer()
    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    with open(OPTS.input_file) as f:
        for i, line in enumerate(f):
            try:
                document_raw, question_raw = line.strip().split('\t')
            except ValueError as e:
                print(line.strip())
                print('Error at line %d' % i)
                raise e
            document = re.split(r"\s*\n\s*", document_raw)
            question = tokenizer.tokenize_paragraph_flat(question_raw)
            doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
            split_doc = splitter.split(doc_toks)
            context = selector.prune(question, split_doc)
            if model.preprocessor is not None:
                context = [model.preprocessor.encode_text(question, x) for x in context]
            else:
                context = [flatten_iterable(x.text) for x in context]
            vocab.update(question)
            for txt in context:
                vocab.update(txt)
            ex = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                  for i, x in enumerate(context)]
            data.append((document_raw, question_raw, context, ex))
    return data, vocab
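# Hedged sketch of the input format read_input_data expects, inferred from the
# parsing above: OPTS.input_file is a TSV with one example per line, the
# document text first and the question second. The example contents below are
# purely illustrative.
#
#   The Eiffel Tower is a wrought-iron tower in Paris. It opened in 1889.\tWhen did the Eiffel Tower open?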
def main():
    data = SquadCorpus()

    string_f1 = 0
    mapped_string_f1 = 0
    docs = data.get_train()
    n_questions = 0

    for doc in tqdm(docs):
        for para in doc.paragraphs:
            words = flatten_iterable(para.text)
            for question in para.questions:
                n_questions += 1
                span_answer = question.answer[0]
                span_str = " ".join(
                    words[span_answer.para_word_start:span_answer.para_word_end + 1])
                raw_answer = span_answer.text
                mapped_str = para.get_original_text(
                    span_answer.para_word_start, span_answer.para_word_end)

                string_f1 += f1_score(raw_answer, span_str)
                mapped_string_f1 += f1_score(raw_answer, mapped_str)

    print(string_f1 / n_questions)
    print(mapped_string_f1 / n_questions)
def main():
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords()

    splitter = MergeParagraphs(400)
    selector = TopTfIdf(stop, 4)

    print("Loading data..")
    train = data.get_train()
    print("Start")
    for q in train:
        for doc in q.all_docs:
            if len(doc.answer_spans) > 3:
                text = splitter.split_annotated(
                    data.evidence.get_document(doc.doc_id), doc.answer_spans)
                text = selector.prune(q.question, text)
                for para in text:
                    if len(para.answer_spans) > 3:
                        print(q.question)
                        tokens = flatten_iterable(para.text)
                        for s, e in para.answer_spans:
                            tokens[s] = "{{{" + tokens[s]
                            tokens[e] = tokens[e] + "}}}"
                        print(" ".join(tokens))
                        input()
def any_found(self, para):
    # Normalize the paragraph
    words = [w.lower().strip(self.strip) for w in flatten_iterable(para)]
    occurances = []
    for answer_ix, answer in enumerate(self.answer_tokens):
        # Locations where the first word occurs
        word_starts = [i for i, w in enumerate(words) if answer[0] == w]
        n_tokens = len(answer)

        # Advance forward until we find all the words, skipping over articles
        for start in word_starts:
            end = start + 1
            ans_token = 1
            while ans_token < n_tokens and end < len(words):
                next = words[end]
                if answer[ans_token] == next:
                    ans_token += 1
                    end += 1
                elif next in self.skip:
                    end += 1
                else:
                    break
            if n_tokens == ans_token:
                occurances.append((start, end))
    return list(set(occurances))
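# Hedged worked example of the skip-word matching above (hypothetical data):
# with self.answer_tokens = [["battle", "of", "hastings"]] and self.skip
# containing "the", the paragraph tokens
#   [..., "battle", "of", "the", "hastings", ...]
# still match, because "the" is consumed by the `next in self.skip` branch.
# The recorded span is (start, end) over normalized word positions with `end`
# exclusive, so it covers four tokens here rather than three.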
def build_tokenized_corpus(input_root, tokenizer, output_dir, skip_dirs=False,
                           n_processes=1, wiki_only=False):
    if not exists(output_dir):
        makedirs(output_dir)

    all_files = _gather_files(input_root, output_dir, skip_dirs, wiki_only)

    if n_processes == 1:
        voc = build_tokenized_files(tqdm(all_files, ncols=80), input_root, output_dir, tokenizer)
    else:
        voc = set()
        from multiprocessing import Pool
        with Pool(n_processes) as pool:
            chunks = split(all_files, n_processes)
            chunks = flatten_iterable(group(c, 500) for c in chunks)
            pbar = tqdm(total=len(chunks), ncols=80)
            for v in pool.imap_unordered(
                    _build_tokenized_files_t,
                    [[c, input_root, output_dir, tokenizer] for c in chunks]):
                voc.update(v)
                pbar.update(1)
            pbar.close()

    voc_file = join(output_dir, "vocab.txt")
    with open(voc_file, "w") as f:
        for word in sorted(voc):
            f.write(word)
            f.write("\n")
def print_questions(question, answers, context, answer_span):
    print(" ".join(question))
    print(answers)
    context = flatten_iterable(context)
    for s, e in answer_span:
        context[s] = "{{{" + context[s]
        context[e] = context[e] + "}}}"
    print(" ".join(context))
def post_split_tokens(tokens: List[str]) -> List[str]:
    """
    Apply a small amount of extra splitting to the given tokens, in particular
    to avoid UNK tokens caused by contractions, quotation marks, or other forms
    of punctuation. I haven't really done tests to see if/how much difference
    this makes, but it does avoid some common UNKs I noticed in SQuAD/TriviaQA
    """
    return flatten_iterable([x for x in extra_split_chars_re.split(token) if x != ""]
                            for token in tokens)
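# Hedged sketch of the effect of post_split_tokens. The real pattern lives in
# extra_split_chars_re elsewhere in this module; _demo_split_re below is a
# hypothetical stand-in covering only hyphens and slashes, for illustration.
import re

_demo_split_re = re.compile(r"([\-/])")

def _demo_post_split(tokens):
    # Split on the captured characters and drop the empty strings re.split leaves behind
    return [x for tok in tokens for x in _demo_split_re.split(tok) if x != ""]

# _demo_post_split(["state-of-the-art", "and/or", "plain"])
# -> ['state', '-', 'of', '-', 'the', '-', 'art', 'and', '/', 'or', 'plain']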
def preprocess(self, questions: List[TriviaQaQuestion], evidence) -> object:
    splitter = self.splitter
    para_filter = self.ranker

    with_paragraphs = []
    for q in questions:
        paras = []
        for doc in q.all_docs:
            if self.require_an_answer and len(doc.answer_spans) == 0:
                continue
            text = evidence.get_document(doc.doc_id, splitter.reads_first_n)
            split = splitter.split_annotated(text, doc.answer_spans)
            paras.extend([DocParagraphWithAnswers(x.text, x.start, x.end, x.answer_spans, doc.doc_id)
                          for x in split])

        if para_filter is not None:
            paras = para_filter.prune(q.question, paras)

        if len(paras) == 0:
            continue
        if self.require_an_answer:
            if all(len(x.answer_spans) == 0 for x in paras):
                continue

        if self.text_preprocess is not None:
            prepped = [self.text_preprocess.encode_extracted_paragraph(q.question, p)
                       for p in paras]
            if self.require_an_answer:
                if all(len(x.answer_spans) == 0 for x in prepped):
                    continue
            doc_paras = []
            for i, (preprocessed, para) in enumerate(zip(prepped, paras)):
                doc_paras.append(DocumentParagraph(para.doc_id, para.start, para.end,
                                                   i, preprocessed.answer_spans,
                                                   preprocessed.text))
            with_paragraphs.append(MultiParagraphQuestion(q.question_id, q.question,
                                                          q.answer.all_answers, doc_paras))
        else:
            doc_paras = [DocumentParagraph(x.doc_id, x.start, x.end, i,
                                           x.answer_spans, flatten_iterable(x.text))
                         for i, x in enumerate(paras)]
            with_paragraphs.append(MultiParagraphQuestion(q.question_id, q.question,
                                                          q.answer.all_answers, doc_paras))

    return FilteredData(with_paragraphs, len(questions))
def print_paragraph(question: TriviaQaQuestion, para: ExtractedParagraphWithAnswers):
    print(" ".join(question.question))
    print(question.answer.all_answers)
    context = flatten_iterable(para.text)
    for s, e in para.answer_spans:
        context[s] = "{{{" + context[s]
        context[e] = context[e] + "}}}"
    print(" ".join(context))
def find_answer(documents, raw_question):
    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]
    global best_spans, conf

    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()

    question = tokenizer.tokenize_paragraph_flat(raw_question)
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    context = [flatten_iterable(x.text) for x in context]
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    encoded = model.encode(data, is_train=False)
    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)

    best_para = np.argmax(confid)
    ans = " ".join(context[best_para][spans[best_para][0]:spans[best_para][1] + 1])
    confidence = confid[best_para]
    return ans, confidence
def test_segment_log_sum_exp(self):
    sess = self.sess
    with sess.as_default():
        for i in range(10):
            groups = []
            for group_id in range(10):
                group = []
                for _ in range(np.random.randint(1, 5)):
                    group.append(np.random.normal(0, 2, 10))
                groups.append(group)

            flat_groups = np.stack(flatten_iterable(groups), axis=0)
            segments = np.array(flatten_iterable([ix] * len(g) for ix, g in enumerate(groups)))
            actual = sess.run(segment_logsumexp(flat_groups, segments))
            expected = [np.log(np.sum(np.exp(np.concatenate(g, axis=0)))) for g in groups]
            self.assertTrue(np.allclose(actual, expected))
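# A minimal numpy sketch of the property the test above checks (illustrative
# only, not the TF implementation): for each segment id, logsumexp is taken
# over every value in every row carrying that id.
import numpy as np

vals = np.array([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]])
seg_ids = np.array([0, 0, 1])
per_segment = [np.log(np.exp(vals[seg_ids == s]).sum()) for s in np.unique(seg_ids)]
# per_segment[0] aggregates rows 0 and 1; per_segment[1] aggregates row 2 only.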
def compute_answer_spans_par(questions: List[TriviaQaQuestion], corpus, tokenizer,
                             detector, n_processes: int):
    if n_processes == 1:
        word_tokenize = tokenizer.tokenize_paragraph_flat
        compute_answer_spans(questions, corpus, word_tokenize, detector)
        return questions
    from multiprocessing import Pool
    with Pool(n_processes) as p:
        chunks = split(questions, n_processes)
        questions = flatten_iterable(p.starmap(
            _compute_answer_spans_chunk,
            [[c, corpus, tokenizer, detector] for c in chunks]))
        return questions
def test_split_inv(self):
    paras = [
        "One fish two fish. Red fish blue fish",
        "Just one sentence",
        "How will an overhead score? The satisfactory juice returns against an inviting protein. "
        "How can a rat expand? The subway fishes throughout a struggle. The guaranteed herd pictures an "
        "episode into the accustomed damned. The garbage reigns beside the component!",
    ]
    tok = NltkAndPunctTokenizer()
    tokenized = [tok.tokenize_with_inverse(x) for x in paras]
    inv_split = RandomSplitter().split_inverse(tokenized)
    for para in inv_split:
        self.assertTrue(flatten_iterable(para.text) ==
                        [para.original_text[s:e] for s, e in para.spans])
def parse_race_data(source, name, tokenizer, use_tqdm=True) -> List[Document]:
    with open(source, 'r') as f:
        source_data = json.load(f)

    if use_tqdm:
        iter_files = tqdm(source_data['data'], ncols=80)
    else:
        iter_files = source_data['data']

    mult_options_dict = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    mult_anwsers_array = []

    for article_ix, article in enumerate(iter_files):
        article_ix = "%s-%d" % (name, article_ix)
        paragraphs = []
        for para_ix, para in enumerate(article['paragraphs']):
            questions = []
            # pdb.set_trace()
            context = para['context']
            tokenized = tokenizer.tokenize_with_inverse(context)
            # list of sentences + mapping from words -> original text index
            text, text_spans = tokenized.text, tokenized.spans
            flat_text = flatten_iterable(text)
            n_words = sum(len(sentence) for sentence in text)

            for question_ix, question in enumerate(para['qas']):
                # There are actually some multi-sentence questions, so we should have used
                # tokenizer.tokenize_paragraph_flat here, which would have produced slightly
                # better results in a few cases. However all the results we report were
                # done using `tokenize_sentence`, so I am just going to leave it this way
                question_text = tokenizer.tokenize_sentence(question['question'])
                # vz
                # pdb.set_trace()
                choices_text = [tokenizer.tokenize_sentence(x) for x in question['choices']]
                mult_answer = question['answer']
                mult_answer_ix = mult_options_dict[mult_answer]
                # old calc of spans from 'build_squad_dataset'
                mult_anwsers_array.append(mult_answer_ix)
                questions.append(Question(question['id'], question_text, mult_answer_ix, choices_text))

            paragraphs.append(Paragraph(text, questions, article_ix, para_ix, context, mult_anwsers_array))
            # vz: we need to add choices here as well (like questions or similar)

        yield Document(article_ix, article["title"], paragraphs)
def get_document(self, doc_id, n_tokens=None, flat=False):
    if self.file_id_map is None:
        file_id = doc_id
    else:
        file_id = self.file_id_map.get(doc_id)

    if file_id is None:
        return None

    file_id = join(self.directory, file_id + ".txt")
    if not exists(file_id):
        return None

    with open(file_id, "r") as f:
        if n_tokens is None:
            text = f.read()
            if flat:
                return [x for x in self._split_all.split(text) if len(x) > 0]
            else:
                paragraphs = []
                for para in self._split_para.split(text):
                    paragraphs.append([sent.split(" ") for sent in para.split("\n")])
                return paragraphs
        else:
            paragraphs = []
            paragraph = []
            cur_tokens = 0
            for line in f:
                if line == "\n":
                    if not flat and len(paragraph) > 0:
                        paragraphs.append(paragraph)
                        paragraph = []
                else:
                    sent = line.split(" ")
                    sent[-1] = sent[-1].rstrip()
                    if len(sent) + cur_tokens > n_tokens:
                        if n_tokens != cur_tokens:
                            paragraph.append(sent[:n_tokens - cur_tokens])
                        break
                    else:
                        paragraph.append(sent)
                        cur_tokens += len(sent)
            if flat:
                return flatten_iterable(paragraph)
            else:
                if len(paragraph) > 0:
                    paragraphs.append(paragraph)
                return paragraphs
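# The on-disk layout get_document assumes, inferred from the parsing above
# (example contents are illustrative): one sentence per line with tokens
# separated by single spaces, and a blank line between paragraphs, e.g.
#
#   The quick brown fox jumped .
#   It landed safely .
#
#   A second paragraph starts here .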
def contains_question_word():
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords(punctuation=True).words
    doc_filter = ContainsQuestionWord(NltkPlusStopWords(punctuation=True))
    splits = MergeParagraphs(400)
    # splits = Truncate(400)
    questions = data.get_dev()
    pairs = flatten_iterable([(q, doc) for doc in q.all_docs] for q in questions)
    pairs.sort(key=lambda x: (x[0].question_id, x[1].doc_id))
    np.random.RandomState(0).shuffle(questions)

    has_token = 0
    total = 0
    used = Counter()

    for q, doc in tqdm(pairs[:1000]):
        text = data.evidence.get_document(doc.doc_id, splits.reads_first_n)
        q_tokens = set(x.lower() for x in q.question)
        q_tokens -= stop
        for para in splits.split_annotated(text, doc.answer_spans):
            # if para.start == 0:
            #     continue
            if len(para.answer_spans) == 0:
                continue
            if any(x.lower() in q_tokens for x in flatten_iterable(para.text)):
                has_token += 1
                for x in flatten_iterable(para.text):
                    if x in q_tokens:
                        used[x] += 1
            # else:
            #     print_questions(q.question, q.answer.all_answers, para.text, para.answer_spans)
            #     input()
            total += 1

    for k, v in used.most_common(200):
        print("%s: %d" % (k, v))
    print(has_token / total)
def split(self, doc: List[List[List[str]]]):
    all_paragraphs = []
    on_doc_token = 0   # the word in the document the current paragraph starts at
    on_paragraph = []  # text we have collected for the current paragraph
    cur_tokens = 0     # number of tokens in the current paragraph

    word_ix = 0
    for para in doc:
        para = flatten_iterable(para)
        n_words = len(para)
        if self.top_n is not None and (word_ix + n_words) > self.top_n:
            if word_ix == self.top_n:
                break
            para = para[:self.top_n - word_ix]
            n_words = self.top_n - word_ix

        start_token = word_ix
        end_token = start_token + n_words
        word_ix = end_token

        if cur_tokens + n_words > self.max_tokens:
            if cur_tokens != 0:
                # end the current paragraph
                all_paragraphs.append(ExtractedParagraph(on_paragraph, on_doc_token, start_token))
                on_paragraph = []
                cur_tokens = 0

            if n_words >= self.max_tokens:
                # either truncate the given paragraph, or begin a new paragraph
                text = para[:self.max_tokens]
                all_paragraphs.append(ExtractedParagraph([text], start_token,
                                                         start_token + self.max_tokens))
                on_doc_token = end_token
            else:
                on_doc_token = start_token
                on_paragraph.append(para)
                cur_tokens = n_words
        else:
            on_paragraph.append(para)
            cur_tokens += n_words

    if len(on_paragraph) > 0:
        all_paragraphs.append(ExtractedParagraph(on_paragraph, on_doc_token, word_ix))

    return all_paragraphs
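# Hedged worked example of the merging behaviour above (hypothetical sizes):
# with max_tokens=400 and source paragraphs of 150, 150 and 200 tokens, the
# first two are merged into one ExtractedParagraph covering tokens 0-300, and
# the third starts a new one covering tokens 300-500, since adding it to the
# first group would exceed 400 tokens.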
def encode(self, batch: List[ContextAndQuestion], is_train: bool, cached_doc=None):
    if len(batch) > self.max_batch_size:
        raise ValueError("The model can only use a batch <= %d, but got %d" %
                         (self.max_batch_size, len(batch)))
    data = self.encoder.encode(batch, is_train)
    data[self._question_char_ids_placeholder] = self._batcher.batch_sentences(
        [q.question for q in batch])
    data[self._is_train_placeholder] = is_train

    if cached_doc is not None:
        data[self._cached_doc_placeholder] = cached_doc
        context_word_dim = cached_doc.shape[1]
    else:
        data[self._cached_doc_placeholder] = np.zeros((1, 1, self.document_embedding_dim))
        context_word_dim = data[self.encoder.context_words].shape[1]

    if not self.per_sentence:
        data[self._context_char_ids_placeholder] = \
            self._batcher.batch_sentences([x.get_context() for x in batch])
    else:
        data[self._context_char_ids_placeholder] = \
            self._batcher.batch_sentences(flatten_iterable([x.sentences for x in batch]))

        # Compute indices where context_sentence_ixs[sentence#, k, sentence_word#] = (batch#, k, batch_word#)
        # for each word. We use this to map the tokens built for the sentences back to
        # the format where sentences are flattened for each batch
        context_sentence_ixs = np.zeros((len(batch), 3, context_word_dim, 3), dtype=np.int32)
        total_sent_ix = 0
        for ix, point in enumerate(batch):
            word_ix = 0
            for sent_ix, sent in enumerate(point.sentences):
                for w_ix in range(len(sent)):
                    for k in range(3):
                        context_sentence_ixs[ix, k, word_ix] = [total_sent_ix, k, w_ix]
                    word_ix += 1
                total_sent_ix += 1
        data[self._context_sentence_ixs] = context_sentence_ixs
    return data
def preprocess_par(questions: List, evidence, preprocessor, n_processes=2,
                   chunk_size=200, name=None):
    if chunk_size <= 0:
        raise ValueError("Chunk size must be >= 1, but got %s" % chunk_size)
    if n_processes is not None and n_processes <= 0:
        raise ValueError("n_processes must be >= 1 or None, but got %s" % n_processes)
    n_processes = min(len(questions), n_processes)

    if n_processes == 1:
        out = preprocessor.preprocess(tqdm(questions, desc=name, ncols=80), evidence)
        preprocessor.finalize_chunk(out)
        return out
    else:
        from multiprocessing import Pool
        chunks = split(questions, n_processes)
        chunks = flatten_iterable([group(c, chunk_size) for c in chunks])
        print("Processing %d chunks with %d processes" % (len(chunks), n_processes))
        pbar = tqdm(total=len(questions), desc=name, ncols=80)
        lock = Lock()

        def call_back(results):
            preprocessor.finalize_chunk(results[0])
            with lock:
                # FIXME Even with the lock, the progress bar still jumps around
                pbar.update(results[1])

        with Pool(n_processes) as pool:
            results = [pool.apply_async(_preprocess_and_count,
                                        [c, evidence, preprocessor],
                                        callback=call_back)
                       for c in chunks]
            results = [r.get()[0] for r in results]

        pbar.close()
        output = results[0]
        for r in results[1:]:
            output += r
        return output
def get_evidence_voc(corpus, n_processes=1):
    doc_ids = corpus.list_documents()
    voc = Counter()

    if n_processes == 1:
        for doc in tqdm(doc_ids):
            # get_document(flat=True) returns a flat token list; count its tokens
            voc.update(corpus.get_document(doc, flat=True))
    else:
        from multiprocessing import Pool
        chunks = split(doc_ids, n_processes)
        chunks = flatten_iterable(group(x, 10000) for x in chunks)
        pbar = tqdm(total=len(chunks), ncols=80)
        with Pool(n_processes) as pool:
            for v in pool.imap_unordered(_extract_voc_tuple, [[corpus, c] for c in chunks]):
                voc += v
                pbar.update(1)
        pbar.close()

    return voc
def any_found(self, para):
    words = [x.lower() for x in flatten_iterable(para)]
    occurances = []
    for answer_ix, answer in enumerate(self.answer_tokens):
        word_starts = [i for i, w in enumerate(words) if answer[0] == w]
        n_tokens = len(answer)
        for start in word_starts:
            end = start + 1
            ans_token = 1
            while ans_token < n_tokens and end < len(words):
                next = words[end]
                if answer[ans_token] == next:
                    ans_token += 1
                    end += 1
                else:
                    break
            if n_tokens == ans_token:
                occurances.append((start, end))
    return list(set(occurances))
def check_preprocess():
    data = TriviaQaWebDataset()
    merge = MergeParagraphs(400)
    questions = data.get_dev()
    pre = WithIndicators(False)
    remove_cross = WithIndicators(True)
    rng = np.random.RandomState(0)
    rng.shuffle(questions)

    for q in tqdm(questions[:1000]):
        doc = rng.choice(q.all_docs, 1)[0]
        text = data.evidence.get_document(doc.doc_id, n_tokens=800)
        paras = merge.split_annotated(text, doc.answer_spans)
        para = paras[np.random.randint(0, len(paras))]
        built = pre.encode_extracted_paragraph(q.question, para)

        expected_text = flatten_iterable(para.text)
        if expected_text != [x for x in built.text if x not in pre.special_tokens()]:
            raise ValueError()

        expected = [expected_text[s:e + 1] for s, e in para.answer_spans]
        expected = Counter([tuple(x) for x in expected])

        actual = [tuple(built.text[s:e + 1]) for s, e in built.answer_spans]
        actual_cleaned = Counter(tuple(z for z in x if z not in pre.special_tokens())
                                 for x in actual)
        if actual_cleaned != expected:
            raise ValueError()

        r_built = remove_cross.encode_extracted_paragraph(q.question, para)
        rc = Counter(tuple(r_built.text[s:e + 1]) for s, e in r_built.answer_spans)
        removed = Counter()
        for w in actual:
            if all(x not in pre.special_tokens() for x in w):
                removed[w] += 1

        if rc != removed:
            raise ValueError()