def preprocess_instance(self, idd: int, question: QASetting,
                        answers: Optional[List[Answer]] = None) -> MCAnnotation:
    """Converts a single example into an MCAnnotation: tokenizes the question and
    the first support, maps tokens to vocab IDs, and looks up the answer label."""
    has_answers = answers is not None

    q_tokenized, q_ids, q_length, _, _ = preprocessing.nlp_preprocess(
        question.question, self.shared_resources.vocab,
        lowercase=self.shared_resources.config.get('lowercase', True))
    s_tokenized, s_ids, s_length, _, _ = preprocessing.nlp_preprocess(
        question.support[0], self.shared_resources.vocab,
        lowercase=self.shared_resources.config.get('lowercase', True))

    return MCAnnotation(
        question_tokens=q_tokenized, question_ids=q_ids, question_length=q_length,
        support_tokens=s_tokenized, support_ids=s_ids, support_length=s_length,
        answer=self.shared_resources.answer_vocab(answers[0].text) if has_answers else 0,
        id=idd)
def preprocess(self, questions: List[QASetting],
               answers: Optional[List[List[Answer]]] = None,
               is_eval: bool = False) -> List[Mapping[str, any]]:
    """Converts a batch of examples into feature dictionaries of token IDs and lengths."""
    preprocessed = list()
    for i, qa in enumerate(questions):
        _, token_ids, length, _, _ = preprocessing.nlp_preprocess(
            qa.question, self.shared_resources.vocab,
            lowercase=self.shared_resources.config.get('lowercase', True))
        _, s_token_ids, s_length, _, _ = preprocessing.nlp_preprocess(
            qa.support[0], self.shared_resources.vocab,
            lowercase=self.shared_resources.config.get('lowercase', True))
        preprocessed.append({
            'supports': s_token_ids,
            'question': token_ids,
            'support_lengths': s_length,
            'question_lengths': length,
            'ids': i,
        })
        if answers is not None:
            preprocessed[-1]["answers"] = self.shared_resources.answer_vocab(answers[i][0].text)
    return preprocessed
def preprocess_instance(self, idd: int, question: QASetting,
                        answers: Optional[List[Answer]] = None) -> Optional[MCAnnotation]:
    """Converts a single example into an MCAnnotation. When use_dep_sa is enabled,
    pre-computed tokens and dependency edges on the QASetting are reused; otherwise
    the question and first support are tokenized and mapped to vocab IDs here."""
    has_answers = answers is not None

    if self.shared_resources.config.get("use_dep_sa", False):
        # Tokens and dependency edges were computed upstream; carry them through.
        anno = MCAnnotation(
            question_tokens=question.q_tokenized,
            question_ids=None,
            question_length=len(question.q_tokenized),
            support_tokens=question.s_tokenized,
            support_ids=None,
            support_length=len(question.s_tokenized),
            answer=self.shared_resources.answer_vocab(answers[0].text) if has_answers else 0,
            id=idd,
            question_dep_i=question.q_dep_i,
            question_dep_j=question.q_dep_j,
            question_dep_type=question.q_dep_type,
            support_dep_i=question.s_dep_i,
            support_dep_j=question.s_dep_j,
            support_dep_type=question.s_dep_type,
        )
        return anno
    else:
        q_tokenized, q_ids, q_length, _, _ = preprocessing.nlp_preprocess(
            question.question, self.shared_resources.vocab,
            lowercase=self.shared_resources.config.get('lowercase', True))
        s_tokenized, s_ids, s_length, _, _ = preprocessing.nlp_preprocess(
            question.support[0], self.shared_resources.vocab,
            lowercase=self.shared_resources.config.get('lowercase', True))
        return MCAnnotation(
            question_tokens=q_tokenized,
            question_ids=q_ids,
            question_length=q_length,
            support_tokens=s_tokenized,
            support_ids=s_ids,
            support_length=s_length,
            answer=self.shared_resources.answer_vocab(answers[0].text) if has_answers else 0,
            id=idd,
            question_dep_i=None,
            question_dep_j=None,
            question_dep_type=None,
            support_dep_i=None,
            support_dep_j=None,
            support_dep_type=None,
        )
def preprocess(self, questions: List[QASetting],
               answers: Optional[List[List[Answer]]] = None,
               is_eval: bool = False) -> List[Mapping[str, any]]:
    """Converts a batch of examples into feature dictionaries of tokens, lemmas
    (via spaCy) and lengths."""
    preprocessed = list()
    for i, qa in enumerate(questions):
        tokens, _, length, lemmas, _ = preprocessing.nlp_preprocess(
            qa.question, self.shared_resources.vocab,
            lowercase=True, with_lemmas=True, use_spacy=True)
        s_tokens, _, s_length, s_lemmas, _ = preprocessing.nlp_preprocess(
            qa.support[0], self.shared_resources.vocab,
            lowercase=True, with_lemmas=True, use_spacy=True)
        preprocessed.append({
            'support_tokens': s_tokens,
            'support_lemmas': s_lemmas,
            'support_lengths': s_length,
            'question_tokens': tokens,
            'question_lemmas': lemmas,
            'question_lengths': length,
            'ids': i,
        })
        if answers is not None:
            preprocessed[-1]["answers"] = self.shared_resources.answer_vocab(answers[i][0].text)
    return preprocessed
def test_vocab():
    train_data = [
        QASetting(question='A person is training his horse for a competition.',
                  support=['A person on a horse jumps over a broken down airplane.'],
                  candidates=['entailment', 'neutral', 'contradiction'])
    ]

    print('build vocab based on train data')
    train_vocab = preprocessing.fill_vocab(train_data)
    train_vocab.freeze()
    pprint(train_vocab._sym2freqs)
    pprint(train_vocab._sym2id)

    MIN_VOCAB_FREQ, MAX_VOCAB_CNT = 2, 10
    train_vocab = train_vocab.prune(MIN_VOCAB_FREQ, MAX_VOCAB_CNT)
    pprint(train_vocab._sym2freqs)
    pprint(train_vocab._sym2id)

    print('encode train data')
    train_data = preprocessing.nlp_preprocess(train_data[0].question, train_vocab)[0]
    print(train_data)
def prepare_data(qa_setting: QASetting, answers: Optional[List[Answer]], vocab: Vocab,
                 lowercase: bool = False, with_answers: bool = False,
                 wiq_contentword: bool = False, spacy_nlp: bool = False,
                 max_support_length: int = None, lemmatize=False, with_lemmas=False) \
        -> Tuple[List[str], List[int], Optional[List[int]], int,
                 List[List[str]], List[List[int]], Optional[List[List[int]]], List[int],
                 List[List[float]], List[List[int]], List[List[Tuple[int, int]]]]:
    """Preprocesses a question and (optionally) answers: the steps include tokenization,
    lower-casing, translation to IDs, computing the word-in-question feature, computing
    token offsets, truncating supports, and computing answer spans.
    """
    supports = qa_setting.support
    question = qa_setting.question

    question_tokens, question_ids, question_length, question_lemmas, _ = preprocessing.nlp_preprocess(
        question, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
        lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=False)
    question_tokens_set = set(t.lower() for t in question_tokens)

    preprocessed_supports = [
        preprocessing.nlp_preprocess(
            support, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
            lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=True)
        for support in supports]

    all_support_tokens = [s[0] for s in preprocessed_supports]
    all_support_ids = [s[1] for s in preprocessed_supports]
    all_support_length = [s[2] for s in preprocessed_supports]
    all_support_lemmas = [s[3] for s in preprocessed_supports]
    all_token_offsets = [s[4] for s in preprocessed_supports]

    rng = random.Random(12345)

    # word-in-question feature: 1.0 if the support lemma/token also occurs in the question
    all_word_in_question = []
    if with_lemmas:
        assert all_support_lemmas is not None
        for support_lemmas in all_support_lemmas:
            all_word_in_question.append([])
            for lemma in support_lemmas:
                # note: the content-word filter assumes lemmas expose is_stop (spaCy-style)
                all_word_in_question[-1].append(float(
                    lemma in question_lemmas and
                    (not wiq_contentword or (lemma.isalnum() and not lemma.is_stop))))
    else:
        for support_tokens in all_support_tokens:
            all_word_in_question.append([])
            for token in support_tokens:
                all_word_in_question[-1].append(
                    float(token.lower() in question_tokens_set and
                          (not wiq_contentword or token.isalnum())))

    all_answer_spans = []
    for doc_idx, support_tokens in enumerate(all_support_tokens):
        min_answer = len(support_tokens)
        max_answer = 0
        token_offsets = all_token_offsets[doc_idx]

        # map character-level answer spans to token-level spans for this document
        answer_spans = []
        if with_answers:
            assert isinstance(answers, list)
            for a in answers:
                if a.doc_idx != doc_idx:
                    continue
                start = 0
                while start < len(token_offsets) and token_offsets[start] < a.span[0]:
                    start += 1
                if start == len(token_offsets):
                    continue
                end = start
                while end + 1 < len(token_offsets) and token_offsets[end + 1] < a.span[1]:
                    end += 1
                if (start, end) not in answer_spans:
                    answer_spans.append((start, end))
                    min_answer = min(min_answer, start)
                    max_answer = max(max_answer, end)

        # cut support whenever there is a maximum allowed length and recompute answer spans
        support_length = all_support_length[doc_idx]
        if max_support_length is not None and support_length > max_support_length > 0:
            if max_answer < max_support_length:
                # Find new start and end in the flattened support
                new_start = 0
                new_end = max_support_length
            else:
                offset = rng.randint(1, 11)
                new_end = max_answer
                new_start = max(0, min(min_answer, new_end + 2 * offset - max_support_length))
                while new_end - new_start > max_support_length - 2 * offset:
                    answer_spans = [(s, e) for s, e in answer_spans if e < new_end]
                    new_end = max(answer_spans, key=lambda span: span[1])[1]
                    new_start = max(0, min(min_answer, new_end + 2 * offset - max_support_length))
                new_end = min(new_end + offset, support_length)
                new_start = max(new_start - offset, 0)

            # Crop support according to new start and end pointers
            all_support_tokens[doc_idx] = support_tokens[new_start:new_end]
            all_support_ids[doc_idx] = all_support_ids[doc_idx][new_start:new_end]
            if with_lemmas:
                all_support_lemmas[doc_idx] = all_support_lemmas[doc_idx][new_start:new_end]
            answer_spans = [(s - new_start, e - new_start) for s, e in answer_spans]
            all_word_in_question[doc_idx] = all_word_in_question[doc_idx][new_start:new_end]
            all_support_length[doc_idx] = new_end - new_start
            all_token_offsets[doc_idx] = token_offsets[new_start:new_end]

        all_answer_spans.append(answer_spans)

    return question_tokens, question_ids, question_lemmas, question_length, \
        all_support_tokens, all_support_ids, all_support_lemmas, all_support_length, \
        all_word_in_question, all_token_offsets, all_answer_spans
def prepare_data(qa_setting: QASetting, answers: Optional[List[Answer]], vocab: Vocab,
                 lowercase: bool = False, with_answers: bool = False,
                 wiq_contentword: bool = False, spacy_nlp: bool = False,
                 max_support_length: int = -1, lemmatize=False, with_lemmas=False) \
        -> Tuple[List[str], List[int], Optional[List[int]], int,
                 List[str], List[int], Optional[List[int]], int,
                 List[float], List[int], List[Tuple[int, int]]]:
    """Preprocesses a question and (optionally) answers: the steps include tokenization,
    lower-casing, translation to IDs, computing the word-in-question feature, computing
    token offsets, truncating supports, and computing answer spans.
    """
    support = " ".join(qa_setting.support)
    question = qa_setting.question

    question_tokens, question_ids, question_length, question_lemmas, _ = preprocessing.nlp_preprocess(
        question, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
        lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=False)
    support_tokens, support_ids, support_length, support_lemmas, token_offsets = preprocessing.nlp_preprocess(
        support, vocab, lowercase=lowercase, use_spacy=spacy_nlp,
        lemmatize=lemmatize, with_lemmas=with_lemmas, with_tokens_offsets=True)

    rng = random.Random(12345)

    word_in_question = []
    if with_lemmas:
        assert support_lemmas is not None
        for lemma in support_lemmas:
            word_in_question.append(float(
                lemma in question_lemmas and
                (not wiq_contentword or (lemma.isalnum() and not lemma.is_stop))))
    else:
        for token in support_tokens:
            word_in_question.append(float(
                token in question_tokens and
                (not wiq_contentword or token.isalnum())))

    min_answer = len(support_tokens)
    max_answer = 0

    answer_spans = []
    if with_answers:
        assert isinstance(answers, list)
        for a in answers:
            start = 0
            while start < len(token_offsets) and token_offsets[start] < a.span[0]:
                start += 1
            if start == len(token_offsets):
                continue
            end = start
            while end + 1 < len(token_offsets) and token_offsets[end + 1] < a.span[1]:
                end += 1
            if (start, end) not in answer_spans:
                answer_spans.append((start, end))
                min_answer = min(min_answer, start)
                max_answer = max(max_answer, end)

    # cut support whenever there is a maximum allowed length and recompute answer spans
    if max_support_length is not None and len(support_tokens) > max_support_length > 0:
        support_length = max_support_length
        if max_answer < max_support_length:
            support_tokens = support_tokens[:max_support_length]
            support_ids = support_ids[:max_support_length]
            if with_lemmas:
                support_lemmas = support_lemmas[:max_support_length]
            word_in_question = word_in_question[:max_support_length]
        else:
            offset = rng.randint(1, 11)
            new_end = max_answer + offset
            new_start = max(0, min(min_answer - offset, new_end - max_support_length))
            while new_end - new_start > max_support_length:
                answer_spans = [(s, e) for s, e in answer_spans if e < (new_end - offset)]
                new_end = max(answer_spans, key=lambda span: span[1])[1] + offset
                new_start = max(0, min(min_answer - offset, new_end - max_support_length))
            support_tokens = support_tokens[new_start:new_end]
            support_ids = support_ids[new_start:new_end]
            if with_lemmas:
                support_lemmas = support_lemmas[new_start:new_end]
            answer_spans = [(s - new_start, e - new_start) for s, e in answer_spans]
            word_in_question = word_in_question[new_start:new_end]

    return question_tokens, question_ids, question_lemmas, question_length, \
        support_tokens, support_ids, support_lemmas, support_length, \
        word_in_question, token_offsets, answer_spans
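# --- Usage sketch (illustrative only) ---
# A minimal, hypothetical example of calling prepare_data above. It assumes
# jack-style QASetting/Answer constructors (Answer taking the answer string and a
# character-offset span into the support text) and reuses preprocessing.fill_vocab
# as in test_vocab; exact signatures may differ in your checkout.
def example_prepare_data():
    qa = QASetting(
        question='What does the person jump over?',
        support=['A person on a horse jumps over a broken down airplane.'])
    # character span of "a broken down airplane" in the support string (assumed Answer API)
    answer = Answer('a broken down airplane', span=(31, 53))

    vocab = preprocessing.fill_vocab([qa])
    vocab.freeze()

    (q_tokens, q_ids, q_lemmas, q_length,
     s_tokens, s_ids, s_lemmas, s_length,
     word_in_question, token_offsets, answer_spans) = prepare_data(
        qa, [answer], vocab, lowercase=True, with_answers=True, max_support_length=50)

    # token-level (start, end) spans recovered from the character offsets
    print(answer_spans)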