def focus_words(self):
    """Return the distinct focus-word tokens drawn from the question and all answers.

    Question tokens are selected by ``q_focus_mask``; answer tokens by the
    per-answer ``a_focus_mask``. Duplicates are removed (order is unspecified,
    as in a set).
    """
    # Start with the focus words of the question itself.
    words = {token(w) for i, w in enumerate(self.q_doc) if self.q_focus_mask[i]}
    # Fold in the focus words of every answer.
    for aid, a_doc in enumerate(self.a_docs):
        words.update(
            token(w) for i, w in enumerate(a_doc) if self.a_focus_mask[aid][i]
        )
    return list(words)
def _keep_answer(self, a_doc):
    """Decide whether an answer contains enough meaningful words to keep.

    A word counts as meaningful when it is not a stop word, punctuation,
    or whitespace, and its token text is not a bare "yes"/"no". Answers
    with fewer than KEEP_ANS_MIN_WORD meaningful words are rejected and
    logged at debug level.
    """
    meaningful = sum(
        1
        for w in a_doc
        if not (
            w.is_stop
            or w.is_punct
            or w.is_space
            or token(w) == "yes"
            or token(w) == "no"
        )
    )
    if meaningful >= KEEP_ANS_MIN_WORD:
        return True
    logger.debug(u"Filter out answer: {}".format(a_doc))
    return False
def _spacy_doc_to_token(self, doc):
    """Convert a spaCy doc into parallel token and POS lists.

    Each word is normalized via ``correct_token(token(w))``; words whose
    normalized form is falsy are dropped, and their original positions are
    recorded so callers can realign other per-token data.

    Args:
        doc: an iterable of spaCy-like tokens exposing ``pos_``.

    Returns:
        A 4-tuple ``(que_len, tokens, POSs, delete_idx)`` where ``que_len``
        is ``len(tokens)``, ``tokens``/``POSs`` are the kept normalized
        tokens and their POS tags, and ``delete_idx`` lists the indices of
        dropped words.
    """
    tokens, POSs, delete_idx = [], [], []
    # Fix: the original bound the loop index to `_` and then used it — a
    # misleading idiom — and initialized que_len to a dead value; use a
    # real index name and compute the length once at the end.
    for idx, w in enumerate(doc):
        t = correct_token(token(w))
        if t:
            tokens.append(t)
            POSs.append(w.pos_)
        else:
            delete_idx.append(idx)
    que_len = len(tokens)
    assert que_len == len(POSs)
    return que_len, tokens, POSs, delete_idx
def cntxt_words(self):
    """Return the context-word tokens gathered from every answer.

    Tokens are selected by the per-answer ``a_cntxt_mask``; duplicates are
    kept and order follows answer order then token order.
    """
    return [
        token(w)
        for aid, a_doc in enumerate(self.a_docs)
        for i, w in enumerate(a_doc)
        if self.a_cntxt_mask[aid][i]
    ]
def count_vocab(self, doc):
    """Accumulate vocabulary and POS-tag frequencies from a doc.

    Each word is normalized via ``correct_token(token(w))``; words whose
    normalized form is falsy are skipped entirely (neither counter is
    updated for them).
    """
    for word in doc:
        normalized = correct_token(token(word))
        if not normalized:
            continue
        self.vocab_cnt[normalized] += 1
        self.pos_cnt[word.pos_] += 1