def regularize_text(self): """Regularizes all tokens for each sentence in each paragraph.""" if not self.paragraphs: self.tokenize_sentences() for i, para in enumerate(self.paragraphs): for j, sent in enumerate(para.sentence_tokens): self.paragraphs[i].sentence_tokens[j] = regularize(sent) # Remove empty sentences self.paragraphs[i].sentence_tokens = [x for x in self. paragraphs[i].sentence_tokens if x]
def _compute_score(self, raw_tokens, ans_eng):
    """Compute various score components from the answer text.

    Args:
        raw_tokens: List of token strings, such as from a sentence.
        ans_eng: The AnswerEngine object that was used to generate this
            Answer object.

    Returns:
        term_count: Integer count of how many query-terms had a matching
            answer-term with a semantic relatedness (LCH) value above the
            threshold specified in the ans_eng.
        related: List of LCH similarity values (semantic relatedness),
            representing the maximum LCH value for each query-term when
            evaluated against each answer-term.
        causal_match: Boolean value representing whether or not the
            (usually hidden) causal term had a match (LCH greater than
            the threshold in ans_eng).
        position: List of token indexes, one per query term, representing
            the location in raw_tokens that had the maximal semantic
            relatedness (LCH) for that query term.
    """
    term_count = 0
    related = []
    causal_match = False
    position = []
    for term, synsets in ans_eng.ir_query_tagged:
        match = False
        term_related = []
        for i, page_term in enumerate(indexer.regularize(raw_tokens)):
            page_term_related = ans_eng.related_values(synsets, page_term)
            if page_term_related:
                term_related.append((max(page_term_related), i))
                if term == page_term or max(page_term_related) >= ans_eng.lch:
                    match = True
        if match:  # above LCH value
            term_count += 1
            if term == 'cause':
                causal_match = True
        if term_related:
            term_related.sort(key=lambda tup: tup[0])
            term_related, i = term_related[-1]  # maximum value
            related.append(term_related)
            position.append(i)
    return term_count, related, causal_match, position

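# `ans_eng.related_values(synsets, page_term)` is expected to return
# Leacock-Chodorow (LCH) similarity scores between the query term's WordNet
# synsets and the page term's synsets. A minimal sketch of that idea using
# NLTK's WordNet interface follows; this is an assumption about the
# implementation, shown for illustration only.
from nltk.corpus import wordnet

def _sketch_related_values(synsets, page_term):
    """Return LCH scores between query synsets and page_term's synsets."""
    values = []
    for query_syn in synsets:
        for page_syn in wordnet.synsets(page_term):
            # lch_similarity is only defined for synsets that share a
            # part of speech.
            if query_syn.pos() == page_syn.pos():
                score = query_syn.lch_similarity(page_syn)
                if score is not None:
                    values.append(score)
    return values
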
def __init__(self, index, query, start=0, num_top=10, lch=2.16):
    """Inits AnswerEngine by querying the IR module to get Page objects.

    Args:
        index: An indexer.Index object, which represents the IR system.
        query: The direct query string from the user.
        start: The number of pages to offset from the beginning of the
            page list returned by the index.
        num_top: The number of pages (from the top of the ranked list of
            pages, sorted by similarity, returned by the index) to extract
            answers from. Combined with the start argument, this allows
            for paging through the results by only looking at a certain
            number of pages at a time.
        lch: The Leacock-Chodorow Similarity threshold, used to determine
            whether two WordNet senses (synsets) are related. The default
            value has been empirically determined to provide good results,
            though it may be fine-tuned. This argument should be a float.
    """
    self.query = query
    self.start = start
    self.num_top = num_top
    self.lch = lch
    self.answers = None
    # Candidate Document Selection
    self.ir_query = indexer.regularize(indexer.tokenizer.tokenize(query))
    self.ir_query_tagged = None
    page_sim = index.ranked(self.ir_query)
    self.num_pages = len(page_sim)
    # Reduce the number of pages we need to fetch from disk
    page_sim = page_sim[start:num_top]
    page_ids, similarity = zip(*page_sim)
    # Retrieve the Page objects from the list of Page.IDs
    self.pages = index.get_page(page_ids)
    # Tell each page the value of its similarity score
    for page, sim in zip(self.pages, similarity):
        page.cosine_sim = sim

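# Hypothetical usage sketch. The Index constructor arguments and the query
# below are assumptions for illustration; only AnswerEngine's signature is
# taken from the code above.
#
#     import indexer
#
#     index = indexer.Index('path/to/index')  # assumed constructor
#     eng = AnswerEngine(index, 'What causes earthquakes?', num_top=10)
#     for page in eng.pages:
#         print(page.cosine_sim)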