def make_utterance_markup(self, doc, utt_index, translation, budget,
                          stem=False):
    score_key = translation + (".stem_match" if stem else ".exact_match")
    match_style = "rel_close_match" if stem else "rel_exact_match"
    utt = doc.utterances[utt_index]["translations"][translation]
    token_scores = np.array(
        doc.annotations[score_key]["annotation"][utt_index]["word"]["matches"])
    # Expects a single match column, i.e. a one-term query.
    assert token_scores.ndim == 2 and token_scores.shape[1] == 1
    token_scores = token_scores.ravel()

    line_items = []
    for t, token in enumerate(utt.tokens):
        if token_scores[t] > 0:
            # Placeholder markup ("span_class", "[STYLE]") survives
            # detokenization; it is rewritten into real HTML below.
            line_items.append(
                '<span_class="[STYLE]">{}</span>'.format(token.word))
        else:
            line_items.append(token.word)

    line = detokenize(" ".join(line_items))
    wc = len(line.split())
    if wc > budget:
        wc = budget
        line = " ".join(line.split()[:wc]) + "..."
    line = re.sub(r"span_class", "span class", line)
    line = re.sub(r"\[STYLE\]", match_style, line)
    return "<p>{}</p>\n".format(line), wc
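# Why the "span_class"/"[STYLE]" placeholders above: detokenize() consumes a
# whitespace-joined token stream, so a literal '<span class="...">' would be
# split apart at its internal space. A minimal sketch of the round-trip
# (illustrative only):
#
#     items = ['<span_class="[STYLE]">dog</span>', 'runs']
#     line = detokenize(" ".join(items))
#     line = re.sub(r"span_class", "span class", line)
#     line = re.sub(r"\[STYLE\]", "rel_exact_match", line)
#     # -> '<span class="rel_exact_match">dog</span> runs'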
def make_utterance_markup(self, utt, budget, exact_matches, close_matches):
    line_items = []
    matches = []
    for token in utt.tokens:
        if token in exact_matches:
            line_items.append(
                '<span_class="[EXACTREL]">' + token.word + '</span>')
            matches.append(1)
        elif token in close_matches:
            line_items.append(
                '<span_class="[REL]">' + token.word + '</span>')
            matches.append(1)
        else:
            line_items.append(token.word)
            matches.append(0)

    # If the utterance exceeds the budget, try to center the excerpt on the
    # span of matched tokens, then grow the window outward until the budget
    # is filled.
    l, r = None, None
    new_line_items = line_items
    if len(line_items) > budget:
        match_inds = [i for i in range(len(matches)) if matches[i] == 1]
        if len(match_inds) > 0:
            l = match_inds[0]
            r = match_inds[-1]
            if r - l + 1 > budget:
                # Match span itself is over budget; fall back to plain
                # front truncation below.
                new_line_items = line_items
                l, r = None, None
            else:
                new_line_items = line_items[l:r + 1]
                while len(new_line_items) < budget:
                    if l > 0:
                        l = l - 1
                        new_line_items.insert(0, line_items[l])
                    if len(new_line_items) == budget:
                        break
                    if r < len(line_items) - 1:
                        r = r + 1
                        new_line_items.append(line_items[r])

    line = detokenize(" ".join(new_line_items))
    wc = len(line.split())
    if wc > budget:
        wc = budget
        line = " ".join(line.split()[:wc]) + "..."
    if l is not None and l > 0:
        line = '...' + line
    # Only add a trailing ellipsis if the excerpt stops before the last token.
    if r is not None and r < len(line_items) - 1:
        if not line.endswith('...'):
            line = line + '...'
    line = re.sub(r"span_class", "span class", line)
    line = re.sub(r"\[EXACTREL\]", "rel_exact_match", line)
    line = re.sub(r"\[REL\]", "rel_close_match", line)
    return "<p>{}</p>\n".format(line), wc
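# Illustrative sketch (not used by the pipeline): the window-centering logic
# above, isolated as a pure function over 0/1 match flags. The name
# _center_window is hypothetical. Given a budget smaller than the utterance,
# it returns the inclusive (l, r) slice that covers all matches and grows
# outward until the budget is filled, or None when no centered window applies.
def _center_window(match_flags, budget):
    if len(match_flags) <= budget:
        return None
    hits = [i for i, m in enumerate(match_flags) if m == 1]
    if not hits or hits[-1] - hits[0] + 1 > budget:
        return None
    l, r = hits[0], hits[-1]
    while r - l + 1 < budget:
        if l > 0:
            l -= 1
        if r - l + 1 < budget and r < len(match_flags) - 1:
            r += 1
    return l, r

# e.g. _center_window([0, 0, 1, 1, 0, 0, 0, 0], budget=4) -> (1, 4)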
def make_utterance_markup(self, doc, utt_index, translation, budget, query,
                          stem=False):
    score_key = translation + (".stem_match" if stem else ".exact_match")
    match_style = "rel_close_match" if stem else "rel_exact_match"
    rel_style = "rel_close" if stem else "rel_exact"
    utt = doc.utterances[utt_index]["translations"][translation]
    raw_token_scores = np.array(
        doc.annotations[score_key]["annotation"][utt_index]["word"]["matches"])

    query_tokens = [x.word for x in query.content.tokens
                    if x.word.lower() not in en_stopwords]
    # One match column per non-stopword query term.
    assert len(query_tokens) == raw_token_scores.shape[1]

    found_words = set()
    for qword, qscore in zip(query_tokens, raw_token_scores.sum(axis=0)):
        if qscore > 0:
            found_words.add(qword)

    token_scores = raw_token_scores.sum(axis=1)
    token_scores = token_scores.ravel()

    # If any query term went unmatched, fall back to embedding similarity and
    # mark the two most similar unmatched utterance tokens.
    mark_sims = set()
    if any(raw_token_scores.sum(axis=0) == 0):
        sim_scores = np.array(
            doc.annotations[translation + ".glove42Bsim.content_semcons"]
            ["annotation"][utt_index]["word"]["sims"]).ravel()
        for top_sim in np.argsort(sim_scores)[::-1]:
            if token_scores[top_sim] == 0:
                mark_sims.add(top_sim)
                if len(mark_sims) >= 2:
                    break

    line_items = []
    for t, token in enumerate(utt.tokens):
        if token_scores[t] > 0:
            line_items.append(
                '<span_class="[STYLE]">{}</span>'.format(token.word))
        elif t in mark_sims:
            line_items.append(
                '<span_class="[REL]">{}</span>'.format(token.word))
        else:
            line_items.append(token.word)

    line = detokenize(" ".join(line_items))
    wc = len(line.split())
    if wc > budget:
        wc = budget
        line = " ".join(line.split()[:wc]) + "..."
    line = re.sub(r"span_class", "span class", line)
    line = re.sub(r"\[STYLE\]", match_style, line)
    line = re.sub(r"\[REL\]", rel_style, line)
    return "<p>{}</p>\n".format(line), wc
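# Toy illustration of the similarity fallback above (assumed shapes:
# sim_scores holds one similarity per utterance token; token_scores counts
# exact matches per token):
#
#     token_scores = np.array([1, 0, 0, 0])
#     sim_scores   = np.array([0.9, 0.2, 0.8, 0.5])
#     np.argsort(sim_scores)[::-1]        # -> array([0, 2, 3, 1])
#     # token 0 is skipped (already an exact match); tokens 2 and 3 get [REL]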
def make_utterance_markup(self, utt, budget, exact_matches, close_matches):
    line_items = []
    for token in utt.tokens:
        if token in exact_matches:
            line_items.append(
                '<span_class="[EXACTREL]">' + token.word + '</span>')
        elif token in close_matches:
            line_items.append(
                '<span_class="[REL]">' + token.word + '</span>')
        else:
            line_items.append(token.word)
    line = detokenize(" ".join(line_items))
    wc = len(line.split())
    if wc > budget:
        wc = budget
        line = " ".join(line.split()[:wc]) + "..."
    line = re.sub(r"span_class", "span class", line)
    line = re.sub(r"\[EXACTREL\]", "rel_exact_match", line)
    line = re.sub(r"\[REL\]", "relevant", line)
    return "<p>{}</p>\n".format(line), wc
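# Expected contract for this simpler variant (token objects expose .word;
# exact_matches / close_matches are sets of those token objects; the names
# below are hypothetical):
#
#     line, wc = self.make_utterance_markup(
#         utt, budget=20, exact_matches={dog_tok}, close_matches=set())
#     # line == '<p><span class="rel_exact_match">dog</span> runs</p>\n'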
def __call__(self, doc, budget=100, make_header=True):
    query = doc.annotations["QUERY"]

    # Rank utterances under each scorer, then merge the per-scorer rankings.
    best_translations = []
    scores = []
    for i in range(len(doc.utterances)):
        trans, trans_score = self.get_best_translation(doc.annotations, i)
        best_translations.append(trans)
        scores.append(trans_score)
    scores = np.stack(scores).T
    ranks = np.argsort(scores, axis=1)[:, ::-1]
    merged_ranks = merge_rankings(ranks)

    markup_lines = []
    if make_header:
        header, header_size = make_relevant_header(query)
        markup_lines.append(header)
        size = header_size
    else:
        size = 0

    # Take top-ranked utterances until the word budget is spent, then restore
    # document order.
    ranked_utterances = []
    for i in merged_ranks:
        best_trans = best_translations[i]
        utt = doc.utterances[i]["translations"][best_trans]
        num_words = len(
            detokenize(" ".join([x.word for x in utt.tokens])).split())
        size += num_words
        ranked_utterances.append({
            "index": i,
            "utt": utt,
            "trans": best_trans
        })
        if size > budget:
            break
    ranked_utterances.sort(key=lambda x: x["index"])

    size = header_size if make_header else 0

    # Collect exact matches, plus up to six close matches: the noun/verb
    # tokens with the highest embedding similarity to the query.
    exact_matches = set()
    close_matches = set()
    t2s = {}
    for x in ranked_utterances:
        tokens = x["utt"].tokens
        x_matches = doc.annotations[
            x["trans"] + ".exact_match"]["annotation"][
                x["index"]]["word"]["matches"]
        c_matches = doc.annotations[
            x["trans"] + ".glove42Bsim.content_semcons"]["annotation"][
                x["index"]]["word"]["sims"]
        for t, m, sim in zip(tokens, x_matches, c_matches):
            if np.sum(m) > 0:
                exact_matches.add(t)
            if t.pos in ["NN", "VB"]:
                t2s[t] = sim[0]
    sim_toks = sorted(t2s, key=lambda t: t2s[t], reverse=True)
    for t in sim_toks:
        if t in exact_matches:
            continue
        close_matches.add(t)
        if len(close_matches) > 5:
            break

    for x in ranked_utterances:
        line, wc = self.make_utterance_markup(
            x["utt"], budget - size, exact_matches, close_matches)
        size += wc
        markup_lines.append(line)
        if size >= budget:
            break

    found_terms = set([t.word.lower() for t in exact_matches])
    missing_terms = set([
        t.word.lower() for t in query.content.tokens
        if t.word.lower() not in found_terms
    ])
    instr = get_instructions(query.string, found_terms, missing_terms)
    return "\n".join(markup_lines), instr
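# merge_rankings is defined elsewhere; below is a minimal Borda-count sketch
# of what it is assumed to do (fuse several per-scorer utterance rankings into
# one best-first ordering). The real implementation may differ, and the name
# _merge_rankings_sketch is hypothetical.
def _merge_rankings_sketch(ranks):
    # ranks: (num_scorers, num_items) array; each row lists item indices
    # best-first.
    num_scorers, num_items = ranks.shape
    points = np.zeros(num_items)
    for row in ranks:
        for position, item in enumerate(row):
            points[item] += num_items - position  # earlier rank, more points
    return np.argsort(points)[::-1]

# e.g. _merge_rankings_sketch(np.array([[0, 1, 2], [2, 0, 1]]))
# -> array([0, 2, 1])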
def __call__(self, doc, budget=100):
    query = doc.annotations["QUERY"]
    best_translations = self.get_best_translations(doc)
    scores = self.get_scores(doc)
    I = np.argsort(scores)[::-1]
    if scores[I[0]] == 0:
        # No morphological match anywhere; back off to the concept markup.
        return ConceptV2(*self.default_args, **self.default_kwargs)(
            doc, budget=budget)

    if self.header:
        header = "Match found for {}, check number/tense/meaning:".format(
            " ".join([t.word for t in query.content.tokens]))
        size = len(header.split())
        markup_lines = ["<h1>{}</h1>".format(header)]
    else:
        markup_lines = []
        size = 0

    meta = {
        "translation": [],
        "markup": "morphv1",
        "utterance_ids": [],
        "source_offsets": [],
        "mode": doc.mode,
        "source_md5": doc.md5
    }

    for idx in I:
        score = scores[idx]
        if score == 0:
            break
        trans = best_translations[idx]
        sent = doc.utterances[idx]["translations"][trans]
        src_utt = doc.utterances[idx]["source"]
        tokens = [token.word for token in sent.tokens]
        mname = self.translation_annotators[trans][0][0]
        for m in doc.annotations[mname]["annotation"][idx]:
            # enumerate starts at the match's token offset in the utterance.
            for j, s in enumerate(m["match_quality"], m["token_position"]):
                if s >= 1:
                    tokens[j] = '<span_class="RELEXACTMATCH">' \
                        + tokens[j] + '</span>'
                else:
                    tokens[j] = '<span_class="RELEXACT">' \
                        + tokens[j] + '</span>'
        line = detokenize(" ".join(tokens))
        wc = len(line.split())
        if wc + size > budget:
            wc = budget - size
            line = " ".join(line.split()[:wc]) + "..."
        size += wc
        line = line.replace("RELEXACTMATCH", "rel_exact_match")
        line = line.replace("RELEXACT", "rel_close_match")
        line = line.replace("span_class", "span class")
        markup_lines.append("<p>{}</p>".format(line))
        meta["translation"].append(trans)
        meta["utterance_ids"].append(int(idx))
        meta["source_offsets"].append(src_utt.offsets)
        if size >= budget:
            break

    found_terms = self.get_found_words(doc, best_translations, query)
    missing_terms = [t.word.lower() for t in query.content.tokens
                     if t.word.lower() not in found_terms
                     and t.word.lower() not in en_stopwords]
    instructions = get_instructions(query.string, found_terms, missing_terms)
    return "\n".join(markup_lines), instructions, meta

    # Unreachable legacy morph-matching path, kept below for reference.
    # if query.morphological_constraint.morph.pos != "NN":
    #     return "<p>SKIP</p>"
    pos = "np"
    matches = []
    match_qualities = []
    for i, utt in enumerate(doc):
        tr2ann = {}
        tr2score = {}
        for trans in translations:
            k = trans + ".morph_match_" + pos
            ann = doc.annotations[k]["annotations"][i]
            if len(ann) > 0:
                tr2ann[trans] = ann
                tr2score[trans] = max([x["match_score"] for x in ann])
        if len(tr2ann) == 0:
            continue
        srt_trans = sorted(tr2ann.keys(), key=lambda x: tr2score[x],
                           reverse=True)
        # Break score ties in favor of the NMT translation.
        if len(srt_trans) > 1 \
                and tr2score[srt_trans[0]] == tr2score[srt_trans[1]]:
            if "nmt" in srt_trans[0]:
                best_trans = srt_trans[0]
            else:
                best_trans = srt_trans[1]
        else:
            best_trans = srt_trans[0]
        for ann in tr2ann[best_trans]:
            match_qualities.append([x >= 1 for x in ann["match_quality"]])
        matches.append({
            "sent": i,
            "trans": best_trans,
            "anns": tr2ann[best_trans],
            "score": tr2score[best_trans],
            "exact_morph": any([x["exact_morph"] for x in tr2ann[best_trans]])
        })

    # sort is stable: sorting by score and then by exact_morph puts exact
    # matches first, each group ordered by highest score, then soft matches.
    matches.sort(key=lambda x: x["score"], reverse=True)
    matches.sort(key=lambda x: x["exact_morph"], reverse=True)

    if len(match_qualities) == 0:
        result = ConceptV2(*self.default_args, **self.default_kwargs)(
            doc, budget=budget)
        result[2]["markup"] = "morph-backoff-conceptv2"
        return result

    found_term_ind = np.array(match_qualities).sum(axis=0)
    found_terms = [
        q.word for q, ind in zip(query.content.tokens, found_term_ind)
        if ind
    ]

    markup_lines = []
    header, size = make_word_match_header(query, found_terms)
    markup_lines.append(header)
    for match in matches:
        sent = doc.utterances[match["sent"]]["translations"][match["trans"]]
        tokens = [token.word for token in sent.tokens]
        for m in match["anns"]:
            for j, s in enumerate(m["match_quality"], m["token_position"]):
                if s >= 1:
                    tokens[j] = '<span_class="RELEXACTMATCH">' \
                        + tokens[j] + '</span>'
                else:
                    tokens[j] = '<span_class="RELEXACT">' \
                        + tokens[j] + '</span>'
        line = detokenize(" ".join(tokens))
        wc = len(line.split())
        if wc + size > budget:
            wc = budget - size
            line = " ".join(line.split()[:wc]) + "..."
        size += wc
        line = line.replace("RELEXACTMATCH", "rel_exact_match")
        line = line.replace("RELEXACT", "rel_exact")
        line = line.replace("span_class", "span class")
        markup_lines.append("<p>{}</p>".format(line))
        if size >= budget:
            break

    missing_terms = [
        t.word.lower() for t in query.content.tokens
        if t.word.lower() not in found_terms
    ]
    instructions = get_instructions(query.string, found_terms, missing_terms)
    return "\n".join(markup_lines), instructions
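# The double sort in the legacy path relies on Python's stable sort: sorting
# by score first and then by exact_morph keeps the score order within each
# exact/inexact group. Tiny illustration (hypothetical values):
#
#     xs = [{"score": 2, "exact_morph": False},
#           {"score": 3, "exact_morph": False},
#           {"score": 1, "exact_morph": True}]
#     xs.sort(key=lambda x: x["score"], reverse=True)
#     xs.sort(key=lambda x: x["exact_morph"], reverse=True)
#     # -> the exact match first, then inexact matches with scores 3, 2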