def __call__(self, doc, budget=100, make_header=True):
    """Render "conceptv2" markup for *doc*: rank utterances by merged
    annotator scores and emit highlighted translations until *budget*
    (word count) is exhausted.

    Args:
        doc: annotated document; must expose ``annotations``,
            ``utterances``, ``mode`` and ``md5``.
        budget: maximum number of words of markup to emit.
        make_header: accepted for interface compatibility.
            NOTE(review): this parameter is never read — the header is
            controlled by ``self.header`` instead; confirm intended.

    Returns:
        (markup, instr, meta): the joined HTML-ish markup string, the
        instruction string from ``get_instructions``, and a metadata
        dict describing which utterances/translations were used.
    """
    query = doc.annotations["QUERY"]
    best_translations = self.get_best_translations(doc)
    # One score list per configured annotator, skipping annotators whose
    # annotation is absent from this document.
    score_dict = {
        annotator_key(ann): self.get_scores(doc, ann)
        for ann in self.annotators
        if doc.annotations[ann[0]] is not None
    }
    # Sort keys so the score-list order (and thus the merge) is stable.
    keys = sorted(score_dict.keys())
    scores = [score_dict[key] for key in keys]
    ordered_indices, points = merge_scores(scores, return_points=True)
    if self.header:
        # Header counts against the word budget via its returned size.
        header, size = make_relevant_header(query)
        markup_lines = [header]
    else:
        markup_lines = []
        size = 0
    meta = {
        "translation": [],
        "markup": "conceptv2",
        "utterance_ids": [],
        "source_offsets": [],
        "mode": doc.mode,
        "source_md5": doc.md5
    }
    # Emit utterances in merged-score order until the budget runs out.
    for idx in ordered_indices:
        trans = best_translations[idx]
        utt = doc.utterances[idx]["translations"][trans]
        src_utt = doc.utterances[idx]["source"]
        exact_matches = self.get_exact_matches(doc, idx, trans)
        stem_matches = self.get_stem_matches(doc, idx, trans)
        # A token that matches exactly should not also count as a stem match.
        stem_matches = stem_matches - exact_matches
        soft_matches = self.get_soft_matches(doc, idx, trans)
        # "Close" highlighting covers both stem and soft (semantic) matches.
        close_matches = stem_matches | soft_matches
        line, wc = self.make_utterance_markup(utt, budget - size,
                                              exact_matches, close_matches)
        size += wc
        markup_lines.append(line)
        meta["translation"].append(trans)
        meta["utterance_ids"].append(int(idx))
        meta["source_offsets"].append(src_utt.offsets)
        if size >= budget:
            break
    markup = "\n".join(markup_lines)
    found_terms = self.get_found_words(doc, best_translations, query)
    # Query terms not found in the document and not stopwords.
    missing_terms = [t.word.lower() for t in query.content.tokens
                     if t.word.lower() not in found_terms and \
                     t.word.lower() not in en_stopwords]
    instr = get_instructions(query.string, found_terms, missing_terms)
    return markup, instr, meta
def __call__(self, doc, budget=100):
    """Render word-match markup: per-sentence exact matches first (by
    score), then stem matches, falling back to ``ConceptV1`` when no
    sentence matches at all.

    Args:
        doc: annotated document, iterable over utterances, with an
            ``annotations`` mapping.
        budget: maximum number of words of markup to emit.

    Returns:
        (markup, instructions) on a match; otherwise whatever
        ``ConceptV1()(doc, budget=budget)`` returns.
    """
    query = doc.annotations["QUERY"]
    exact_match_sentids = set()
    stem_match_sentids = set()
    exact_matches = []
    stem_matches = []
    for i, utt in enumerate(doc):
        # Per-sentence view of every non-QUERY annotation that exists.
        sent_ann = {
            k: doc.annotations[k]["annotation"][i]
            for k in doc.annotations.keys()
            if k != "QUERY"
            if doc.annotations[k] is not None
        }
        best_exact_trans = self.find_best_translation(sent_ann)
        exact_ann = best_exact_trans + ".exact_match"
        exact_score = doc.annotations[exact_ann]["annotation"][i]\
            ["sentence"]["sum"]
        best_stem_trans = self.find_best_translation(sent_ann, stem=True)
        stem_ann = best_stem_trans + ".stem_match"
        stem_score = doc.annotations[stem_ann]["annotation"][i]\
            ["sentence"]["sum"]
        # Exact match takes priority; a sentence is only a stem match
        # when it has no exact match.
        if exact_score > 0:
            exact_matches.append((i, exact_score, best_exact_trans))
            exact_match_sentids.add(i)
        elif stem_score > 0:
            stem_matches.append((i, stem_score, best_stem_trans))
            stem_match_sentids.add(i)
    # Highest-scoring sentences first within each category.
    exact_matches = sorted(exact_matches, key=lambda x: x[1], reverse=True)
    stem_matches = sorted(stem_matches, key=lambda x: x[1], reverse=True)
    markup_lines = []
    size = 0
    if len(exact_matches) > 0 or len(stem_matches) > 0:
        # Single shared header listing all query tokens.
        header_line, wc = make_word_match_header(
            query, [x.word for x in query.content.tokens])
        size += wc
        markup_lines.append(header_line)
    if len(exact_matches) > 0:
        for i, score, best_trans in exact_matches:
            line, wc = self.make_utterance_markup(doc, i, best_trans,
                                                  budget - size)
            markup_lines.append(line)
            size += wc
            if size >= budget:
                break
    # Stem matches only consume whatever budget the exact matches left.
    if len(stem_matches) > 0 and size < budget:
        for i, score, best_trans in stem_matches:
            line, wc = self.make_utterance_markup(doc, i, best_trans,
                                                  budget - size,
                                                  stem=True)
            markup_lines.append(line)
            size += wc
            if size >= budget:
                break
    if len(markup_lines) > 0:
        # NOTE(review): only the first query token is reported as
        # found/missing here — confirm this is intended rather than the
        # full token list.
        if len(exact_matches) > 0:
            instructions = get_instructions(
                query.string, [query.content.tokens[0].word], [])
        else:
            instructions = get_instructions(
                query.string, [], [query.content.tokens[0].word])
        return "\n".join(markup_lines), instructions
    else:
        # No matches anywhere: back off to the concept-based markup.
        return ConceptV1()(doc, budget=budget)
def __call__(self, doc, budget=100, make_header=True):
    """Render relevance-ranked markup: pick each utterance's best
    translation, merge the per-translation rankings, then emit the
    selected utterances in document order with exact/close-match
    token highlighting.

    Args:
        doc: annotated document exposing ``annotations`` and
            ``utterances``.
        budget: maximum number of words of markup to emit.
        make_header: when True, prepend the relevance header (which
            counts against the budget).

    Returns:
        (markup, instr): joined markup string and instruction string.
    """
    query = doc.annotations["QUERY"]
    best_translations = []
    scores = []
    for i in range(len(doc.utterances)):
        trans, trans_score = self.get_best_translation(doc.annotations, i)
        best_translations.append(trans)
        scores.append(trans_score)
    # Rows = scoring systems, columns = utterances; rank each row
    # descending, then merge the rankings into one utterance order.
    scores = np.stack(scores).T
    ranks = np.argsort(scores, axis=1)[:, ::-1]
    merged_ranks = merge_rankings(ranks)
    markup_lines = []
    if make_header:
        header, header_size = make_relevant_header(query)
        markup_lines.append(header)
        size = header_size
    else:
        size = 0
    # First pass: select utterances (by merged rank) until the budget
    # is reached, using detokenized word counts as the cost estimate.
    ranked_utterances = []
    for i in merged_ranks:
        best_trans = best_translations[i]
        utt = doc.utterances[i]["translations"][best_trans]
        num_words = len(
            detokenize(" ".join([x.word for x in utt.tokens])).split())
        size += num_words
        ranked_utterances.append({
            "index": i,
            "utt": utt,
            "trans": best_trans
        })
        if size > budget:
            break
    # Present selected utterances in original document order.
    ranked_utterances.sort(key=lambda x: x["index"])
    # Reset the budget counter for the actual markup pass.
    size = header_size if make_header else 0
    exact_matches = set()
    close_matches = set()
    t2s = {}
    # Collect exact-match tokens and similarity scores for candidate
    # close-match tokens (nouns/verbs) across the selected utterances.
    for x in ranked_utterances:
        tokens = x["utt"].tokens
        x_matches = doc.annotations[x["trans"] + ".exact_match"]["annotation"][
            x['index']]["word"]["matches"]
        c_matches = doc.annotations[
            x["trans"] + ".glove42Bsim.content_semcons"]["annotation"][
            x['index']]["word"]["sims"]
        for t, m, sim in zip(tokens, x_matches, c_matches):
            if np.sum(m) > 0:
                exact_matches.add(t)
            if t.pos in ["NN", "VB"]:
                t2s[t] = sim[0]
    # Close matches: up to 6 most-similar non-exact tokens.
    sim_toks = sorted(t2s, key=lambda x: t2s[x], reverse=True)
    for t in sim_toks:
        if t in exact_matches:
            continue
        close_matches.add(t)
        if len(close_matches) > 5:
            break
    for x in ranked_utterances:
        line, wc = self.make_utterance_markup(x["utt"], budget - size,
                                              exact_matches, close_matches)
        size += wc
        markup_lines.append(line)
        if size >= budget:
            break
    found_terms = set([t.word.lower() for t in exact_matches])
    missing_terms = set([
        t.word.lower() for t in query.content.tokens
        if t.word.lower() not in found_terms
    ])
    instr = get_instructions(query.string, found_terms, missing_terms)
    return "\n".join(markup_lines), instr
def __call__(self, doc, budget=100):
    """Render word-match markup with per-query-term accounting: track
    which query terms were actually found (exact or stem) across the
    document, header only the found terms, and back off to
    ``ConceptV1`` when nothing matches.

    Args:
        doc: annotated document, iterable over utterances, with an
            ``annotations`` mapping.
        budget: maximum number of words of markup to emit.

    Returns:
        (markup, instr) on a match; otherwise whatever
        ``ConceptV1()(doc, budget=budget)`` returns.
    """
    query = doc.annotations["QUERY"]
    exact_match_sentids = set()
    stem_match_sentids = set()
    exact_matches = []
    stem_matches = []
    # Per-sentence per-query-term match counts, accumulated across both
    # exact and stem annotations.
    query_terms_found = []
    for i, utt in enumerate(doc):
        sent_ann = {
            k: doc.annotations[k]["annotation"][i]
            for k in doc.annotations.keys()
            if k != "QUERY" and doc.annotations[k] is not None
        }
        best_exact_trans = self.find_best_translation(sent_ann)
        exact_ann = best_exact_trans + ".exact_match"
        exact_score = doc.annotations[exact_ann]["annotation"][i]\
            ["sentence"]["sum"]
        query_terms_found.append(
            np.array(
                doc.annotations[exact_ann]["annotation"][i]\
                ["word"]["matches"]
            ).sum(axis=0)
        )
        best_stem_trans = self.find_best_translation(sent_ann, stem=True)
        stem_ann = best_stem_trans + ".stem_match"
        stem_score = doc.annotations[stem_ann]["annotation"][i]\
            ["sentence"]["sum"]
        query_terms_found.append(
            np.array(
                doc.annotations[stem_ann]["annotation"][i]\
                ["word"]["matches"]
            ).sum(axis=0)
        )
        # Exact match takes priority over stem match for a sentence.
        if exact_score > 0:
            exact_matches.append((i, exact_score, best_exact_trans))
            exact_match_sentids.add(i)
        elif stem_score > 0:
            stem_matches.append((i, stem_score, best_stem_trans))
            stem_match_sentids.add(i)
    # Boolean vector: was each query term found anywhere in the doc?
    query_terms_found = np.stack(query_terms_found).sum(axis=0) > 0
    # NOTE(review): assumes the match-vector columns align with the
    # stopword-filtered query tokens — confirm against the annotator.
    stopped_terms = [x.word for x in query.content.tokens
                     if x.word.lower() not in en_stopwords]
    found_terms = [x for x, f in zip(stopped_terms, query_terms_found)
                   if f > 0]
    exact_matches = sorted(exact_matches, key=lambda x: x[1], reverse=True)
    stem_matches = sorted(stem_matches, key=lambda x: x[1], reverse=True)
    markup_lines = []
    size = 0
    if len(found_terms) > 0:
        header_line, wc = make_word_match_header(query, found_terms)
        size += wc
        markup_lines.append(header_line)
    if len(exact_matches) > 0:
        for i, score, best_trans in exact_matches:
            line, wc = self.make_utterance_markup(doc, i, best_trans,
                                                  budget - size, query)
            markup_lines.append(line)
            size += wc
            if size >= budget:
                break
    # Stem matches only consume whatever budget the exact matches left.
    if len(stem_matches) > 0 and size < budget:
        for i, score, best_trans in stem_matches:
            line, wc = self.make_utterance_markup(doc, i, best_trans,
                                                  budget - size, query,
                                                  stem=True)
            markup_lines.append(line)
            size += wc
            if size >= budget:
                break
    if len(markup_lines) > 0:
        missing_terms = [t.word.lower() for t in query.content.tokens
                         if t.word.lower() not in found_terms and \
                         t.word.lower() not in en_stopwords]
        instr = get_instructions(query.string, found_terms, missing_terms)
        return "\n".join(markup_lines), instr
    else:
        # No matches anywhere: back off to the concept-based markup.
        return ConceptV1()(doc, budget=budget)
def __call__(self, doc, budget=100):
    """Render "lexicalv2" markup: utterances ordered by merged annotator
    scores, highlighting exact/stem matches, backing off to
    ``ConceptV2`` when every score is non-positive.

    Args:
        doc: annotated document exposing ``annotations``,
            ``utterances``, ``mode`` and ``md5``.
        budget: maximum number of words of markup to emit.

    Returns:
        (markup, instr, meta); on backoff, the ConceptV2 result with
        its meta ``markup`` field rewritten to
        ``"lexicalv2-backoff-conceptv2"``.
    """
    query = doc.annotations["QUERY"]
    best_translations = self.get_best_translations(doc)
    score_dict = {
        annotator_key(ann): self.get_scores(doc, ann)
        for ann in self.annotators
        if doc.annotations[ann[0]] is not None
    }
    # Sort keys so the score-list order (and thus the merge) is stable.
    keys = sorted(score_dict.keys())
    scores = [score_dict[key] for key in keys]
    ordered_indices, points = merge_scores(scores, return_points=True)
    # Total score per utterance, used for the backoff test and the
    # zero-score early exit below.
    scores = np.array(scores).sum(axis=0)
    if all(scores <= 0):
        # Nothing matched lexically: back off to concept markup but
        # record that this path was a backoff.
        result = ConceptV2(*self.default_args,
                           **self.default_kwargs)(doc, budget=budget)
        result[2]["markup"] = "lexicalv2-backoff-conceptv2"
        return result
    markup_lines = []
    found_terms = self.get_found_words(doc, best_translations, query)
    if self.header:
        header_line, size = make_word_match_header(query, found_terms)
        markup_lines.append(header_line)
    else:
        size = 0
    meta = {
        "translation": [],
        "markup": "lexicalv2",
        "utterance_ids": [],
        "source_offsets": [],
        "mode": doc.mode,
        "source_md5": doc.md5
    }
    for idx in ordered_indices:
        # Indices are score-ordered, so the first zero means the rest
        # are zero too.
        if scores[idx] == 0:
            break
        trans = best_translations[idx]
        utt = doc.utterances[idx]["translations"][trans]
        src_utt = doc.utterances[idx]["source"]
        exact_matches = self.get_exact_matches(doc, idx, trans)
        stem_matches = self.get_stem_matches(doc, idx, trans)
        # A token that matches exactly should not also count as a stem match.
        close_matches = stem_matches - exact_matches
        if self.cutoffs is not None:
            # Per-mode/per-translation cutoff, still bounded by budget.
            max_length = min(self.cutoffs[doc.mode][trans], budget - size)
        else:
            max_length = budget - size
        line, wc = self.make_utterance_markup(utt, max_length,
                                              exact_matches, close_matches)
        size += wc
        markup_lines.append(line)
        meta["translation"].append(trans)
        meta["utterance_ids"].append(int(idx))
        meta["source_offsets"].append(src_utt.offsets)
        if size >= budget:
            break
    markup = "\n".join(markup_lines)
    missing_terms = [t.word.lower() for t in query.content.tokens
                     if t.word.lower() not in found_terms and \
                     t.word.lower() not in en_stopwords]
    instr = get_instructions(query.string, found_terms, missing_terms)
    return markup, instr, meta
def __call__(self, doc, budget=100):
    """Render "morphv1" markup: utterances ordered by morphological match
    score, with matched tokens wrapped in rel_exact_match /
    rel_close_match spans; backs off to ``ConceptV2`` when no utterance
    has a positive score.

    Args:
        doc: annotated document exposing ``annotations``,
            ``utterances``, ``mode`` and ``md5``.
        budget: maximum number of words of markup to emit.

    Returns:
        (markup, instructions, meta) — joined markup, the instruction
        string from ``get_instructions``, and a metadata dict; on
        backoff, the ConceptV2 result.

    Fixes:
        * ``instructions`` was referenced in the return but its
          ``get_instructions`` call had been commented out, causing a
          guaranteed ``NameError``; the call is restored.
        * A large block of unreachable code after the return (which
          also referenced an undefined name ``translations``) has been
          removed.
    """
    query = doc.annotations["QUERY"]
    best_translations = self.get_best_translations(doc)
    scores = self.get_scores(doc)
    # Utterance indices, highest score first.
    I = np.argsort(scores)[::-1]
    if scores[I[0]] == 0:
        # No morphological match anywhere: back off to concept markup.
        return ConceptV2(*self.default_args,
                         **self.default_kwargs)(doc, budget=budget)
    if self.header:
        header = "Match found for {}, check number/tense/meaning:".format(
            " ".join([t.word for t in query.content.tokens]))
        size = len(header.split())
        markup_lines = ["<h1>{}</h1>".format(header)]
    else:
        markup_lines = []
        size = 0
    meta = {
        "translation": [],
        "markup": "morphv1",
        "utterance_ids": [],
        "source_offsets": [],
        "mode": doc.mode,
        "source_md5": doc.md5
    }
    for idx in I:
        score = scores[idx]
        # Indices are score-ordered, so the first zero ends the loop.
        if score == 0:
            break
        trans = best_translations[idx]
        sent = doc.utterances[idx]["translations"][trans]
        src_utt = doc.utterances[idx]["source"]
        tokens = [token.word for token in sent.tokens]
        mname = self.translation_annotators[trans][0][0]
        # Wrap each matched token. Placeholder class names (with the
        # "span_class" underscore) survive detokenization intact and are
        # rewritten to real HTML below.
        for m in doc.annotations[mname]["annotation"][idx]:
            for j, s in enumerate(m["match_quality"],
                                  m["token_position"]):
                if s >= 1:
                    tokens[j] = '<span_class="RELEXACTMATCH">' \
                        + tokens[j] + '</span>'
                else:
                    tokens[j] = '<span_class="RELEXACT">' \
                        + tokens[j] + '</span>'
        line = detokenize(" ".join(tokens))
        wc = len(line.split())
        if wc + size > budget:
            # Truncate to the remaining budget and mark the cut.
            wc = budget - size
            line = " ".join(line.split()[:wc]) + "..."
        size += wc
        # Order matters: RELEXACTMATCH must be rewritten before its
        # prefix RELEXACT.
        line = line.replace("RELEXACTMATCH", "rel_exact_match")
        line = line.replace("RELEXACT", "rel_close_match")
        line = line.replace("span_class", "span class")
        markup_lines.append("<p>{}</p>".format(line))
        meta["translation"].append(trans)
        meta["utterance_ids"].append(int(idx))
        meta["source_offsets"].append(src_utt.offsets)
        if size >= budget:
            break
    found_terms = self.get_found_words(doc, best_translations, query)
    missing_terms = [t.word.lower() for t in query.content.tokens
                     if t.word.lower() not in found_terms \
                     and t.word.lower() not in en_stopwords]
    # FIX: this call was commented out while `instructions` was still
    # used in the return, which raised NameError at runtime.
    instructions = get_instructions(query.string, found_terms,
                                    missing_terms)
    return "\n".join(markup_lines), instructions, meta