Code Example #1
    def __call__(self, doc, budget=100, make_header=True):
        query = doc.annotations["QUERY"]

        best_translations = self.get_best_translations(doc)

        score_dict = {
            annotator_key(ann): self.get_scores(doc, ann)
            for ann in self.annotators if doc.annotations[ann[0]] is not None
        }

        keys = sorted(score_dict.keys())
        scores = [score_dict[key] for key in keys]
        ordered_indices, points = merge_scores(scores, return_points=True)

        if self.header:
            header, size = make_relevant_header(query)
            markup_lines = [header]
        else:
            markup_lines = []
            size = 0

        meta = {
            "translation": [],
            "markup": "conceptv2",
            "utterance_ids": [],
            "source_offsets": [],
            "mode": doc.mode,
            "source_md5": doc.md5
        }
        for idx in ordered_indices:
            trans = best_translations[idx]
            utt = doc.utterances[idx]["translations"][trans]
            src_utt = doc.utterances[idx]["source"]

            exact_matches = self.get_exact_matches(doc, idx, trans)
            stem_matches = self.get_stem_matches(doc, idx, trans)
            stem_matches = stem_matches - exact_matches
            soft_matches = self.get_soft_matches(doc, idx, trans)
            close_matches = stem_matches | soft_matches

            line, wc = self.make_utterance_markup(utt, budget - size,
                                                  exact_matches, close_matches)

            size += wc
            markup_lines.append(line)
            meta["translation"].append(trans)
            meta["utterance_ids"].append(int(idx))
            meta["source_offsets"].append(src_utt.offsets)
            if size >= budget:
                break

        markup = "\n".join(markup_lines)

        found_terms = self.get_found_words(doc, best_translations, query)
        missing_terms = [t.word.lower() for t in query.content.tokens
                         if t.word.lower() not in found_terms
                         and t.word.lower() not in en_stopwords]
        instr = get_instructions(query.string, found_terms, missing_terms)

        return markup, instr, meta
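All six examples share the same greedy, budget-limited assembly loop: utterances are visited in relevance order and appended until the word budget is exhausted, truncating the line that overflows. A self-contained sketch of just that loop, with plain strings standing in for the real utterance and markup objects:

    def assemble(ranked_lines, budget=100):
        """Append ranked lines until the word budget runs out."""
        markup_lines, size = [], 0
        for line in ranked_lines:
            words = line.split()
            if size + len(words) > budget:
                # Truncate the overflowing line, as the examples above do.
                words = words[:budget - size]
                line = " ".join(words) + "..."
            markup_lines.append("<p>{}</p>".format(line))
            size += len(words)
            if size >= budget:
                break
        return "\n".join(markup_lines), size

    ranked = ["first most relevant sentence", "second sentence"]
    markup, used = assemble(ranked, budget=5)
    print(markup)  # the second <p> is cut to one word plus "..."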
Code Example #2
    def __call__(self, doc, budget=100):
        query = doc.annotations["QUERY"]

        exact_match_sentids = set()
        stem_match_sentids = set()
        exact_matches = []
        stem_matches = []

        for i, utt in enumerate(doc):
            sent_ann = {
                k: doc.annotations[k]["annotation"][i]
                for k in doc.annotations.keys()
                if k != "QUERY" and doc.annotations[k] is not None
            }
            
            best_exact_trans = self.find_best_translation(sent_ann)
            exact_ann = best_exact_trans + ".exact_match"
            exact_score = doc.annotations[exact_ann]["annotation"][i]\
                ["sentence"]["sum"]

            best_stem_trans = self.find_best_translation(sent_ann, stem=True)
            stem_ann = best_stem_trans + ".stem_match"
            stem_score = doc.annotations[stem_ann]["annotation"][i]\
                ["sentence"]["sum"]

            if exact_score > 0:
                exact_matches.append((i, exact_score, best_exact_trans))
                exact_match_sentids.add(i)
                
            elif stem_score > 0:
                stem_matches.append((i, stem_score, best_stem_trans))
                stem_match_sentids.add(i)

        exact_matches = sorted(exact_matches, key=lambda x: x[1], reverse=True)
        stem_matches = sorted(stem_matches, key=lambda x: x[1], reverse=True)
        markup_lines = []

        size = 0

        if len(exact_matches) > 0 or len(stem_matches) > 0:
            
            header_line, wc = make_word_match_header(
                query, [x.word for x in query.content.tokens])
            size += wc
            markup_lines.append(header_line) 

        if len(exact_matches) > 0:
            for i, score, best_trans in exact_matches:
                line, wc = self.make_utterance_markup(doc, i, best_trans, 
                                                      budget - size)
                markup_lines.append(line)
                size += wc
                if size >= budget:
                    break

        if len(stem_matches) > 0 and size < budget:
            
            for i, score, best_trans in stem_matches:
                line, wc = self.make_utterance_markup(doc, i, best_trans,
                                                      budget - size, stem=True)
                markup_lines.append(line)
                size += wc
                if size >= budget:
                    break

        if len(markup_lines) > 0:
            if len(exact_matches) > 0:
                instructions = get_instructions(
                    query.string, [query.content.tokens[0].word], [])
            else:
                instructions = get_instructions(
                    query.string, [], [query.content.tokens[0].word])

            return "\n".join(markup_lines), instructions
        else:
            return ConceptV1()(doc, budget=budget)
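When no exact or stem matches are found anywhere in the document, the method backs off to `ConceptV1()(doc, budget=budget)`. A self-contained illustration of that backoff idiom (try a specific strategy first, fall back to a more general one when it yields nothing); the toy strategies below are stand-ins, not the project's classes:

    class Backoff:
        """Return the first strategy result whose markup is non-empty."""
        def __init__(self, *strategies):
            self.strategies = strategies

        def __call__(self, doc, budget=100):
            for strategy in self.strategies:
                result = strategy(doc, budget=budget)
                if result[0]:  # non-empty markup string
                    return result
            return "", ""

    lexical = lambda doc, budget=100: ("", "")                   # no matches
    concept = lambda doc, budget=100: ("<p>fallback</p>", "read this")
    print(Backoff(lexical, concept)("doc"))  # ('<p>fallback</p>', 'read this')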
Code Example #3
    def __call__(self, doc, budget=100, make_header=True):
        query = doc.annotations["QUERY"]

        best_translations = []
        scores = []
        for i in range(len(doc.utterances)):

            trans, trans_score = self.get_best_translation(doc.annotations, i)
            best_translations.append(trans)
            scores.append(trans_score)

        scores = np.stack(scores).T
        ranks = np.argsort(scores, axis=1)[:, ::-1]
        merged_ranks = merge_rankings(ranks)

        markup_lines = []
        if make_header:
            header, header_size = make_relevant_header(query)
            markup_lines.append(header)
            size = header_size
        else:
            size = 0

        ranked_utterances = []
        for i in merged_ranks:
            best_trans = best_translations[i]
            utt = doc.utterances[i]["translations"][best_trans]
            num_words = len(
                detokenize(" ".join([x.word for x in utt.tokens])).split())
            size += num_words
            ranked_utterances.append({
                "index": i,
                "utt": utt,
                "trans": best_trans
            })
            if size > budget:
                break
        ranked_utterances.sort(key=lambda x: x["index"])
        size = header_size if make_header else 0

        exact_matches = set()
        close_matches = set()
        t2s = {}
        for x in ranked_utterances:
            tokens = x["utt"].tokens
            exact_ann = doc.annotations[x["trans"] + ".exact_match"]
            x_matches = exact_ann["annotation"][x["index"]]["word"]["matches"]
            sim_ann = doc.annotations[
                x["trans"] + ".glove42Bsim.content_semcons"]
            c_matches = sim_ann["annotation"][x["index"]]["word"]["sims"]
            for t, m, sim in zip(tokens, x_matches, c_matches):
                if np.sum(m) > 0:
                    exact_matches.add(t)
                if t.pos in ["NN", "VB"]:
                    t2s[t] = sim[0]
        sim_toks = sorted(t2s, key=lambda x: t2s[x], reverse=True)
        for t in sim_toks:
            if t in exact_matches: continue
            close_matches.add(t)
            if len(close_matches) > 5:
                break

        for x in ranked_utterances:
            line, wc = self.make_utterance_markup(x["utt"], budget - size,
                                                  exact_matches, close_matches)
            size += wc
            markup_lines.append(line)
            if size >= budget:
                break

        found_terms = {t.word.lower() for t in exact_matches}
        missing_terms = {
            t.word.lower() for t in query.content.tokens
            if t.word.lower() not in found_terms
        }

        instr = get_instructions(query.string, found_terms, missing_terms)
        return "\n".join(markup_lines), instr
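`merge_rankings` is not shown in this snippet; a plausible stand-in that merges the per-annotator rankings by Borda count is sketched below. This is an assumption about its behavior, not the project's actual implementation:

    import numpy as np

    def merge_rankings(ranks):
        """ranks: (n_annotators, n_utterances), each row an ordering of
        utterance indices from best to worst. Returns one merged ordering,
        best first, by summing positional (Borda) points."""
        n_annotators, n_utts = ranks.shape
        points = np.zeros(n_utts)
        for row in ranks:
            for position, utt_idx in enumerate(row):
                points[utt_idx] += n_utts - position  # earlier rank, more points
        return np.argsort(points)[::-1]

    ranks = np.array([[2, 0, 1],
                      [2, 1, 0]])
    print(merge_rankings(ranks))  # utterance 2 is ranked first by both rows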
Code Example #4
File: lexicalmultiwordv1.py  Project: eturcan/scripts
    def __call__(self, doc, budget=100):
        query = doc.annotations["QUERY"]

        exact_match_sentids = set()
        stem_match_sentids = set()
        exact_matches = []
        stem_matches = []

        query_terms_found = []

        for i, utt in enumerate(doc):
            sent_ann = {
                k: doc.annotations[k]["annotation"][i]
                for k in doc.annotations.keys() 
                if k != "QUERY" and doc.annotations[k] is not None
            }
            
            best_exact_trans = self.find_best_translation(sent_ann)
            exact_ann = best_exact_trans + ".exact_match"
            exact_score = doc.annotations[exact_ann]["annotation"][i]\
                ["sentence"]["sum"]
            query_terms_found.append(
                np.array(
                    doc.annotations[exact_ann]["annotation"][i]\
                        ["word"]["matches"]
                ).sum(axis=0)
            )


            best_stem_trans = self.find_best_translation(sent_ann, stem=True)
            stem_ann = best_stem_trans + ".stem_match"
            stem_score = doc.annotations[stem_ann]["annotation"][i]\
                ["sentence"]["sum"]
            query_terms_found.append(
                np.array(
                    doc.annotations[stem_ann]["annotation"][i]\
                        ["word"]["matches"]
                ).sum(axis=0)
            )

            if exact_score > 0:
                exact_matches.append((i, exact_score, best_exact_trans))
                exact_match_sentids.add(i)
                
            elif stem_score > 0:
                stem_matches.append((i, stem_score, best_stem_trans))
                stem_match_sentids.add(i)

        query_terms_found = np.stack(query_terms_found).sum(axis=0) > 0
        stopped_terms = [x.word for x in query.content.tokens 
                         if x.word.lower() not in en_stopwords]
        found_terms = [x for x, f in zip(stopped_terms, query_terms_found)
                       if f > 0]

        exact_matches = sorted(exact_matches, key=lambda x: x[1], reverse=True)
        stem_matches = sorted(stem_matches, key=lambda x: x[1], reverse=True)
        markup_lines = []


        size = 0
        if len(found_terms) > 0:
            header_line, wc = make_word_match_header(query, found_terms)
            size += wc
            markup_lines.append(header_line)
            
        if len(exact_matches) > 0:
           
            for i, score, best_trans in exact_matches:
                line, wc = self.make_utterance_markup(doc, i, best_trans, 
                                                      budget - size, query)
                markup_lines.append(line)
                size += wc
                if size >= budget:
                    break

        if len(stem_matches) > 0 and size < budget:
            
            for i, score, best_trans in stem_matches:
                line, wc = self.make_utterance_markup(doc, i, best_trans,
                                                      budget - size, query,
                                                      stem=True)
                markup_lines.append(line)
                size += wc
                if size >= budget:
                    break

        if len(markup_lines) > 0:
            missing_terms = [t.word.lower() for t in query.content.tokens
                             if t.word.lower() not in found_terms
                             and t.word.lower() not in en_stopwords]
            instr = get_instructions(query.string, found_terms, missing_terms)
            return "\n".join(markup_lines), instr
        else:
            return ConceptV1()(doc, budget=budget)
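The `query_terms_found` bookkeeping stacks per-sentence match-count vectors (one entry per query term) and reduces them to a boolean "matched anywhere in the document" vector. The same reduction in isolation, with toy match matrices shaped like the `["word"]["matches"]` annotation:

    import numpy as np

    # One appended vector per sentence pass; one column per query term.
    query_terms_found = [
        np.array([[1, 0, 0],   # sentence 0: term 0 matched once
                  [0, 0, 0]]).sum(axis=0),
        np.array([[0, 0, 0],
                  [0, 2, 0]]).sum(axis=0),  # sentence 1: term 1 matched twice
    ]
    found = np.stack(query_terms_found).sum(axis=0) > 0
    print(found)  # [ True  True False] -- term 2 never matched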
Code Example #5
    def __call__(self, doc, budget=100):
        query = doc.annotations["QUERY"]

        best_translations = self.get_best_translations(doc)

        score_dict = {
            annotator_key(ann): self.get_scores(doc, ann)
            for ann in self.annotators if doc.annotations[ann[0]] is not None
        }

        keys = sorted(score_dict.keys())
        scores = [score_dict[key] for key in keys]
        ordered_indices, points = merge_scores(scores, return_points=True)
        scores = np.array(scores).sum(axis=0)

        if all(scores <= 0):
            result = ConceptV2(*self.default_args,
                               **self.default_kwargs)(doc, budget=budget)
            result[2]["markup"] = "lexicalv2-backoff-conceptv2"
            return result

        markup_lines = []
        found_terms = self.get_found_words(doc, best_translations, query)
        if self.header:
            header_line, size = make_word_match_header(query, found_terms)
            markup_lines.append(header_line)
        else:
            size = 0
        meta = {
            "translation": [],
            "markup": "lexicalv2",
            "utterance_ids": [],
            "source_offsets": [],
            "mode": doc.mode,
            "source_md5": doc.md5
        }

        for idx in ordered_indices:
            if scores[idx] == 0:
                break
            trans = best_translations[idx]
            utt = doc.utterances[idx]["translations"][trans]
            src_utt = doc.utterances[idx]["source"]

            exact_matches = self.get_exact_matches(doc, idx, trans)
            stem_matches = self.get_stem_matches(doc, idx, trans)
            close_matches = stem_matches - exact_matches

            if self.cutoffs is not None:
                max_length = min(self.cutoffs[doc.mode][trans], budget - size)
            else:
                max_length = budget - size
            line, wc = self.make_utterance_markup(utt, max_length,
                                                  exact_matches, close_matches)

            size += wc
            markup_lines.append(line)
            meta["translation"].append(trans)
            meta["utterance_ids"].append(int(idx))
            meta["source_offsets"].append(src_utt.offsets)

            if size >= budget:
                break

        markup = "\n".join(markup_lines)
        missing_terms = [t.word.lower() for t in query.content.tokens
                         if t.word.lower() not in found_terms
                         and t.word.lower() not in en_stopwords]
        instr = get_instructions(query.string, found_terms, missing_terms)

        return markup, instr, meta
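`self.cutoffs`, when present, caps each utterance's rendered length per document mode and translation system before the remaining budget applies. A toy illustration of that clamping; the table's keys and values are assumptions, not values from the project:

    # Assumed shape: mode -> translation system -> max words per utterance.
    cutoffs = {"text": {"nmt": 25, "smt": 15},
               "audio": {"nmt": 20, "smt": 10}}

    def max_length(mode, trans, budget, size):
        """The smaller of the per-system cutoff and the remaining budget."""
        return min(cutoffs[mode][trans], budget - size)

    print(max_length("text", "smt", budget=100, size=92))  # 8: budget wins
    print(max_length("text", "smt", budget=100, size=0))   # 15: cutoff wins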
Code Example #6
File: morphv1.py  Project: eturcan/scripts
    def __call__(self, doc, budget=100):
        query = doc.annotations["QUERY"]

        best_translations = self.get_best_translations(doc)
        scores = self.get_scores(doc)

        I = np.argsort(scores)[::-1]
        if scores[I[0]] == 0:
            return ConceptV2(*self.default_args,
                             **self.default_kwargs)(doc, budget=budget)

        if self.header:
            header = "Match found for {}, check number/tense/meaning:".format(
                " ".join([t.word for t in query.content.tokens]))
            size = len(header.split())
            markup_lines = ["<h1>{}</h1>".format(header)]
        else:
            markup_lines = []
            size = 0
        meta = {
            "translation": [],
            "markup": "morphv1",
            "utterance_ids": [],
            "source_offsets": [],
            "mode": doc.mode,
            "source_md5": doc.md5
        }
        for idx in I:
            score = scores[idx]
            if score == 0:
                break
            trans = best_translations[idx]
            sent = doc.utterances[idx]["translations"][trans]
            src_utt = doc.utterances[idx]["source"]
            tokens = [token.word for token in sent.tokens]
            mname = self.translation_annotators[trans][0][0]
            for m in doc.annotations[mname]["annotation"][idx]:
                for j, s in enumerate(m["match_quality"], m["token_position"]):
                    if s >= 1:
                        tokens[j] = '<span_class="RELEXACTMATCH">' \
                            + tokens[j] + '</span>'
                    else:
                        tokens[j] = '<span_class="RELEXACT">' \
                            + tokens[j] + '</span>'

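            # The placeholders keep each tag a single whitespace-free token
            # ("span_class", no space), so detokenize() and the word count
            # below treat a marked-up word as one word; the placeholders are
            # swapped for real HTML after the length check.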
            line = detokenize(" ".join(tokens))
            wc = len(line.split())
            if wc + size > budget:
                wc = budget - size
                line = " ".join(line.split()[:wc]) + "..."

            size += wc
            line = line.replace("RELEXACTMATCH", "rel_exact_match")
            line = line.replace("RELEXACT", "rel_close_match")
            line = line.replace("span_class", "span class")
            markup_lines.append("<p>{}</p>".format(line))
            meta["translation"].append(trans)
            meta["utterance_ids"].append(int(idx))
            meta["source_offsets"].append(src_utt.offsets)
            if size >= budget:
                break

        found_terms = self.get_found_words(doc, best_translations, query)
        missing_terms = [t.word.lower() for t in query.content.tokens
                         if t.word.lower() not in found_terms
                         and t.word.lower() not in en_stopwords]
        instructions = get_instructions(query.string, found_terms,
                                        missing_terms)
        return "\n".join(markup_lines), instructions, meta

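        # NOTE: everything below this point is unreachable -- the method has
        # already returned above. It appears to be an earlier draft of the
        # same markup logic, kept in place.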
        pos = "np"

        matches = []
        match_qualities = []
        for i, utt in enumerate(doc):

            tr2ann = {}
            tr2score = {}
            for trans in translations:
                k = trans + ".morph_match_" + pos
                ann = doc.annotations[k]["annotations"][i]
                if len(ann) > 0:
                    tr2ann[trans] = ann
                    tr2score[trans] = max([x["match_score"] for x in ann])

            if len(tr2ann) == 0:
                continue

            srt_trans = sorted(tr2ann.keys(),
                               key=lambda x: tr2score[x],
                               reverse=True)
            if len(srt_trans) > 1 \
                    and tr2score[srt_trans[0]] == tr2score[srt_trans[1]]:
                if "nmt" in srt_trans[0]:
                    best_trans = srt_trans[0]
                else:
                    best_trans = srt_trans[1]
            else:
                best_trans = srt_trans[0]

            for ann in tr2ann[best_trans]:
                match_qualities.append([x >= 1 for x in ann["match_quality"]])
            matches.append({
                "sent": i,
                "trans": best_trans,
                "anns": tr2ann[best_trans],
                "score": tr2score[best_trans],
                "exact_morph": any(x["exact_morph"] for x in tr2ann[best_trans])
            })

        # The sort is stable: sorting by score first and by exact_morph second
        # puts exact matches first (by highest score), then soft matches
        # (by highest score).
        matches.sort(key=lambda x: x["score"], reverse=True)
        matches.sort(key=lambda x: x["exact_morph"], reverse=True)

        if len(match_qualities) == 0:
            result = ConceptV2(*self.default_args,
                               **self.default_kwargs)(doc, budget=budget)
            result[2]["markup"] = "morph-backoff-conceptv2"
            return result

        found_term_ind = np.array(match_qualities).sum(axis=0)
        found_terms = [
            q.word for q, ind in zip(query.content.tokens, found_term_ind)
            if ind
        ]

        markup_lines = []

        header, size = make_word_match_header(query, found_terms)
        markup_lines.append(header)

        for match in matches:
            sent = doc.utterances[match["sent"]]["translations"][
                match["trans"]]
            tokens = [token.word for token in sent.tokens]
            for m in match["anns"]:
                for j, s in enumerate(m["match_quality"], m["token_position"]):
                    if s >= 1:
                        tokens[j] = '<span_class="RELEXACTMATCH">' \
                            + tokens[j] + '</span>'
                    else:
                        tokens[j] = '<span_class="RELEXACT">' \
                            + tokens[j] + '</span>'

            line = detokenize(" ".join(tokens))
            wc = len(line.split())
            if wc + size > budget:
                wc = budget - size
                line = " ".join(line.split()[:wc]) + "..."

            size += wc
            line = line.replace("RELEXACTMATCH", "rel_exact_match")
            line = line.replace("RELEXACT", "rel_exact")
            line = line.replace("span_class", "span class")
            markup_lines.append("<p>{}</p>".format(line))
            if size >= budget:
                break

        missing_terms = [
            t.word.lower() for t in query.content.tokens
            if t.word.lower() not in found_terms
        ]

        instructions = get_instructions(query.string, found_terms,
                                        missing_terms)
        return "\n".join(markup_lines), instructions
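The highlighting loops above rely on `enumerate(iterable, start)`: passing the match's `token_position` as the start value makes `j` an absolute token index into the sentence. A self-contained illustration, using the class names from the live (reachable) version:

    tokens = ["the", "dogs", "were", "running", "home"]
    match = {"token_position": 1, "match_quality": [1.0, 0.4]}

    # enumerate's second argument offsets j to the absolute token index.
    for j, score in enumerate(match["match_quality"], match["token_position"]):
        tag = "rel_exact_match" if score >= 1 else "rel_close_match"
        tokens[j] = '<span class="{}">{}</span>'.format(tag, tokens[j])

    print(" ".join(tokens))
    # the <span class="rel_exact_match">dogs</span>
    # <span class="rel_close_match">were</span> running home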