Beispiel #1
0
    def handle_top_signatures(self, form):
        """Writes a table of the signatures with the highest aggregate score.

        Every (signature, category, parse) triple is scored with the metric
        chosen in the form. Scores are then summed per signature, counting
        only the first (i.e. highest-scoring) parse of each category for a
        given signature. At most 200 signatures are displayed.

        Args:
          form: request form. Its getvalue() supplies the sort metric and the
            signature type ("coarse" selects the coarse mapping, any other
            value the full one); it is also passed to fact_match_weights().
        """
        row_limit = 200
        weights = self.fact_match_weights(form)
        metric = form.getvalue("main_form_sort_metric")

        if form.getvalue("main_form_signature_type") == "coarse":
            signature_to_parses = browser_globals.coarse_signature_to_parse
        else:
            signature_to_parses = browser_globals.full_signature_to_parse

        # Score every triple, then order them best-first.
        ranked = [
            (signature, qid, category, parse,
             self.parse_score(category, parse, metric, weights))
            for signature, entries in signature_to_parses.iteritems()
            for qid, category, parse in entries
        ]
        ranked.sort(key=lambda entry: -entry[4])

        # Fold the ranked triples into one SignatureStats per signature.
        per_signature = {}
        counted = set()
        for signature, qid, category, parse, score in ranked:
            stats = per_signature.get(signature)
            if stats is None:
                stats = per_signature[signature] = SignatureStats()
            stats.example(qid, category, parse)

            # A category contributes to a signature's totals only once.
            pair = (signature, qid)
            if pair in counted:
                continue
            counted.add(pair)
            stats.members += len(category.members)
            stats.score += score
            stats.num += 1
            util.fact_matches_for_parse(parse, stats.fact_stats)

        # Keep the best 'row_limit' signatures by aggregate score.
        top_rows = sorted(per_signature.iteritems(),
                          key=lambda item: -item[1].score)[:row_limit]

        # Emit the table.
        self.write_main_table_header([
            "Signature", "Example Category", "Score", "#Members / #Categories"
        ], [t.name for t in FactMatchType])

        for signature, stats in top_rows:
            self._begin("tr")

            self._begin("td")
            self._form_anchor(signature, signature)
            self._end("td")

            self._begin("td")
            self._form_anchor(stats.example_category.name, stats.example_qid)
            self._end("td")

            self._cell(stats.score, numeric=True)
            ratio_text = "%d / %d" % (stats.members, stats.num)
            self._cell(ratio_text, numeric=True)

            self._separator(header=False)
            self.write_fact_match_counts(stats.fact_stats)
            self._end("tr")
        self._end("table")
Beispiel #2
0
  def handle_category(self, qid, form):
    """Writes the parses of one category as an HTML table, one row per parse.

    Rows are sorted by the selected metric, highest first. Each row shows the
    parse signature, its spans laid out over the category's tokens, the metric
    value, and whichever optional columns are enabled in the form.

    Args:
      qid: key of the category in browser_globals.category_frame.
      form: request form; its "main_form_*" values toggle the optional
        columns and select the signature type and sort metric.
    """
    def is_on(name):
      # An option counts as enabled only if its form value is exactly "on".
      return form.getvalue("main_form_" + name) == "on"

    # Various options.
    show_span_qid = is_on("show_span_qid")
    show_prelim_parse_scores = is_on("show_prelim_parse_scores")
    show_span_scores = is_on("show_span_scores")
    show_fact_matches = is_on("show_fact_matching_statistics")
    show_span_fact_matches = is_on("show_span_fact_match_stats")
    show_similar_categories = is_on("show_similar_categories")
    signature_type = form.getvalue("main_form_signature_type")
    metric = form.getvalue("main_form_sort_metric")
    fact_weights = self.fact_match_weights(form)

    frame = browser_globals.category_frame[qid]
    document = sling.Document(frame=frame.document)
    num_tokens = len(document.tokens)

    # Materialize the parses once; both the count and the rows need them.
    all_parses = list(frame("parse"))
    self._tag("div", "<b>%s = %s</b>: %d members, %d parses" % \
              (qid, frame.name, len(frame.members), len(all_parses)))
    self._br()

    # Write the parses in a tabular format. The preliminary scores column is
    # dropped when that score is already the sort metric.
    show_prelim_parse_scores &= metric != "prelim_parse_score"
    self.write_main_table_header(
      "Signature",
      [t.word for t in document.tokens],
      "Metric",
      "Prelim Scores" if show_prelim_parse_scores else None,
      [t.name for t in FactMatchType] if show_fact_matches else None,
      "Matching Categories" if show_similar_categories else None)

    # The signature -> parses mapping and the row limit for the "similar
    # categories" column do not depend on the parse, so resolve them once.
    signature_mapping = None
    similar_limit = 5
    if show_similar_categories:
      signature_mapping = browser_globals.full_signature_to_parse
      if signature_type == "coarse":
        signature_mapping = browser_globals.coarse_signature_to_parse

    # Each parse is written as one row.
    parses = [(parse, self.parse_score(frame, parse, metric, fact_weights)) \
      for parse in all_parses]
    parses.sort(key=lambda x: -x[1])
    for parse, metric_value in parses:
      signature = util.parse_signature(parse, signature_type)

      self._begin("tr")
      self._begin("td")
      self._form_anchor(signature, signature)
      self._end("td")
      self._separator(header=False)
      prev_span_end = -1
      for span in parse.spans:
        # Pad the tokens between the previous span and this one.
        for _ in xrange(prev_span_end + 1, span.begin):
          self._empty_cell()

        self._begin("td", colspan=span.end-span.begin, align='middle')
        text = util.span_signature(span, signature_type)
        if show_span_qid:
          text += " (" + str(span.qid) + ")"
        title = '.'.join([str(p) for p in span.pids]) + ' = ' + str(span.qid)
        if "name" in span.qid:
          # Look the slot up by its string key; the bare identifier `name`
          # used previously is not bound in this scope.
          title += " (" + span.qid["name"] + ")"
        self._tag("span", text, title=title)

        if show_span_scores and "prior" in span:
          self._br()
          self._text("%s = %0.4f" % ("prior", span.prior))

        if show_span_fact_matches:
          local_counts = util.fact_matches_for_span(span)
          self._br()
          self._begin("table class='span_fact_match'")
          self._begin("thead")
          for t in FactMatchType:
            self._tag("th", t.name)
          self._end("thead")
          self._begin("tr")
          self.write_fact_match_counts(local_counts)
          self._end(["tr", "table"])

        self._end("td")
        prev_span_end = span.end - 1

      # Pad the tokens after the last span.
      for _ in xrange(prev_span_end + 1, num_tokens):
        self._empty_cell()

      self._separator(header=False)
      # Exact ints are written as-is; everything else is shown to 4 decimals.
      if type(metric_value) is int:
        self._cell(metric_value)
      else:
        self._cell("%.4f" % metric_value)

      if show_prelim_parse_scores:
        self._separator(header=False)
        self._begin("td class='numeric'")
        for score_type in ["prior", "member_score", "cover"]:
          if score_type in parse:
            self._text("%s = %0.4f" % (score_type, parse[score_type]))
            self._br()
        if "score" in parse:
          self._color_text("Overall = %0.4f" % parse.score, "blue")
        self._end("td")

      if show_fact_matches:
        self._separator(header=False)
        total_fact_counts = util.fact_matches_for_parse(parse)
        self.write_fact_match_counts(total_fact_counts)

      if show_similar_categories:
        self._separator(header=False)
        self._begin("td")
        # List up to 'similar_limit' other categories sharing this signature,
        # each at most once.
        seen = set()
        for (other_qid, other_category, other_parse) in \
          signature_mapping[signature]:
          if len(seen) >= similar_limit:
            break
          if other_qid != qid and other_qid not in seen:
            seen.add(other_qid)
            self._text(other_category.name)
            self._form_anchor(" (= %s)" % other_qid, other_qid)
            self._text(" (%0.4f)" % other_parse.score)
            self._br()
        self._end("td")
      self._end("tr")
    self._end("table")
Beispiel #3
0
  def handle_signature(self, signature, categories, form):
    """Writes the page for one signature.

    The page has a summary line, a span-level fact-matching table, a form for
    generating a recordio file (only when the signature type is "full"), and
    a table of up to 200 categories that have a parse with this signature.

    Args:
      signature: the signature string being displayed.
      categories: sequence of (qid, category, parse) tuples for this
        signature; it is iterated once and its len() is reported.
      form: request form supplying the sort metric and signature type; also
        passed to fact_match_weights().
    """
    metric = form.getvalue("main_form_sort_metric")
    weights = self.fact_match_weights(form)
    signature_type = form.getvalue("main_form_signature_type")

    # Score all parses with this signature and order them best-first.
    ranked = [
      (qid, category, parse,
       self.parse_score(category, parse, metric, weights))
      for (qid, category, parse) in categories]
    ranked.sort(key=lambda entry: -entry[3])

    # Collect fact-matching statistics. Since 'ranked' is best-first, the
    # first parse seen for a category is its top one; later ones only bump
    # the per-category parse count.
    parses_per_category = defaultdict(int)
    overall_matches = util.MatchCounts()
    matches_by_span = defaultdict(util.MatchCounts)
    total_members = 0
    for qid, category, parse, _ in ranked:
      parses_per_category[qid] += 1
      if parses_per_category[qid] == 1:
        total_members += len(category.members)
        util.fact_matches_for_parse(parse, overall_matches)
        for span in parse.spans:
          span_key = util.span_signature(span, signature_type)
          util.fact_matches_for_span(span, matches_by_span[span_key])

    # Summary.
    summary = "<b>%s</b>: in %d (category, parse) pairs across %d categories" % \
      (signature, len(categories), len(parses_per_category))
    self._tag("div", summary)
    self._tag("b", "#Items across categories: ")
    self._text("%d (incl. possible duplicates)" % total_members)

    # Span-level table, closed by an "All" row with the overall counts.
    self._br()
    self._tag("b", "Span-level fact-matching statistics")
    self.write_main_table_header(
      ["Span Signature"],
      [t.name for t in FactMatchType])
    for span_key, span_counts in matches_by_span.iteritems():
      self._begin("tr")
      self._cell(span_key)
      self._separator(header=False)
      self.write_fact_match_counts(span_counts)
      self._end("tr")
    self._begin("tr")
    self._cell("All")
    self._separator(header=False)
    self.write_fact_match_counts(overall_matches)
    self._end("tr")
    self._end("table")

    # Give an option to generate a recordio file.
    if signature_type == "full":
      default_filename = \
        "local/data/e/wikicat/" + signature.replace("$", "_") + ".rec"
      # Tokens that start with '$' and contain a further '$' each get a
      # checkbox.
      span_tokens = [token for token in signature.split()
                     if token.startswith('$') and "$" in token[1:]]

      self._br()
      self._begin("table class='main_table'")
      self._begin("tr")
      self._begin("td")
      self._begin("form", id="recordio_form", method="POST", action="",
                  target="_blank")
      self._begin_end("input", type="hidden", name="main_form_functionality",
                      value="recordio")
      self._begin_end("input", type="hidden", name="recordio_signature",
                      id="recordio_signature", value=signature)
      self._tag("b", "Generate recordio for this signature")
      self._br()
      self._br()

      self._text("Filename: ")
      self._begin_end("input", type="text", size=100,
                      name="recordio_filename", value=default_filename)
      self._br()

      self._text("Category QIDs ('ALL' for all): ")
      self._begin_end("input", type="text", size=100, value="ALL",
                      name="recordio_categories")
      self._br()

      self._text("Generate facts for these types:")
      self._begin_end("input", type="text", size=100,
                      value="NEW,ADDITIONAL,SUBSUMED_BY_EXISTING",
                      name="recordio_match_types")
      self._br()

      self._text("Generate the following facts:")
      self._br()
      for index, token in enumerate(span_tokens):
        checkbox = "recordio_span%d" % index
        self._text("&nbsp;&nbsp;" + token + " ")
        self._begin_end("input", type="checkbox", name=checkbox, id=checkbox,
                        checked="on")
        self._br()
      self._begin_end("input", type="hidden", name="recordio_num_spans",
                      id="recordio_total_spans", value=len(span_tokens))
      self._begin_end("input", type="submit")
      self._end(["form", "td", "tr", "table"])

    # Per-category table: one row for each category's top parse.
    self._br()
    self._tag("b", "Categories with parses matching '" + signature + "'")
    row_limit = 200
    shown = set()
    self.write_main_table_header(
      ["Category", "Prelim parse score", "#Members", "Fact-matching score"],
      [t.name for t in FactMatchType])
    for qid, category, parse, _ in ranked:
      # One row is written per newly shown qid, so len(shown) is the row count.
      if len(shown) >= row_limit:
        break
      if qid in shown:
        continue
      shown.add(qid)

      self._begin("tr")
      self._begin("td")
      self._form_anchor(qid + ": " + category.name, qid)
      extra = parses_per_category[qid] - 1
      if extra > 0:
        plural = "" if extra == 1 else "s"
        self._text(" (%d more parse%s)" % (extra, plural))
      self._end("td")
      self._cell(parse.score, numeric=True)
      self._cell(len(category.members), numeric=True)
      fact_score = self.parse_fact_score(parse, weights)
      self._cell("%.4f" % fact_score, True)

      self._separator(header=False)
      self.write_fact_match_counts(util.fact_matches_for_parse(parse))
      self._end("tr")
    self._end("table")
Beispiel #4
0
 def parse_fact_score(self, parse, weights):
   """Returns the weighted sum of the fact-match counts of 'parse'.

   Args:
     parse: parse passed to util.fact_matches_for_parse() (with
       max_examples=0).
     weights: mapping indexed by match type, giving the weight of each count.

   Returns:
     Sum of count * weights[match_type] over all match types, as a float
     (0.0 when there are no counts).
   """
   tallies = util.fact_matches_for_parse(parse, max_examples=0).counts
   return sum(
     (tally * weights[kind] for kind, tally in tallies.iteritems()), 0.0)
Beispiel #5
0
    def handle_signature(self, signature, categories, form):
        """Writes the page for one signature.

        The page has a summary line, a span-level fact-matching table, and a
        table of up to 200 categories that have a parse with this signature.

        Args:
          signature: the signature string being displayed.
          categories: sequence of (qid, category, parse) tuples for this
            signature; it is iterated once and its len() is reported.
          form: request form supplying the sort metric and signature type;
            also passed to fact_match_weights().
        """
        metric = form.getvalue("main_form_sort_metric")
        weights = self.fact_match_weights(form)
        signature_type = form.getvalue("main_form_signature_type")

        # Score all parses with this signature and order them best-first.
        ranked = [
            (qid, category, parse,
             self.parse_score(category, parse, metric, weights))
            for (qid, category, parse) in categories
        ]
        ranked.sort(key=lambda entry: -entry[3])

        # Collect fact-matching statistics. Since 'ranked' is best-first, the
        # first parse seen for a category is its top one; later ones only
        # bump the per-category parse count.
        parses_per_category = defaultdict(int)
        overall_matches = defaultdict(int)
        matches_by_span = defaultdict(lambda: defaultdict(int))
        total_members = 0
        for qid, category, parse, _ in ranked:
            parses_per_category[qid] += 1
            if parses_per_category[qid] == 1:
                total_members += len(category.members)
                util.fact_matches_for_parse(parse, overall_matches)
                for span in parse.spans:
                    span_key = util.span_signature(span, signature_type)
                    util.fact_matches_for_span(span, matches_by_span[span_key])

        # Summary.
        summary = \
          "<b>%s</b>: in %d (category, parse) pairs across %d categories" % \
          (signature, len(categories), len(parses_per_category))
        self._tag("div", summary)
        self._tag("b", "#Items across categories: ")
        self._text("%d (incl. possible duplicates)" % total_members)

        # Span-level table, closed by an "All" row with the overall counts.
        self._br()
        self._tag("b", "Span-level fact-matching statistics")
        self.write_main_table_header(["Span Signature"],
                                     [t.name for t in FactMatchType])
        for span_key, span_counts in matches_by_span.iteritems():
            self._begin("tr")
            self._cell(span_key)
            self._separator(header=False)
            self.write_fact_match_counts(span_counts)
            self._end("tr")
        self._begin("tr")
        self._cell("All")
        self._separator(header=False)
        self.write_fact_match_counts(overall_matches)
        self._end("tr")
        self._end("table")

        # Per-category table: one row for each category's top parse.
        self._br()
        self._tag("b", "Categories with parses matching '" + signature + "'")
        row_limit = 200
        shown = set()
        self.write_main_table_header([
            "Category", "Prelim parse score", "#Members", "Fact-matching score"
        ], [t.name for t in FactMatchType])
        for qid, category, parse, _ in ranked:
            # One row is written per newly shown qid, so len(shown) is the
            # row count.
            if len(shown) >= row_limit:
                break
            if qid in shown:
                continue
            shown.add(qid)

            self._begin("tr")
            self._begin("td")
            self._form_anchor(qid + ": " + category.name, qid)
            extra = parses_per_category[qid] - 1
            if extra > 0:
                plural = "" if extra == 1 else "s"
                self._text(" (%d more parse%s)" % (extra, plural))
            self._end("td")
            self._cell(parse.score, numeric=True)
            self._cell(len(category.members), numeric=True)
            fact_score = self.parse_fact_score(parse, weights)
            self._cell("%.4f" % fact_score, True)

            self._separator(header=False)
            self.write_fact_match_counts(util.fact_matches_for_parse(parse))
            self._end("tr")
        self._end("table")
Beispiel #6
0
 def parse_fact_score(self, parse, weights):
     """Returns the weighted sum of the fact-match counts of 'parse'.

     Args:
       parse: parse passed to util.fact_matches_for_parse().
       weights: mapping indexed by match type, giving the weight of each
         count.

     Returns:
       Sum of count * weights[match_type] over all match types, as a float
       (0.0 when there are no counts).
     """
     weighted = [
         tally * weights[kind]
         for kind, tally in util.fact_matches_for_parse(parse).iteritems()
     ]
     return sum(weighted, 0.0)