Ejemplo n.º 1
0
    def compute(self, values, accumulator=None):
        """Fold `values` into `accumulator`, returning the accumulator.

        Increments the per-token counts, and — when IDF is being computed —
        assigns each document a fresh id and bumps each token's document
        frequency at most once per document.
        """
        values = base_preprocessing_layer.convert_to_list(
            values, sparse_default_value=self._mask_value)

        if accumulator is None:
            accumulator = self._create_accumulator()

        # TODO(momernick): Benchmark improvements to this algorithm.
        documents = values if isinstance(values, list) else [values]
        for document in documents:
            tokens = document if isinstance(document, list) else [document]
            if self._compute_idf:
                # Claim the next document id for this document.
                current_doc_id = accumulator.data["next_doc_id"]
                accumulator.data["next_doc_id"] += 1
            for token in tokens:
                accumulator.count_dict[token] += 1
                if self._compute_idf:
                    per_doc = accumulator.per_doc_count_dict[token]
                    # Count each token once per document, not per occurrence.
                    if per_doc["last_doc_id"] != current_doc_id:
                        per_doc["count"] += 1
                        per_doc["last_doc_id"] = current_doc_id

        return accumulator
Ejemplo n.º 2
0
    def compute(self, values, accumulator=None):
        """Fold `values` into `accumulator`, returning the accumulator.

        Tracks the running maximum value when `max_tokens` is unset, and —
        when IDF is being computed — bumps each value's document frequency
        at most once per element (document).
        """
        values = base_preprocessing_layer.convert_to_list(values)

        if accumulator is None:
            accumulator = self._create_accumulator()

        # TODO(momernick): Benchmark improvements to this algorithm.
        for element in values:
            document = element if isinstance(element, list) else [element]
            doc_id = accumulator.data[self.DOC_ID_IDX]
            for value in document:
                if self.max_tokens is None:
                    # No fixed vocabulary size: track the largest value seen.
                    if value > accumulator.data[self.MAX_VALUE_IDX]:
                        accumulator.data[self.MAX_VALUE_IDX] = value
                if self._compute_idf:
                    per_doc = accumulator.per_doc_count_dict[value]
                    # Count each value once per document, not per occurrence.
                    if per_doc["last_doc_id"] != doc_id:
                        per_doc["count"] += 1
                        per_doc["last_doc_id"] = doc_id
            # Advance to the next document id for the following element.
            accumulator.data[self.DOC_ID_IDX] += 1

        return accumulator
Ejemplo n.º 3
0
  def compute(self, values, accumulator=None):
    """Fold `values` into `accumulator`, returning the accumulator.

    Accepts a scalar token, a flat list of tokens, or a list of documents
    (lists of tokens), and increments the count of every token seen.
    """
    values = base_preprocessing_layer.convert_to_list(
        values, sparse_default_value=self._mask_value)

    if accumulator is None:
      accumulator = self._create_accumulator()

    counts = accumulator.count_dict
    if isinstance(values, (str, bytes, np.int64)):
      # A single scalar token.
      counts[values] += 1
    else:
      for document in values:
        if isinstance(document, list):
          for token in document:
            counts[token] += 1
        else:
          # Flat list: each entry is itself a token.
          counts[document] += 1

    return accumulator
Ejemplo n.º 4
0
 def test_conversion(self, inputs, expected):
     """Check that convert_to_list(inputs()) produces `expected`."""
     actual = base_preprocessing_layer.convert_to_list(inputs())
     self.assertAllEqual(expected, actual)