Example #1
0
 def get_new_schema(original_data, feature_mask):
     """Build a schema whose header keeps only the mask-selected features.

     Args:
         original_data: object whose ``schema`` mapping provides the
             "header", "sid_name" and "label_name" entries.
         feature_mask: mask compared element-wise with ``> 0``; positions
             where the comparison holds are kept (assumed to be a 1-D
             NumPy array aligned with the header -- confirm with callers).

     Returns:
         Whatever ``make_schema`` returns for the filtered header together
         with the original sid and label names.
     """
     source_schema = original_data.schema
     full_header = source_schema.get("header")
     sid_name = source_schema.get("sid_name")
     label_name = source_schema.get("label_name")

     # Indices along the first axis where the mask is strictly positive.
     kept_positions = np.nonzero(feature_mask > 0)[0]

     selected_header = []
     for position in kept_positions:
         selected_header.append(full_header[position])

     return make_schema(selected_header, sid_name, label_name)
Example #2
0
 def prepare_data(self, data_num, feature_num, header, sid_name, label_name):
     """Create a keyed table of synthetic instances and attach a schema.

     Instance ``k`` (for ``k`` in ``range(data_num)``) gets id ``k``, a
     feature vector of length ``feature_num`` filled with ``k``, and
     label 0. The ``(k, instance)`` pairs are handed to
     ``self.session.parallelize`` with ``include_key=True`` and
     ``partition=3``.

     Args:
         data_num: number of instances to generate.
         feature_num: length of each feature vector.
         header: header passed through to ``data_io.make_schema``.
         sid_name: sid name passed through to ``data_io.make_schema``.
         label_name: label name passed through to ``data_io.make_schema``.

     Returns:
         The table returned by ``parallelize``, with its ``schema``
         attribute set.
     """
     keyed_instances = [
         (idx, Instance(inst_id=idx,
                        features=idx * np.ones(feature_num),
                        label=0))
         for idx in range(data_num)
     ]
     dataset = self.session.parallelize(keyed_instances,
                                        include_key=True,
                                        partition=3)
     dataset.schema = data_io.make_schema(header, sid_name, label_name)
     return dataset
Example #3
0
    def fit(self, data):
        """Union all non-empty input tables into one table and report counts.

        Every table is counted and recorded as a ``Metric``; tables with
        zero entries are skipped. The first non-empty table becomes the
        running result, and each later one is schema-checked (id, label
        name, header) against the running schema before being merged with
        ``union(..., self._keep_first)``.

        Side effects: sets ``self.is_empty_feature`` and
        ``self.feature_count`` along the way, and publishes the collected
        metrics through ``self.callback_metric`` and
        ``self.tracker.set_metric_meta``.

        Args:
            data: mapping of table name -> table. Each table must provide
                ``count()``, ``schema``, ``union()`` and ``mapValues()``.

        Returns:
            The combined table, or ``None`` when ``data`` is empty/``None``
            or when every input table has zero entries.
        """
        if not data:
            LOGGER.warning("Union receives no data input.")
            return None

        empty_count = 0
        combined_table = None
        combined_schema = None
        metrics = []

        for key, local_table in data.items():
            LOGGER.debug("table to combine name: {}".format(key))
            num_data = local_table.count()
            LOGGER.debug("table count: {}".format(num_data))
            local_schema = local_table.schema
            metrics.append(Metric(key, num_data))

            if num_data == 0:
                LOGGER.warning("Table {} has no entries.".format(key))
                empty_count += 1
                continue

            if combined_table is None:
                # Only the first non-empty table is inspected; presumably
                # this call is what sets self.is_data_instance -- confirm.
                self.check_is_data_instance(local_table)
            if self.is_data_instance:
                self.is_empty_feature = data_overview.is_empty_feature(
                    local_table)
                if self.is_empty_feature:
                    # The table does have rows here; it is the feature
                    # content that is reported empty.
                    LOGGER.warning(
                        "Table {} has empty feature.".format(key))

            if combined_table is None:
                # first table to combine
                combined_table = local_table
            else:
                self.check_schema_id(local_schema, combined_schema)
                self.check_schema_label_name(local_schema, combined_schema)
                self.check_schema_header(local_schema, combined_schema)
                combined_table = combined_table.union(local_table,
                                                      self._keep_first)

            # NOTE(review): the id name is read under the key "sid"; other
            # schema consumers may use "sid_name" -- confirm which key
            # make_schema actually writes.
            combined_schema = make_schema(local_schema.get("header"),
                                          local_schema.get("sid"),
                                          local_schema.get("label_name"))
            combined_table.schema = combined_schema

            # only check feature length if not empty
            if self.is_data_instance and not self.is_empty_feature:
                self.feature_count = len(combined_schema.get("header"))
                LOGGER.debug("feature count: {}".format(self.feature_count))
                # NOTE(review): the mapped result is discarded; this only
                # validates anything if mapValues evaluates eagerly --
                # confirm against the table backend.
                combined_table.mapValues(self.check_feature_length)

        if combined_table is None:
            num_data = 0
            LOGGER.warning(
                "All tables provided are empty or have empty features.")
        else:
            num_data = combined_table.count()
        metrics.append(Metric("Total", num_data))

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=metrics)
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type))

        # combined_table stays None when every input was empty; touching
        # .schema on it would raise AttributeError.
        if combined_table is not None:
            LOGGER.debug(
                "after union schema: {}".format(combined_table.schema))

        LOGGER.info(
            "Union operation finished. Total {} empty tables encountered.".
            format(empty_count))
        return combined_table