def prepare_data(self):
    """Lazily compute the derived views of ``self.value_list``.

    The first call stores:

    * ``self.histogram_list`` -- ``Utils.get_distribution(self.value_list)``.
    * ``self.numeric_list`` -- ``Utils.clean_examples_numeric(self.value_list)``.
    * ``self.sample_list`` (only when ``self.is_numeric()`` is true) -- a
      random sample of ``numeric_list`` drawn without replacement through
      Spark, using the fraction ``100 / len(numeric_list)`` capped at 1.0.
      An empty ``numeric_list`` yields an empty sample.
    * ``self.value_text`` (otherwise) -- every value formatted as
      ``" %s "`` and concatenated into one string. An empty ``value_list``
      yields ``""``.

    Subsequent calls return immediately because ``self.is_prepared`` is
    set to ``True`` at the end of the first one.
    """
    if self.is_prepared:
        return

    self.histogram_list = Utils.get_distribution(self.value_list)
    self.numeric_list = Utils.clean_examples_numeric(self.value_list)

    # NOTE(review): ``sc`` is neither a parameter nor an attribute of self
    # in this method; presumably a module-level SparkContext -- confirm.
    if self.is_numeric():
        numeric_count = len(self.numeric_list)
        if numeric_count == 0:
            # Nothing to sample; also avoids dividing by zero below.
            self.sample_list = []
        else:
            # Sampling without replacement expects a fraction in [0, 1];
            # with fewer than 100 values the raw ratio would exceed 1.
            fraction = min(1.0, 100.0 / numeric_count)
            numeric_rdd = sc.parallelize(self.numeric_list)
            self.sample_list = numeric_rdd.sample(False, fraction).collect()
    elif len(self.value_list) == 0:
        # reduce() raises ValueError on an empty RDD, so short-circuit.
        self.value_text = ""
    else:
        padded_values = sc.parallelize(self.value_list).map(
            lambda x: " %s " % x)
        self.value_text = padded_values.reduce(lambda x, y: x + y)

    self.is_prepared = True
def __init__(self, column, train_examples_map, sc):
    """Capture a test column together with training data and a context.

    Args:
        column: Object providing ``value_list``, ``semantic_type``,
            ``name`` and an ``is_numeric()`` method; these are copied or
            evaluated once, here.
        train_examples_map: Stored unchanged as ``self.train_examples_map``.
        sc: Stored unchanged as ``self.sc`` (presumably a SparkContext --
            confirm against the caller).
    """
    # Plain pass-through of the constructor arguments.
    self.sc = sc
    self.train_examples_map = train_examples_map

    # Values read straight off the column.
    examples = column.value_list
    self.test_examples = examples
    self.true_label = column.semantic_type
    self.name = column.name

    # Derived views of the column's values. ``is_numeric`` is stored as
    # the result of the call, not as a bound method.
    self.numeric_test_examples = Utils.clean_examples_numeric(examples)
    self.is_numeric = column.is_numeric()
    self.hist_examples = Utils.get_distribution(examples)