def reset(self):
    """Restart the Spark context and rebuild the cached dataset RDD.

    Stops the current SparkContext if one exists, acquires a fresh one,
    re-reads the dataset, caches it for reuse, and records its size.
    """
    if self._sc:
        self._sc.stop()
    self._sc = utils.get_spark_context()
    # Bind locally so the cache/count calls read naturally.
    rdd = self._get_dataset_rdd()
    self._data_set_rdd = rdd
    rdd.cache()
    self._data_set_size = rdd.count()
def reset(self):
    """Restart the Spark context, rebuild the cached dataset RDD, and
    recompute the support threshold.

    The threshold is derived from the dataset size via THRESHOLD_RATIO,
    so it must be refreshed whenever the dataset is re-read.
    """
    if self._sc:
        self._sc.stop()
    self._sc = utils.get_spark_context()
    rdd = self._get_dataset_rdd()
    self._data_set_rdd = rdd
    rdd.cache()
    # Counting materializes the cached RDD; reuse the size for the threshold.
    size = rdd.count()
    self._data_set_size = size
    self._threshold = THRESHOLD_RATIO * size
def reset(self):
    """Restart the Spark context, rebuild the cached dataset RDD, report its
    size, and recompute the support threshold.

    Stops the current SparkContext if one exists before acquiring a fresh
    one, since only one context may be active per JVM.
    """
    if self._sc:
        self._sc.stop()
    self._sc = utils.get_spark_context()
    self._data_set_rdd = self._get_dataset_rdd()
    self._data_set_rdd.cache()
    self._data_set_size = self._data_set_rdd.count()
    # Parenthesized single-argument print: identical output on Python 2,
    # forward-compatible with Python 3 (original used the py2-only statement).
    print('num transactions in data - %d' % self._data_set_size)
    self._threshold = THRESHOLD_RATIO * self._data_set_size
def __init__(self, times):
    """Initialize the experiment runner.

    Parameters:
        times: number of repetitions to run per configuration.

    Side effects: acquires a SparkContext, creates the results/test
    directories if missing, and lazily initializes the module-level RES
    results dictionary (one sub-dict per dataset size class).
    """
    self._times = times
    self._data_path = None
    self._num_machines = NUM_MACHINES
    self._sc = utils.get_spark_context()
    self._epsilon = 0.1
    results_dir = "results/%s" % DATA_SET_NAME
    # makedirs (not mkdir) so the parent "results" directory is created
    # too — mkdir raises OSError when "results" does not yet exist.
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    if not os.path.exists(TEST_DIR):
        os.makedirs(TEST_DIR)
    global RES
    if RES is None:
        # One result bucket per dataset size class.
        RES = {size: self._init_res_dict()
               for size in ('xsmall', 'small', 'medium', 'large', 'xlarge')}
def setUp(self):
    """Per-test setup: acquire Spark, load and cache the dataset, derive the
    support threshold, and ensure output directories exist.

    Side effects: creates the results/test directories if missing and
    lazily initializes the module-level RES results dictionary (one
    sub-dict per dataset size class).
    """
    self._data_path = DATA_PATH
    self._num_machines = NUM_MACHINES
    self._sc = utils.get_spark_context()
    self._data_set_rdd = self._get_dataset_rdd()
    self._data_set_rdd.cache()
    self._data_set_size = self._data_set_rdd.count()
    self._threshold = THRESHOLD_RATIO * self._data_set_size
    self._epsilon = 0.1
    results_dir = "results/%s" % DATA_SET_NAME
    # makedirs (not mkdir) so the parent "results" directory is created
    # too — mkdir raises OSError when "results" does not yet exist.
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    if not os.path.exists(TEST_DIR):
        os.makedirs(TEST_DIR)
    global RES
    if RES is None:
        # One result bucket per dataset size class.
        RES = {size: self._init_res_dict()
               for size in ('xsmall', 'small', 'medium', 'large', 'xlarge')}