def __init__(self, no_dup=False, dump_pth=None):
    '''
    Set up an empty loader.

    param:
    @no_dup: stored as-is on the instance
    @dump_pth: stored as-is on the instance
    '''
    lumos_conf = LumosConf()
    self.conf = lumos_conf
    self.no_dup = no_dup
    self.dump_pth = dump_pth
    # Dataset location and vendor count come from the 'dataset' section of
    # the configuration.
    self.ds_root_pth = lumos_conf.get('dataset', 'path')
    self.vendor_cnt = lumos_conf.get('dataset', 'vendor_cnt')
    # Two-level mapping whose leaves are lists; missing keys are created on
    # first access.
    self.__data = defaultdict(lambda: defaultdict(list))
def __stat(self, raw_series, idx):
    '''
    Summarise one raw metric series as [max, min, mean], each divided by the
    global maximum value configured for that metric.

    param:
    @raw_series: the raw samples of a single metric
    @idx: position of the metric inside the configured 'selected_idx' list
    '''
    conf = LumosConf()
    metric_idx = conf.get('dataset', 'selected_idx')[idx]
    global_max = conf.get_global_max_val(metric_idx)
    # Same order as the returned features: max, min, average.
    return [
        reducer(raw_series) / global_max
        for reducer in (np.max, np.min, np.mean)
    ]
def __init__(self, dump_pth=None, ordinal=True):
    '''
    Set up a loader with no data attached yet.

    param:
    @dump_pth: stored as-is on the instance
    @ordinal: whether the label is ordinal (True) or raw (False)
    '''
    dataset_conf = LumosConf()
    self.ds_root_pth = dataset_conf.get('dataset', 'path')
    self.vendor_cnt = dataset_conf.get('dataset', 'vendor_cnt')
    self.dump_pth = dump_pth
    # the label is ordinal or raw
    self.ordinal = ordinal
    # sampling interval
    self.sampling_interval = 5
    # nothing loaded until one of the load methods runs
    self.__data = None
class DataLoader(object):
    '''
    Load training or testing data

    The data is either restored from a dill dump (when dump_pth is given) or
    collected by walking the dataset root, whose layout is read as
    <root>/<vendor>/<inst_type>/<run_dir>/{report.json, sar.csv}.
    A run directory name is split on '_': the first two tokens form the
    workload name and the last two are the scale and the round.
    '''

    def __init__(self, no_dup=False, dump_pth=None):
        '''
        param:
        @no_dup: keep only one run per (inst_type, workload, scale)
        @dump_pth: path of a dill dump to load instead of walking the dataset
        '''
        self.conf = LumosConf()
        self.no_dup = no_dup
        self.ds_root_pth = self.conf.get('dataset', 'path')
        self.vendor_cnt = self.conf.get('dataset', 'vendor_cnt')
        # workload -> vendor -> list of RecordEntry
        self.__data = defaultdict(lambda: defaultdict(list))
        self.dump_pth = dump_pth

    def load_data(self):
        '''
        Populate the loader, from the dump file if dump_pth is set, otherwise
        from the dataset directory tree.
        '''
        if self.dump_pth:
            self.__load_data_from_file()
            return

        def is_vendor(v):
            # entries with a dot in their name are not vendor directories
            return '.' not in v

        no_dup_set = set()
        for vendor in os.listdir(self.ds_root_pth):
            if not is_vendor(vendor):
                continue
            pth1 = os.path.join(self.ds_root_pth, vendor)
            for inst_type in os.listdir(pth1):
                pth2 = os.path.join(pth1, inst_type)
                for w in os.listdir(pth2):
                    parts = w.strip().split('_')
                    # the round is parsed to enforce the naming scheme but is
                    # not used afterwards
                    [scale, _rnd] = parts[-2:]
                    workload = '_'.join(parts[:2])
                    w_key = '_'.join((inst_type, workload, scale))
                    if self.no_dup and w_key in no_dup_set:
                        continue
                    pth3 = os.path.join(pth2, w)
                    pth_report = os.path.join(pth3, 'report.json')
                    pth_metrics = os.path.join(pth3, 'sar.csv')
                    [ts, jct] = mget_json_values(pth_report, 'timestamp',
                                                 'elapsed_time')
                    ts = encode_timestamp(ts)
                    jct = float(jct)
                    header, metrics = read_csv(pth_metrics)
                    if not header or not metrics:
                        continue
                    # Register the key only once the run is known to be
                    # usable; otherwise a run with empty metrics would hide
                    # every later valid run of the same configuration.
                    if self.no_dup:
                        no_dup_set.add(w_key)
                    norm_metrics = normalize_metrics(metrics)
                    self.__data[workload][vendor].append(
                        RecordEntry(inst_type, scale, norm_metrics, jct, ts))

    def __load_data_from_file(self):
        '''
        Replace the in-memory data with the content of the dill dump.
        '''
        # NOTE(review): dill.load executes arbitrary code embedded in the
        # file, so dump_pth must only ever point at trusted dumps.
        with open(self.dump_pth, 'rb') as fd:
            self.__data = dill.load(fd)

    def get_data(self):
        '''
        Return the loaded data structure.
        '''
        return self.__data
def get_train_test_data_external(test_wl, train_scale, truncate=False, ordinal=False):
    '''
    Load a pre-built (train_data, test_data) pair from a dill dump.

    The dump is looked up under the configured 'train_test_dump_prefix' as
    '<test_wl>_o<ordinal>_t<truncate>.pkl', the two flags being rendered as
    integers.

    param:
    @test_wl: workload name used as the file name prefix
    @train_scale: must be 'small'
    @truncate: flag encoded in the file name
    @ordinal: flag encoded in the file name
    '''
    assert train_scale == 'small', 'currently the model evaluated using small as the trianing scale'
    dump_dir = LumosConf().get('dataset', 'train_test_dump_prefix')
    suffix = 'o%d_t%d' % (ordinal, truncate)
    dump_file = os.path.join(dump_dir, '%s_%s.pkl' % (test_wl, suffix))
    with open(dump_file, 'rb') as stream:
        train_data, test_data = dill.load(stream)
    return train_data, test_data
def as_vector_old(self):
    '''
    turn this record to a vector that can be fed into a prediction model

    The feature vector is [inst_id, scale_id, ts[0], ts[1]] followed by the
    encoded metrics; the label is the record's jct.
    '''
    assert self.tag == 'enc', 'metrics un-encoded, unable to vectorize'
    conf = LumosConf()
    prefix = np.array([
        conf.get_inst_id(self.inst_type),
        conf.get_scale_id(self.scale),
        self.ts[0],
        self.ts[1],
    ])
    features = np.concatenate((prefix, self.metrics), axis=0)
    return features, self.jct
def as_vector(self):
    '''
    turn this record to a vector that can be fed into a prediction model

    The feature vector is [inst_id, family, cpu, memory, scale_id, ts[0],
    ts[1]] followed by the encoded metrics; the label is the record's jct.
    '''
    assert self.tag == 'enc', 'metrics un-encoded, unable to vectorize'
    conf = LumosConf()
    inst_id = conf.get_inst_id(self.inst_type)
    detail = conf.get_inst_detailed_conf(self.inst_type)
    family, cpu, memory = (detail[key] for key in ('family', 'cpu', 'memory'))
    scale_id = conf.get_scale_id(self.scale)
    prefix = np.array(
        [inst_id, family, cpu, memory, scale_id, self.ts[0], self.ts[1]])
    features = np.concatenate((prefix, self.metrics), axis=0)
    return features, self.jct
def encode(self, norm_data, raw_data, sampling_interval=5):
    '''
    Encode a pair of metric matrices into one flat feature list.

    For every selected metric column the FFT features of the normalised
    series are emitted first, followed by the statistics of the raw series.

    param:
    @norm_data: normalised metrics, one column per metric
    @raw_data: raw metrics, one column per metric
    @sampling_interval: forwarded to the FFT feature extraction
    '''
    selected = LumosConf().get('dataset', 'selected_idx')
    n_selected = len(selected)
    # Column-select only when the matrix has not been reduced already.
    if norm_data.shape[1] != n_selected:
        norm_data = norm_data[:, selected]
    if raw_data.shape[1] != n_selected:
        raw_data = raw_data[:, selected]
    features = []
    for col in range(norm_data.shape[1]):
        norm_col = norm_data[:, col]
        raw_col = raw_data[:, col]
        spectral = self.__fft(norm_col, sampling_interval=sampling_interval)
        summary = self.__stat(raw_col, col)
        features.extend(spectral)
        features.extend(summary)
    return features
optimal = rank_data['1'][wl][scale][0].jct abs_err = optimal_bar - optimal rel_err = abs_err / optimal err[scale]['abs_err'] = abs_err err[scale]['rel_err'] = rel_err avg_abs_err = np.mean(err['large']['abs_err']) avg_rel_err = np.mean(err['large']['rel_err']) return avg_abs_err, avg_rel_err if __name__ == "__main__": parser = argparse.ArgumentParser(description='grid search') parser.add_argument('-j', '--n_jobs', help='number of jobs running parallel', type=int, default=None) args = parser.parse_args() conf = LumosConf() dump_pth = conf.get('dataset', 'dump_pth_ordinal_with_truc_v1') # dataloader = DataLoaderOrdinal() dataloader = DataLoaderOrdinal(dump_pth=dump_pth) dataloader.load_data_by_interval(interval=1) rank_data = dataloader.get_data_rankize() # dataset options op_truncate = [True, False] op_ordinal = [True, False] # model options op_max_depth = [3, 4, 5] op_n_estimators = [10, 40, 70, 100] op_criterion = ['mse', 'mae'] op_max_features = ['auto', 'sqrt', 'log2', 0.5]
redundant_feat_idxes = [] for idx in range(len(avg_cof)): if idx not in valid_idxes: continue if idx in redundant_feat_idxes: continue selected_feat_idxes.append(idx) for idx_2 in range(len(avg_cof[idx])): if idx_2 in selected_feat_idxes: continue if idx_2 in redundant_feat_idxes: continue if avg_cof[idx][idx_2] > cof_threshold and idx_2 != idx: redundant_feat_idxes.append(idx_2) return selected_feat_idxes if __name__ == "__main__": from conf import LumosConf conf = LumosConf() dump_pth = conf.get('dataset', 'dump_pth_ordinal_with_truc_v1') dataloader = DataLoaderOrdinal(dump_pth=dump_pth) dataloader.load_data_by_interval(interval=1) data = dataloader.get_data() to_select_scale = 'large' metrics_data = [] for wl, wl_data in data['1'].items(): scale_data = wl_data[to_select_scale] # metrics_data.append(random.sample(scale_data, 1)[0].metrics) metrics_data.append(scale_data[1].metrics) feature_idxes = select_features(metrics_data) # ana_metrics(metrics_data) print('%d features selected: %s' % (len(feature_idxes), feature_idxes))
import os
import sys
import json
import dill
import random
import pickle
import numpy as np
from utils import *
from conf import LumosConf
from collections import defaultdict
from data_loader_ordinal import DataLoaderOrdinal
from third_party.keras_lstm_vae.lstm_vae import create_lstm_vae

if __name__ == "__main__":
    # Script entry: build an ordinal data loader on the dump configured under
    # dataset/dump_pth_ordinal and load it.
    ordinal_dump = LumosConf().get('dataset', 'dump_pth_ordinal')
    loader = DataLoaderOrdinal(dump_pth=ordinal_dump)
    loader.load_data()
header, metrics = read_csv(pth_metrics) if not header or not metrics: continue norm_metrics = normalize_metrics(metrics) self.__data[workload][vendor].append( RecordEntry(inst_type, scale, norm_metrics, jct, ts)) def __load_data_from_file(self): with open(self.dump_pth, 'rb') as fd: self.__data = dill.load(fd) def get_data(self): return self.__data if __name__ == "__main__": conf = LumosConf() dump_pth = None if conf.get('dataset', 'no_dup'): dump_pth = conf.get('dataset', 'dump_pth_no_dup') else: dump_pth = conf.get('dataset', 'dump_pth') #data_loader = DataLoader(dump_pth=dump_pth) data_loader = DataLoader(no_dup=True) data_loader.load_data() data = data_loader.get_data() print(len(data)) print(data.keys()) print(len(data['hadoop_aggregation']['alibaba'])) print(len(data['hadoop_aggregation']['huawei'])) print(len(data['hadoop_aggregation']['tencent'])) # print(len(data['hadoop_aggregation']['ucloud']))
def get_train_test_data(self, train_scale='tiny', test_wl='', flag='single'):
    '''
    build the training and testing samples from the rankized data

    Every record at train_scale (the profiling run) is paired with every
    record of the same workload at each predicted scale (the target run).
    A sample X is the profiling instance conf, the target instance conf, the
    target scale id and the encoded metrics of the profiling run, in that
    order; Y is the target's rank when self.ordinal is set, else its jct.

    param:
    @train_scale: the scale whose records are used for profiling; with
                  'small' the 'tiny' scale is not predicted
    @test_wl: the workload that is to be used for testing, or one of
              'HiBench' / 'BigBench' when flag is 'multi'
    @flag: 'single' holds out exactly test_wl; 'multi' holds out workloads
           by name ('hive' in the name for BigBench, otherwise HiBench)
    return:
    train_data[rnd][inst_type] and test_data[wl][rnd][inst_type][scale],
    each leaf being a dict with the lists 'X' and 'Y'
    '''
    rankize_data = self.get_data_rankize()
    assert test_wl in self.__data['1'] or test_wl in (
        'HiBench', 'BigBench'), 'invalid test workload'
    assert flag in ('single',
                    'multi'), 'indicating single/multi testing workloads'

    def is_test_wl(wl):
        if flag == 'single':
            return wl == test_wl
        if test_wl == 'BigBench':
            return 'hive' in wl
        if test_wl == 'HiBench':
            return 'hive' not in wl
        # 'multi' with a concrete workload name holds nothing out; this was
        # an implicit None before and is kept as-is for compatibility.
        return False

    conf = LumosConf()
    truncate = conf.get('dataset', 'truncate')
    fft_stat_encoder = FFTStatEncoder(truncate=truncate)

    train_data = defaultdict(lambda: defaultdict(lambda: {
        'X': [],
        'Y': []
    }))
    test_data = defaultdict(lambda: defaultdict(lambda: defaultdict(
        lambda: defaultdict(lambda: {
            'X': [],
            'Y': []
        }))))

    predict_scales = ['tiny', 'small', 'large', 'huge']
    if train_scale == 'small':
        predict_scales.remove('tiny')

    def iter_samples(wl_data):
        # Yield (profiling inst type, target scale, X, Y) for one workload.
        for profiled in wl_data[train_scale]:
            t_inst_type = profiled.inst_type
            profiled_conf = conf.get_inst_detailed_conf(t_inst_type,
                                                        format='list')
            metrics_vec = fft_stat_encoder.encode(
                profiled.metrics,
                profiled.raw_metrics,
                sampling_interval=self.sampling_interval)
            for scale in predict_scales:
                target_scale = conf.get_scale_id(scale)
                for target in wl_data[scale]:
                    target_conf = conf.get_inst_detailed_conf(
                        target.inst_type, format='list')
                    x = profiled_conf.copy()
                    x.extend(target_conf)
                    x.append(target_scale)
                    x.extend(metrics_vec)
                    y = target.rank if self.ordinal else target.jct
                    yield t_inst_type, scale, x, y

    # Training pass first, then the testing pass, as two separate sweeps so
    # the encoder is invoked in the same order as before.
    for rnd, rnd_data in rankize_data.items():
        for wl, wl_data in rnd_data.items():
            if is_test_wl(wl):
                continue
            for t_inst_type, _scale, x, y in iter_samples(wl_data):
                bucket = train_data[rnd][t_inst_type]
                bucket['X'].append(x)
                bucket['Y'].append(y)

    for rnd, rnd_data in rankize_data.items():
        for wl, wl_data in rnd_data.items():
            if not is_test_wl(wl):
                continue
            for t_inst_type, scale, x, y in iter_samples(wl_data):
                bucket = test_data[wl][rnd][t_inst_type][scale]
                bucket['X'].append(x)
                bucket['Y'].append(y)

    return train_data, test_data