Code example #1
 def __init__(self, dump_pth=None, ordinal=True):
     conf = LumosConf()
     self.ds_root_pth = conf.get('dataset', 'path')
     self.vendor_cnt = conf.get('dataset', 'vendor_cnt')
     self.__data = None
     self.dump_pth = dump_pth
     # sampling interval
     self.sampling_interval = 5
     # the label is ordinal or raw
     self.ordinal = ordinal
Code example #2
class DataLoader(object):
    '''
    Load training or testing data
    '''
    def __init__(self, no_dup=False, dump_pth=None):
        self.conf = LumosConf()
        self.no_dup = no_dup
        self.ds_root_pth = self.conf.get('dataset', 'path')
        self.vendor_cnt = self.conf.get('dataset', 'vendor_cnt')
        self.__data = defaultdict(lambda: defaultdict(lambda: []))
        self.dump_pth = dump_pth

    def load_data(self):
        if self.dump_pth:
            self.__load_data_from_file()
            return

        def is_vendor(v):
            return '.' not in v

        no_dup_set = set()

        # dataset layout: <root>/<vendor>/<inst_type>/<run_dir>/
        for vendor in os.listdir(self.ds_root_pth):
            if not is_vendor(vendor): continue
            pth1 = os.path.join(self.ds_root_pth, vendor)
            for inst_type in os.listdir(pth1):
                pth2 = os.path.join(pth1, inst_type)
                for w in os.listdir(pth2):
                    # run dir name: the last two tokens are <scale> and <round>,
                    # the first two form the workload name
                    [scale, rnd] = w.strip().split('_')[-2:]
                    workload = '_'.join(w.strip().split('_')[:2])
                    if self.no_dup:
                        w_key = '_'.join((inst_type, workload, scale))
                        if w_key in no_dup_set: continue
                        no_dup_set.add(w_key)
                    pth3 = os.path.join(pth2, w)
                    pth_report = os.path.join(pth3, 'report.json')
                    pth_metrics = os.path.join(pth3, 'sar.csv')
                    [ts, jct] = mget_json_values(pth_report, 'timestamp',
                                                 'elapsed_time')
                    ts = encode_timestamp(ts)
                    jct = float(jct)
                    header, metrics = read_csv(pth_metrics)
                    if not header or not metrics: continue
                    norm_metrics = normalize_metrics(metrics)
                    self.__data[workload][vendor].append(
                        RecordEntry(inst_type, scale, norm_metrics, jct, ts))

    def __load_data_from_file(self):
        with open(self.dump_pth, 'rb') as fd:
            self.__data = dill.load(fd)

    def get_data(self):
        return self.__data
Code example #3
File: fft_stat.py Project: iteratorlee/lumos_expr
 def __stat(self, raw_series, idx):
     conf = LumosConf()
     selected_idxes = conf.get('dataset', 'selected_idx')
     valid_max_val = conf.get_global_max_val(selected_idxes[idx])
     max_val = np.max(raw_series) / valid_max_val
     min_val = np.min(raw_series) / valid_max_val
     avg_val = np.mean(raw_series) / valid_max_val
     # var_val = np.var(raw_series) / valid_max_val
     return [max_val, min_val, avg_val]
Code example #4
 def get_train_test_data_external(test_wl,
                                  train_scale,
                                  truncate=False,
                                  ordinal=False):
     assert train_scale == 'small', 'currently the model is evaluated with small as the training scale'
     conf = LumosConf()
     dmp_pre = conf.get('dataset', 'train_test_dump_prefix')
     dmp_suf = 'o%d_t%d' % (ordinal, truncate)
     wl_pth = os.path.join(dmp_pre, '%s_%s.pkl' % (test_wl, dmp_suf))
     with open(wl_pth, 'rb') as fd:
         (train_data, test_data) = dill.load(fd)
         return train_data, test_data
Code example #5
File: fft_stat.py Project: iteratorlee/lumos_expr
 def encode(self, norm_data, raw_data, sampling_interval=5):
     ret = []
     conf = LumosConf()
     valid_idx = conf.get('dataset', 'selected_idx')
     if norm_data.shape[1] != len(valid_idx):
         norm_data = norm_data[:, valid_idx]
     if raw_data.shape[1] != len(valid_idx):
         raw_data = raw_data[:, valid_idx]
     for i in range(norm_data.shape[1]):
         tmp = []
         norm_series = norm_data[:, i]
         raw_series = raw_data[:, i]
         fft_feat = self.__fft(norm_series,
                               sampling_interval=sampling_interval)
         stat_feat = self.__stat(raw_series, i)
         tmp.extend(fft_feat)
         tmp.extend(stat_feat)
         ret.extend(tmp)
     return ret
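
To make the layout of the returned vector concrete, here is a minimal standalone sketch (NumPy only, not part of the project): each selected metric column contributes its FFT-based features followed by the [max, min, avg] statistics from __stat. The rfft magnitudes and the n_fft cutoff below are stand-ins for the project's __fft, which is not shown in this snippet.

import numpy as np

def encode_column(norm_series, raw_series, global_max, n_fft=8):
    # Stand-in for __fft: low-frequency magnitude spectrum of the normalized series.
    fft_feat = np.abs(np.fft.rfft(norm_series))[:n_fft]
    # Mirrors __stat: max/min/mean of the raw series, scaled by a global maximum.
    stat_feat = [np.max(raw_series) / global_max,
                 np.min(raw_series) / global_max,
                 np.mean(raw_series) / global_max]
    return list(fft_feat) + stat_feat

# One column's slice of the final vector; encode() concatenates these over all columns.
col_feat = encode_column(np.random.rand(64), np.random.rand(64) * 100, global_max=100.0)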
Code example #6
            abs_err = optimal_bar - optimal
            rel_err = abs_err / optimal
            err[scale]['abs_err'] = abs_err
            err[scale]['rel_err'] = rel_err
    avg_abs_err = np.mean(err['large']['abs_err'])
    avg_rel_err = np.mean(err['large']['rel_err'])
    return avg_abs_err, avg_rel_err


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='grid search')
    parser.add_argument('-j', '--n_jobs', help='number of jobs running in parallel', type=int, default=None)
    args = parser.parse_args()

    conf = LumosConf()
    dump_pth = conf.get('dataset', 'dump_pth_ordinal_with_truc_v1')
    # dataloader = DataLoaderOrdinal()
    dataloader = DataLoaderOrdinal(dump_pth=dump_pth)
    dataloader.load_data_by_interval(interval=1)
    rank_data = dataloader.get_data_rankize()

    # dataset options
    op_truncate = [True, False]
    op_ordinal = [True, False]
    # model options
    op_max_depth = [3, 4, 5]
    op_n_estimators = [10, 40, 70, 100]
    op_criterion = ['mse', 'mae']
    op_max_features = ['auto', 'sqrt', 'log2', 0.5]
    
    conf = LumosConf()
Code example #7
    redundant_feat_idxes = []
    for idx in range(len(avg_cof)):
        if idx not in valid_idxes: continue
        if idx in redundant_feat_idxes: continue
        selected_feat_idxes.append(idx)
        for idx_2 in range(len(avg_cof[idx])):
            if idx_2 in selected_feat_idxes: continue
            if idx_2 in redundant_feat_idxes: continue
            if avg_cof[idx][idx_2] > cof_threshold and idx_2 != idx:
                redundant_feat_idxes.append(idx_2)

    return selected_feat_idxes


if __name__ == "__main__":
    from conf import LumosConf
    conf = LumosConf()
    dump_pth = conf.get('dataset', 'dump_pth_ordinal_with_truc_v1')
    dataloader = DataLoaderOrdinal(dump_pth=dump_pth)
    dataloader.load_data_by_interval(interval=1)
    data = dataloader.get_data()
    to_select_scale = 'large'
    metrics_data = []
    for wl, wl_data in data['1'].items():
        scale_data = wl_data[to_select_scale]
        # metrics_data.append(random.sample(scale_data, 1)[0].metrics)
        metrics_data.append(scale_data[1].metrics)
    feature_idxes = select_features(metrics_data)
    # ana_metrics(metrics_data)
    print('%d features selected: %s' % (len(feature_idxes), feature_idxes))
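
The snippet above starts mid-function; below is a self-contained sketch of the same greedy correlation-threshold selection. How avg_cof and valid_idxes are built is not shown in the snippet, so the np.corrcoef-based averaging and the default cof_threshold here are assumptions, not the project's actual choices.

import numpy as np

def select_features_sketch(metrics_list, cof_threshold=0.9, valid_idxes=None):
    # Assumed construction: average the absolute column-wise correlation
    # matrices over all sampled workloads (rows = time steps, cols = metrics).
    cofs = [np.abs(np.corrcoef(m, rowvar=False)) for m in metrics_list]
    avg_cof = np.nanmean(cofs, axis=0)
    n_feat = avg_cof.shape[0]
    valid = set(range(n_feat)) if valid_idxes is None else set(valid_idxes)

    selected, redundant = [], set()
    for idx in range(n_feat):
        if idx not in valid or idx in redundant:
            continue
        selected.append(idx)
        # Mark every remaining feature that correlates above the threshold
        # with the newly kept one as redundant.
        for idx_2 in range(n_feat):
            if idx_2 in selected or idx_2 in redundant or idx_2 == idx:
                continue
            if avg_cof[idx][idx_2] > cof_threshold:
                redundant.add(idx_2)
    return selected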
Code example #8
import os
import sys
import json
import dill
import random
import pickle
import numpy as np

from utils import *
from conf import LumosConf
from collections import defaultdict
from data_loader_ordinal import DataLoaderOrdinal
from third_party.keras_lstm_vae.lstm_vae import create_lstm_vae

if __name__ == "__main__":
    conf = LumosConf()
    dump_pth = conf.get('dataset', 'dump_pth_ordinal')
    dataloader = DataLoaderOrdinal(dump_pth=dump_pth)
    dataloader.load_data()
Code example #9
                    norm_metrics = normalize_metrics(metrics)
                    self.__data[workload][vendor].append(
                        RecordEntry(inst_type, scale, norm_metrics, jct, ts))

    def __load_data_from_file(self):
        with open(self.dump_pth, 'rb') as fd:
            self.__data = dill.load(fd)

    def get_data(self):
        return self.__data


if __name__ == "__main__":
    conf = LumosConf()
    dump_pth = None
    if conf.get('dataset', 'no_dup'):
        dump_pth = conf.get('dataset', 'dump_pth_no_dup')
    else:
        dump_pth = conf.get('dataset', 'dump_pth')
    #data_loader = DataLoader(dump_pth=dump_pth)
    data_loader = DataLoader(no_dup=True)
    data_loader.load_data()
    data = data_loader.get_data()
    print(len(data))
    print(data.keys())
    print(len(data['hadoop_aggregation']['alibaba']))
    print(len(data['hadoop_aggregation']['huawei']))
    print(len(data['hadoop_aggregation']['tencent']))
    # print(len(data['hadoop_aggregation']['ucloud']))
    with open(dump_pth, 'wb') as fd:
        dill.dump(data, fd)
Code example #10
    def get_train_test_data(self,
                            train_scale='tiny',
                            test_wl='',
                            flag='single'):
        '''
        Get the training and testing data derived from the profiling runs
        on a concrete training scale.
        param:
        @train_scale: the input scale on which the profiling runs are performed
        @test_wl: the workload (or workload suite) held out for testing
        @flag: 'single' for one test workload, 'multi' for a whole suite
        '''
        rankize_data = self.get_data_rankize()
        assert test_wl in self.__data['1'] or test_wl in (
            'HiBench', 'BigBench'), 'invalid test workload'
        assert flag in ('single',
                        'multi'), 'indicating single/multi testing workloads'

        def is_test_wl(wl):
            if flag == 'single':
                return wl == test_wl
            else:
                if test_wl == 'BigBench':
                    return 'hive' in wl
                elif test_wl == 'HiBench':
                    return 'hive' not in wl

        conf = LumosConf()
        truncate = conf.get('dataset', 'truncate')
        fft_stat_encoder = FFTStatEncoder(truncate=truncate)

        train_data = defaultdict(lambda: defaultdict(lambda: {
            'X': [],
            'Y': []
        }))
        test_data = defaultdict(lambda: defaultdict(lambda: \
            defaultdict(lambda: defaultdict(lambda: {
            'X': [],
            'Y': []
        }))))

        predict_scales = ['tiny', 'small', 'large', 'huge']
        if train_scale == 'small':
            predict_scales.remove('tiny')

        for rnd, rnd_data in rankize_data.items():
            for wl, wl_data in rnd_data.items():
                if is_test_wl(wl): continue
                for record1 in wl_data[train_scale]:
                    t_inst_type = record1.inst_type
                    test_conf = conf.get_inst_detailed_conf(t_inst_type,
                                                            format='list')
                    test_metrics_vec = fft_stat_encoder.encode(
                        record1.metrics,
                        record1.raw_metrics,
                        sampling_interval=self.sampling_interval)
                    for scale in predict_scales:
                        target_scale = conf.get_scale_id(scale)
                        for record2 in wl_data[scale]:
                            target_conf = conf.get_inst_detailed_conf(
                                record2.inst_type, format='list')
                            target_rank = record2.rank
                            target_jct = record2.jct
                            X = test_conf.copy()
                            X.extend(target_conf)
                            X.append(target_scale)
                            X.extend(test_metrics_vec)
                            train_data[rnd][t_inst_type]['X'].append(X)
                            if self.ordinal:
                                train_data[rnd][t_inst_type]['Y'].append(
                                    target_rank)
                            else:
                                train_data[rnd][t_inst_type]['Y'].append(
                                    target_jct)

        for rnd, rnd_data in rankize_data.items():
            for wl, wl_data in rnd_data.items():
                if not is_test_wl(wl): continue
                # wl_data = rnd_data[test_wl]
                for record1 in wl_data[train_scale]:
                    t_inst_type = record1.inst_type
                    test_conf = conf.get_inst_detailed_conf(t_inst_type,
                                                            format='list')
                    test_metrics_vec = fft_stat_encoder.encode(
                        record1.metrics,
                        record1.raw_metrics,
                        sampling_interval=self.sampling_interval)
                    for scale in predict_scales:
                        target_scale = conf.get_scale_id(scale)
                        for record2 in wl_data[scale]:
                            target_conf = conf.get_inst_detailed_conf(
                                record2.inst_type, format='list')
                            target_rank = record2.rank
                            target_jct = record2.jct
                            X = test_conf.copy()
                            X.extend(target_conf)
                            X.append(target_scale)
                            X.extend(test_metrics_vec)
                            test_data[wl][rnd][t_inst_type][scale]['X'].append(
                                X)
                            if self.ordinal:
                                test_data[wl][rnd][t_inst_type][scale][
                                    'Y'].append(target_rank)
                            else:
                                test_data[wl][rnd][t_inst_type][scale][
                                    'Y'].append(target_jct)

        return train_data, test_data
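
A hypothetical access sketch for the two returned structures (the dataloader instance and the instance-type key below are placeholders; the round key '1' and the scale names come from the code above):

train_data, test_data = dataloader.get_train_test_data(
    train_scale='small', test_wl='HiBench', flag='multi')

rnd, inst_type, scale = '1', 'some_inst_type', 'large'  # placeholder keys
X_train = train_data[rnd][inst_type]['X']  # profiled conf + target conf + scale id + metrics encoding
Y_train = train_data[rnd][inst_type]['Y']  # target rank (ordinal) or target JCT (raw)
for wl in test_data:                       # one entry per held-out workload
    X_test = test_data[wl][rnd][inst_type][scale]['X']
    Y_test = test_data[wl][rnd][inst_type][scale]['Y']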