def get_expired_certificates(acm_client: object, logger: logging) -> defaultdict:
    """
    Retrieves all expired certificates from the ACM client based on specified span of days.

    :param acm_client: ACM client
    :param logger: Logging object
    :return: A defaultdict mapping certificate ARN -> expiry datetime for
             certificates whose expiry falls within the configured window
    """
    # defaultdict of all certificate ARNs and expiry dates
    certs = get_all_certificates(acm_client)
    expired_certs = defaultdict(default_value)
    for arn in certs:
        # Expiry datetime is certs[arn]; skip placeholder entries.
        # The old `bool(certs) and ...` guard was redundant — the loop body
        # only runs when certs is non-empty.
        if certs[arn] != default_value:
            expiry_date = certs[arn]
            # Logs true if a certificate will expire in specified number of days
            logger.info(
                f'Certificate will expire in {DAYS_EXP} days:{time_interval >= expiry_date}'
            )
            # Compare expiry datetime with the cutoff of the inspection window.
            if time_interval >= expiry_date:
                expired_certs[arn] = expiry_date
            else:
                logger.info(
                    f'Certificate {arn} is still valid. The expiry date is {expiry_date}'
                )
    return expired_certs
def execute(log: logging, config: dict):
    """
    Fetch per-log-type FTP files for a time window and concatenate them locally.

    :param log: logger instance
    :param config: job config; reads 'ts' (datetime), 'ts_from'/'ts_to'
                   (window bounds) and 'params' (FTP credentials, dir/file
                   templates, list of log types)
    """
    params = config['params']
    with FTP(params['ftp_host']) as ftp:
        ts: datetime.datetime = config['ts']
        ts_from = config['ts_from']
        ts_to = config['ts_to']
        log_types = params['log_types']
        for log_type in log_types:
            log.info("connect success")
            # NOTE(review): login is re-issued on every log_type iteration;
            # looks like it was meant to run once after connecting — confirm
            # the server tolerates repeated logins before moving it out.
            ftp.login(params['ftp_user'], params['ftp_password'])
            log.info("auth success")
            # Directory templates are first filled with the log type, then
            # expanded through strftime against the job timestamp.
            in_path = ts.strftime(params['in_dir'].format(log_type))
            out_path = ts.strftime(params['out_dir'].format(log_type))
            chdir(ftp, in_path)
            files = ftp.nlst()
            # Accumulate every matching remote file into one in-memory buffer.
            bio = io.BytesIO()
            for in_file in files:
                # The first digit run in the name is presumably an HHMMSS
                # timestamp — TODO confirm against actual remote file names.
                s = re.search(r"(\d+)", in_file)
                if s:
                    cur = datetime.datetime.strptime(
                        s.group(), '%H%M%S').replace(day=ts.day,
                                                     month=ts.month,
                                                     year=ts.year)
                    # Half-open window: ts_from <= cur < ts_to.
                    if (cur < ts_to) and (cur >= ts_from):
                        ftp.retrbinary('RETR {0}'.format(in_file), bio.write)
                        # Separate files with a newline in the merged output.
                        bio.write(b'\n')
            # NOTE(review): output file name comes from 'in_file_template';
            # verify this key name is intentional (vs an out_file_template).
            file_name = ts.strftime(params['in_file_template'])
            os.makedirs(out_path, exist_ok=True)
            out_file = os.path.join(out_path, file_name)
            with open(out_file, 'wb') as f:
                f.write(bio.getvalue())
def evalimage(net: Yolact, path: str, save_path: str = None,
              logger: logging = None, detections: Detections = None,
              image_id=None):
    """
    Run single-image inference with a Yolact model; display or save the result.

    :param net: Yolact model used for inference
    :param path: input image path
    :param save_path: where to write the rendered image; when None the image
                      is shown with matplotlib instead
    :param logger: optional logger for timing info (may be None)
    :param detections: optional COCO-JSON accumulator (used with
                       args.output_coco_json)
    :param image_id: id recorded alongside detections in the COCO output
    """
    frame = torch.from_numpy(cv2.imread(path)).float()
    if args.cuda:
        frame = frame.cuda().float()
    batch = FastBaseTransform()(frame.unsqueeze(0))
    if cfg.flow.warp_mode != 'none':
        assert False, "Evaluating the image with a video-based model. If you believe this is a problem, please report a issue at GitHub, thanks."
    extras = {
        "backbone": "full",
        "interrupt": False,
        "keep_statistics": False,
        "moving_statistics": None
    }
    time_start = time.time()
    preds = net(batch, extras=extras)["pred_outs"]
    # BUG FIX: logger defaults to None, so guard before using it.
    if logger is not None:
        logger.info('Inference cost: %.3fs' % (time.time() - time_start))
    img_numpy = prep_display(preds, frame, None, None, args,
                             undo_transform=False)
    if args.output_coco_json:
        with timer.env('Postprocess'):
            _, _, h, w = batch.size()
            classes, scores, boxes, masks = \
                postprocess(preds, w, h, crop_masks=args.crop,
                            score_threshold=args.score_threshold)
        with timer.env('JSON Output'):
            boxes = boxes.cpu().numpy()
            masks = masks.view(-1, h, w).cpu().numpy()
            for i in range(masks.shape[0]):
                # Make sure that the bounding box actually makes sense and a mask was produced
                if (boxes[i, 3] - boxes[i, 1]) * (boxes[i, 2] - boxes[i, 0]) > 0:
                    detections.add_bbox(image_id, classes[i], boxes[i, :],
                                        scores[i])
                    detections.add_mask(image_id, classes[i], masks[i, :, :],
                                        scores[i])
    # Merged the two identical `save_path is None` checks into one branch:
    # BGR -> RGB conversion only applies to the matplotlib display path
    # (cv2.imwrite expects BGR).
    if save_path is None:
        img_numpy = img_numpy[:, :, (2, 1, 0)]
        plt.imshow(img_numpy)
        plt.title(path)
        plt.show()
    else:
        cv2.imwrite(save_path, img_numpy)
def __init__(self, conn: sqlite3.Connection, logs: logging, schema_file: TextIO):
    """
    Initialize the database from a schema script.

    :param conn: open SQLite connection
    :param logs: logger instance
    :param schema_file: readable text stream containing the DDL script
    """
    self.conn = conn
    self.logs = logs
    self.schema = schema_file.read()
    try:
        conn.executescript(self.schema)
        logs.info("Database initialized from schema.sql")
    except sqlite3.Error as err:
        # Include the driver error so a failed init is actually diagnosable
        # (the previous handler dropped the exception detail entirely).
        logs.error("Failed creating database from schema.sql: %s", err)
def print_send_survey_command(logger: logging, chat_id: int, condition: int,
                              survey_type: SurveyType) -> None:
    """
    Logs information about sent surveys.

    :param logger: logger instance
    :param chat_id: chat id of the user
    :param condition: condition of the user
    :param survey_type: current survey type
    :return: None
    """
    # Lazy %-style logging args: the message is only formatted if the
    # record is actually emitted (same output as the old eager `%`).
    logger.info("Send %s survey to %d with condition %d", survey_type.name,
                chat_id, condition)
def check_link(
        match_tuple: MatchTuple,
        http_session: requests.Session,
        logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]:
    """Validate one link (HTTP URL or local path) and report the outcome."""
    reason: Optional[str] = None
    # URLs get a live HTTP check; anything else is treated as a filesystem path.
    if match_tuple.link.startswith('http'):
        result_ok, reason = check_url(match_tuple, http_session)
    else:
        result_ok = check_path(match_tuple)
    mark = '✓' if result_ok else '✗'
    message = f" {mark} {match_tuple.link}"
    # Without a logger, fall back to stdout.
    emit = print if logger is None else logger.info
    emit(message)
    return match_tuple, result_ok, reason
def yml_reader(yml_filepath: str, logger: logging = None):
    """
    Read a YAML file.

    :param yml_filepath: path to the YAML file
    :param logger: optional logger; defaults to this module's logger
    :return: dict of YAML file contents, or None when the file does not exist
    """
    logger = logger if logger is not None else logging.getLogger(__name__)
    if os.path.exists(yml_filepath):
        with open(yml_filepath) as stream:
            yml = YAML(typ="safe")
            return yml.load(stream)
    # Fixed the "exisit" typo in the log message and made the missing-file
    # return explicit instead of an implicit None.
    logger.info(f"yml_filepath ({yml_filepath}) doesn't exist")
    return None
def execute(spark: SparkSession, log: logging, config: dict):
    """
    Hourly call-record ETL: load CSV, persist partitioned by hour, then
    aggregate one hour's metrics into Postgres.

    :param spark: active SparkSession
    :param log: logger instance
    :param config: job config with 'params' (ts, in_path), 'postgres'
                   connection settings and 'ts_from'/'ts_to' bounds
    """
    log.info("extract")
    params = config['params']
    ps_conf = config['postgres']
    ts: datetime.datetime = params['ts']
    in_path = ts.strftime(params['in_path'])
    ts_from = config['ts_from']
    ts_to = config['ts_to']
    df = spark.read.csv(in_path, header=True, sep=';')
    # BUG FIX: this select/withColumn chain was previously discarded (not
    # assigned back), so the written table never had the TS column.
    df = df.select(
        F.col('FROM_PHONE_NUMBER'),
        F.col('TO_PHONE_NUMBER'),
        F.to_timestamp(df['START_TIME'], 'dd/MM/yyyy HH:mm:ss').alias('START_TIME'),
        F.col('CALL_DURATION').cast('long'),
        F.col('IMEI'),
        F.col('LOCATION')
    ).withColumn("TS", F.date_format(F.date_trunc("hour", "START_TIME"),
                                     "yyyy-MM-dd-HH"))
    df.write.partitionBy("TS").mode('append').format('hive').saveAsTable('task_02')
    # NOTE(review): ts_from/ts_to are interpolated raw — confirm they arrive
    # already quoted/escaped for Spark SQL.
    df = spark.sql("select * from task_02 where TS >= {} AND TS < {}".format(
        ts_from, ts_to)).drop_duplicates()
    df.cache()
    ts = df.select("TS").rdd.map(lambda x: x[0]).first()
    # Number of call, total call duration.
    num_call = df.count()
    total_call_duration = list(
        df.select(F.sum(df['CALL_DURATION'])).first().asDict().values())[0]
    # Number of call in working hour (8am to 5pm)
    num_call_working_hour = df.filter(
        "hour(START_TIME) >= 8 AND hour(START_TIME) <= 17").count()
    # Find the IMEI which make most call.
    imei_most = df.groupBy('IMEI').count().sort(F.col("count").desc()).first().asDict()
    # Find top 2 locations which make most call.
    locations = list(
        map(lambda x: x.asDict(),
            df.groupBy('LOCATION').count().sort(F.col("count").desc()).head(2)))
    rs = (ts, num_call, total_call_duration, num_call_working_hour, imei_most,
          locations)
    with get_postgres_cli(ps_conf) as ps_cli:
        with ps_cli.cursor() as cur:
            # BUG FIX: added the missing comma between
            # EXCLUDED.num_call_working_hour and EXCLUDED.imei_most.
            sql = """
            INSERT INTO metric_hour(
                ts, num_call, total_call_duration,
                num_call_working_hour, imei_most, locations
            ) VALUES(%s, %s, %s, %s, %s, %s)
            ON CONFLICT (ts)
            DO UPDATE SET(
                num_call, total_call_duration,
                num_call_working_hour, imei_most, locations)
                = (EXCLUDED.num_call, EXCLUDED.total_call_duration,
                   EXCLUDED.num_call_working_hour, EXCLUDED.imei_most,
                   EXCLUDED.locations)
            """
            cur.execute(sql, rs)
def knobs_ranking(knob_data: dict, metric_data: dict, mode: str,
                  logger: logging) -> list:
    """
    Rank knobs by their importance for predicting the pruned metrics.

    :param knob_data: dict with 'data' matrix and 'columnlabels' knob names;
                      will be ranked by knobs_ranking
    :param metric_data: metric data already pruned by metric simplification
    :param mode: knob-identification method (like lasso, xgb, rf)
    :param logger: logger instance
    :return: consolidated list of ranked knob names
    """
    knob_matrix: list = knob_data['data']
    knob_columnlabels: list = knob_data['columnlabels']
    metric_matrix: list = metric_data['data']
    #metric_columnlabels = metric_data['columnlabels']
    encoded_knob_columnlabels = knob_columnlabels
    encoded_knob_matrix = knob_matrix
    # standardize values in each column to N(0, 1)
    #standardizer = RobustScaler()
    standardizer = StandardScaler()
    # standardizer = MinMaxScaler()
    standardized_knob_matrix = standardizer.fit_transform(encoded_knob_matrix)
    standardized_metric_matrix = standardizer.fit_transform(metric_matrix)
    # shuffle rows (note: same shuffle applied to both knob and metric matrices)
    shuffle_indices = get_shuffle_indices(standardized_knob_matrix.shape[0],
                                          seed=17)
    shuffled_knob_matrix = standardized_knob_matrix[shuffle_indices, :]
    shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]
    model = Ranking(mode)
    model.fit(shuffled_knob_matrix, shuffled_metric_matrix,
              encoded_knob_columnlabels)
    encoded_knobs = model.get_ranked_features()
    feature_imp = model.get_ranked_importance()
    # Only log importances when the ranking model actually produces them
    # (replaces the old `if ... is None: pass / else:` anti-idiom).
    if feature_imp is not None:
        logger.info('Feature importance')
        logger.info(feature_imp)
    consolidated_knobs = consolidate_columnlabels(encoded_knobs)
    return consolidated_knobs
def inject_link(html: str, href: str, page: Page, logger: logging) -> str:
    """Adding PDF View button on navigation bar(using material theme)

    :param html: rendered page HTML to modify
    :param href: URL of the PDF to link to
    :param page: page being rendered (only its title is read, for logging)
    :param logger: logger instance
    :return: HTML with the PDF button appended to the nav bar, or the
             original HTML unchanged when no nav element is found
    """

    def _pdf_icon():
        # Inline SVG of a PDF file icon, returned as a parsed fragment so it
        # can be appended as a child of the anchor tag.
        _ICON = '''
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512">
  <path d="M128,0c-17.6,0-32,14.4-32,32v448c0,17.6,14.4,32,32,32h320c17.6,0,32-14.4,32-32V128L352,0H128z" fill="#E2E5E7"/>
  <path d="m384 128h96l-128-128v96c0 17.6 14.4 32 32 32z" fill="#B0B7BD"/>
  <polygon points="480 224 384 128 480 128" fill="#CAD1D8"/>
  <path d="M416,416c0,8.8-7.2,16-16,16H48c-8.8,0-16-7.2-16-16V256c0-8.8,7.2-16,16-16h352c8.8,0,16,7.2,16,16 V416z" fill="#F15642"/>
  <g fill="#fff">
    <path d="m101.74 303.15c0-4.224 3.328-8.832 8.688-8.832h29.552c16.64 0 31.616 11.136 31.616 32.48 0 20.224-14.976 31.488-31.616 31.488h-21.36v16.896c0 5.632-3.584 8.816-8.192 8.816-4.224 0-8.688-3.184-8.688-8.816v-72.032zm16.88 7.28v31.872h21.36c8.576 0 15.36-7.568 15.36-15.504 0-8.944-6.784-16.368-15.36-16.368h-21.36z"/>
    <path d="m196.66 384c-4.224 0-8.832-2.304-8.832-7.92v-72.672c0-4.592 4.608-7.936 8.832-7.936h29.296c58.464 0 57.184 88.528 1.152 88.528h-30.448zm8.064-72.912v57.312h21.232c34.544 0 36.08-57.312 0-57.312h-21.232z"/>
    <path d="m303.87 312.11v20.336h32.624c4.608 0 9.216 4.608 9.216 9.072 0 4.224-4.608 7.68-9.216 7.68h-32.624v26.864c0 4.48-3.184 7.92-7.664 7.92-5.632 0-9.072-3.44-9.072-7.92v-72.672c0-4.592 3.456-7.936 9.072-7.936h44.912c5.632 0 8.96 3.344 8.96 7.936 0 4.096-3.328 8.704-8.96 8.704h-37.248v0.016z"/>
  </g>
  <path d="m400 432h-304v16h304c8.8 0 16-7.2 16-16v-16c0 8.8-7.2 16-16 16z" fill="#CAD1D8"/>
</svg>
'''  # noqa: E501
        return BeautifulSoup(_ICON, 'html.parser')

    logger.info('(hook on inject_link: %s)', page.title)
    soup = BeautifulSoup(html, 'html.parser')
    # Material theme < 7.x uses .md-header-nav; newer releases moved to
    # nav.md-header__inner, so try both selectors in order.
    nav = soup.find(class_='md-header-nav')
    if not nav:
        # after 7.x
        nav = soup.find('nav', class_='md-header__inner')
    if nav:
        a = soup.new_tag('a', href=href, title='PDF',
                         **{'class': 'md-header-nav__button md-icon'})
        a.append(_pdf_icon())
        nav.append(a)
        return str(soup)

    # No recognizable nav bar: leave the page untouched.
    return html
def log_section(text: str, logger: logging) -> None:
    """
    Prints a section.

    :param text: text to print
    :param logger: logger object
    """
    rule = "=============================================================="
    # Emit the text framed by two horizontal rules.
    for line in (rule, text, rule):
        logger.info(line)
def execute(spark: SparkSession, log: logging, config: dict):
    """
    Normalize phone-number activation history and write it partitioned by month.

    :param spark: active SparkSession
    :param log: logger instance
    :param config: job config; reads params.in_path and params.out_path
    """
    log.info("extract")
    in_path = config['params']['in_path']
    out_path = config['params']['out_path']
    # Open-ended activations get a sentinel deactivation date so the
    # descending sort below places them first.
    df = spark.read.csv(in_path, header=True).repartition(120, "PHONE_NUMBER").na.fill(
        {'DEACTIVATION_DATE': '9999-12-31'})
    log.info("transform")
    # Collapse each phone number's activation/deactivation pairs into a single
    # actual-active date via the project UDF, then derive a month partition key.
    df_norm = df.sort(df.DEACTIVATION_DATE.desc()).groupby(
        ['PHONE_NUMBER']
    ).agg(
        F.collect_list(df['ACTIVATION_DATE']).alias('ACTIVATION_DATE'),
        F.collect_list(df['DEACTIVATION_DATE']).alias('DEACTIVATION_DATE')
    ).withColumn(
        'ACTUAL_ACTIVE_DATE',
        udf_actual_active_date(F.col('ACTIVATION_DATE'), F.col('DEACTIVATION_DATE'))
    ).select(['PHONE_NUMBER', 'ACTUAL_ACTIVE_DATE']).withColumn(
        "TS", F.date_format(F.date_trunc("month", "ACTUAL_ACTIVE_DATE"), "yyyy-MM"))
    log.info("load")
    df_norm.write.partitionBy("TS").parquet(out_path, mode="overwrite")
    # NOTE(review): read-back appears to be a write sanity check; the result
    # is discarded — confirm it is intentional.
    spark.read.parquet(out_path)
def data_preprocessing(
        target_num: int, persistence: str, logger: logging
) -> Tuple[dict, dict, dict, dict, dict, dict, dict]:
    # NOTE: return annotation corrected — the function returns seven dicts,
    # not four (aggregated knob/IM/ops/latency data plus the three
    # target-workload dicts).
    """
    Load knob configs and internal/external metric CSVs for every workload,
    holding out workload ``target_num`` as the tuning target.

    workload{2~18} = workload datas composed of different key(workload2, workload3, ...)
    [N of configs, N of columnlabels]
    columnlabels = Internal Metric names
    rowlabels = Index for Workload data
    internal_metric_datas = {
        'workload{2~18} except target(1)'=array([[1,2,3,...], [2,3,4,...], ...[]])
        'columnlabels'=array(['IM_1', 'IM_2', ...]),
        'rowlabels'=array([1, 2, ..., 10000])}
    """
    """
    data = concat((workload2,...,workload18)) length = 10000 * N of workload
    columnlabels = same as internal_metric_datas's columnlabels
    rowlabels = same as internal_metric_datas's rowlabels
    aggregated_IM_data = {
        'data'=array([[1,2,3,...], [2,3,4,...], ...[]])
        'columnlabels'=array(['IM_1', 'IM_2', ...]),
        'rowlabels'=array([1, 2, ..., 10000])}
    """
    knobs_path: str = os.path.join(DATA_PATH, "configs")
    # if persistence == "RDB":
    #     knob_data, _ = knobs.load_knobs(knobs_path)
    # elif persistence == "AOF":
    #     _, knob_data = knobs.load_knobs(knobs_path)
    # logger.info("Finish Load Knob Data")
    internal_metric_datas = defaultdict(list)
    ops_metric_datas = {}
    latency_metric_datas = {}
    knob_datas = {}
    # len()-1 because of configs dir
    for i in range(1, len(os.listdir(DATA_PATH))):
        if target_num == i:
            # Target workload: load only its external metrics (throughput and
            # p99 latency) and its knob configs; it is excluded from training.
            ops_target_external_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f"result_{persistence.lower()}_external_{i}.csv"),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec'])
            latency_target_external_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f"result_{persistence.lower()}_external_{i}.csv"),
                knobs_path=knobs_path,
                metrics=['Totals_p99_Latency'])
            target_knob_data, _ = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )
        else:
            # Training workloads: accumulate knob, internal-metric and
            # external-metric matrices keyed by workload name.
            knob_data, internal_metric_data = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )
            ops_metric_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_external_{i}.csv'),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec'])
            latency_metric_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_external_{i}.csv'),
                knobs_path=knobs_path,
                metrics=['Totals_p99_Latency'])
            knob_datas[f'workload{i}'] = knob_data['data']
            internal_metric_datas[f'workload{i}'] = internal_metric_data[
                'data']
            ops_metric_datas[f'workload{i}'] = ops_metric_data['data']
            latency_metric_datas[f'workload{i}'] = latency_metric_data['data']
            internal_metric_datas['rowlabels'].extend(knob_data['rowlabels'])
    #for all train split
    #external_metric_datas[f'workload{target_num}'] = target_external_data['data']
    # NOTE(review): knob_data/internal_metric_data below hold whatever the
    # last non-target iteration produced; if the loop never takes the else
    # branch these names are unbound — confirm DATA_PATH always has >= 2
    # workload dirs.
    knob_datas['columnlabels'] = knob_data['columnlabels']
    internal_metric_datas['columnlabels'] = internal_metric_data[
        'columnlabels']
    ops_metric_datas['columnlabels'] = ['Totals_Ops/sec']
    latency_metric_datas['columnlabels'] = ['Totals_p99_Latency']
    logger.info("Finish Load Internal and External Metrics Data")
    aggregated_IM_data: dict = knobs.aggregate_datas(internal_metric_datas)
    aggregated_ops_data: dict = knobs.aggregate_datas(ops_metric_datas)
    aggregated_latency_data: dict = knobs.aggregate_datas(latency_metric_datas)
    aggregated_knob_data: dict = knobs.aggregate_datas(knob_datas)
    return aggregated_knob_data, aggregated_IM_data, aggregated_ops_data, aggregated_latency_data, target_knob_data, ops_target_external_data, latency_target_external_data
def log_and_print(message, logger: logging = None) -> None:
    """Echo *message* to stdout, and to the logger's info level when one is given."""
    sinks = [print]
    if logger is not None:
        # Log first, then print — same order as before.
        sinks.insert(0, logger.info)
    for emit in sinks:
        emit(message)
def main(opt: argparse, logger: logging, log_dir: str) -> Config:
    """
    Full tuning pipeline: load data, prune metrics, rank knobs, pre-train the
    model, and persist the run's Config.

    :param opt: parsed CLI options (target, persistence, rki, topk, model_mode,
                n_epochs, lr, db, cluster)
    :param logger: logger instance
    :param log_dir: directory where run logs are written (recorded in results)
    :return: Config describing this run
    """
    #Target workload loading
    logger.info(
        f"====================== {opt.persistence} mode ====================\n"
    )
    logger.info(f"Target workload name is {opt.target}")
    # NOTE(review): this unpacks 5 values from data_preprocessing — presumably
    # a different module's variant than the 7-return one elsewhere; confirm.
    knob_data, aggregated_IM_data, aggregated_EM_data, target_knob_data, target_external_data = data_preprocessing(
        opt.target, opt.persistence, logger)
    logger.info(
        "====================== Metrics_Simplification ====================\n")
    # Reduce the internal metrics to one representative per cluster.
    pruned_metrics = metric_simplification(aggregated_IM_data, logger, opt)
    logger.info(
        f"Done pruning metrics for workload {opt.persistence} (# of pruned metrics: {len(pruned_metrics)}).\n\n"
        f"Pruned metrics: {pruned_metrics}\n")
    # Slice the aggregated IM matrix down to just the pruned metric columns.
    metric_idxs = [
        i for i, metric_name in enumerate(aggregated_IM_data['columnlabels'])
        if metric_name in pruned_metrics
    ]
    ranked_metric_data = {
        'data':
        aggregated_IM_data['data'][:, metric_idxs],
        'rowlabels':
        copy.deepcopy(aggregated_IM_data['rowlabels']),
        'columnlabels':
        [aggregated_IM_data['columnlabels'][i] for i in metric_idxs]
    }
    ### KNOBS RANKING STAGE ###
    # Deep copy so ranking cannot mutate the knob data reused below.
    rank_knob_data = copy.deepcopy(knob_data)
    logger.info(
        "====================== Run_Knobs_Ranking ====================\n")
    logger.info(f"use mode = {opt.rki}")
    ranked_knobs = knobs_ranking(knob_data=rank_knob_data,
                                 metric_data=ranked_metric_data,
                                 mode=opt.rki,
                                 logger=logger)
    logger.info(
        f"Done ranking knobs for workload {opt.persistence} (# ranked knobs: {len(ranked_knobs)}).\n\n"
        f"Ranked knobs: {ranked_knobs}\n")
    # Keep only the top-k knobs for both training and target workloads.
    top_k: int = opt.topk
    top_k_knobs = utils.get_ranked_knob_data(ranked_knobs, knob_data, top_k)
    target_knobs = utils.get_ranked_knob_data(ranked_knobs, target_knob_data,
                                              top_k)
    knob_save_path = utils.make_date_dir('./save_knobs')
    logger.info(f"Knob save path : {knob_save_path}")
    logger.info(f"Choose Top {top_k} knobs : {top_k_knobs['columnlabels']}")
    np.save(os.path.join(knob_save_path, f'knobs_{top_k}.npy'),
            np.array(top_k_knobs['columnlabels']))
    model, optimizer, trainDataloader, valDataloader, testDataloader, scaler_y = prepare_for_training(
        opt, top_k_knobs, target_knobs, aggregated_EM_data,
        target_external_data)
    logger.info(
        f"====================== {opt.model_mode} Pre-training Stage ====================\n"
    )
    best_epoch, best_th_loss, best_la_loss, best_th_mae_loss, best_la_mae_loss, model_path = train(
        model, trainDataloader, valDataloader, testDataloader, optimizer,
        scaler_y, opt, logger)
    logger.info(
        f"\n\n[Best Epoch {best_epoch}] Best_th_Loss : {best_th_loss} Best_la_Loss : {best_la_loss} Best_th_MAE : {best_th_mae_loss} Best_la_MAE : {best_la_mae_loss}"
    )
    config = Config(opt.persistence, opt.db, opt.cluster, opt.rki, opt.topk,
                    opt.model_mode, opt.n_epochs, opt.lr)
    config.save_results(opt.target, best_epoch, best_th_loss, best_la_loss,
                        best_th_mae_loss, best_la_mae_loss, model_path,
                        log_dir, knob_save_path)
    return config
def processItem(c: Checker, l: logging):
    """Run a single website checker on this thread forever (never returns)."""
    l.info('started thread {}'.format(c.websiteUrl))
    # Alternate a processing pass with the checker's own back-off, endlessly.
    while True:
        c.process()
        c.wait()
def processItem(thread_id, writer: Writer, logger: logging):
    """Worker loop: announce start-up, then drain the writer forever."""
    startup_msg = 'started thread {}'.format(thread_id)
    logger.info(startup_msg)
    while True:
        writer.process()
def data_preprocessing(
        target_num: int, persistence: str,
        logger: logging) -> Tuple[dict, dict, dict, dict, dict]:
    """
    Load knob configs and internal/external metrics for workloads 1-18,
    holding out workload ``target_num`` as the tuning target.

    workload{2~18} = workload datas composed of different key(workload2, workload3, ...)
    [N of configs, N of columnlabels]
    columnlabels = Internal Metric names
    rowlabels = Index for Workload data
    """
    target_DATA_PATH = "../data/redis_data/workload{}".format(target_num)
    knobs_path: str = os.path.join(DATA_PATH, "configs")
    internal_metric_datas = defaultdict(list)
    external_metric_datas = {}
    knob_datas = {}
    for i in range(1, 19):
        if target_num == i:
            # Target workload: only its external metrics and knob configs are
            # loaded; it is excluded from the training matrices.
            target_external_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    target_DATA_PATH,
                    f"result_{persistence.lower()}_external_{i}.csv"),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec', 'Totals_p99_Latency'])
            target_knob_data, _ = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )
        else:
            knob_data, internal_metric_data = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )
            external_metric_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_external_{i}.csv'),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec', 'Totals_p99_Latency'])
            # NOTE(review): if these 'data' values are numpy arrays, `!=`
            # compares elementwise and assert on the result raises
            # "truth value is ambiguous"; the intent looks like a length
            # sanity check — confirm and rewrite accordingly.
            assert knob_data['data'] != external_metric_data['data'], (len(
                knob_data['data']), len(external_metric_data['data']))
            knob_datas[f'workload{i}'] = knob_data['data']
            internal_metric_datas[f'workload{i}'] = internal_metric_data[
                'data']
            external_metric_datas[f'workload{i}'] = external_metric_data[
                'data']
            internal_metric_datas['rowlabels'].extend(knob_data['rowlabels'])
    # Column labels come from the last non-target workload processed; the
    # loaders presumably return identical labels per workload — TODO confirm.
    knob_datas['columnlabels'] = knob_data['columnlabels']
    internal_metric_datas['columnlabels'] = internal_metric_data[
        'columnlabels']
    external_metric_datas['columnlabels'] = [
        'Totals_Ops/sec', 'Totals_p99_Latency'
    ]
    logger.info("Finish Load knob and Internal and External Metrics Data")
    aggregated_IM_data: dict = knobs.aggregate_datas(internal_metric_datas)
    aggregated_EM_data: dict = knobs.aggregate_datas(external_metric_datas)
    aggregated_knob_data: dict = knobs.aggregate_datas(knob_datas)
    return aggregated_knob_data, aggregated_IM_data, aggregated_EM_data,\
        target_knob_data, target_external_data
def metric_simplification(metric_data: dict, logger: logging,
                          args: argparse) -> list:
    """
    Prune the internal-metric matrix to one representative metric per cluster.

    :param metric_data: dict with 'data' (2-D matrix) and 'columnlabels'
    :param logger: logger instance
    :param args: parsed CLI options; ``args.cluster`` selects the method
                 ('gmm', 'k-means' or 'ms')
    :return: list of pruned (representative) metric names
    :raises ValueError: when ``args.cluster`` names an unknown method
    """
    matrix: list = metric_data['data']
    columnlabels: list = metric_data['columnlabels']
    # Remove any constant columns — pair each column with its label directly
    # (the previous zip/enumerate combination obscured this).
    nonconst_matrix = []
    nonconst_columnlabels = []
    for col, label in zip(matrix.T, columnlabels):
        if np.any(col != col[0]):
            nonconst_matrix.append(col.reshape(-1, 1))
            nonconst_columnlabels.append(label)
    assert len(nonconst_matrix) > 0, "Need more data to train the model"
    nonconst_matrix = np.hstack(nonconst_matrix)
    logger.info(
        f"Workload characterization ~ nonconst data size: {nonconst_matrix.shape}"
    )
    # Remove any duplicate columns
    unique_matrix, unique_idxs = np.unique(nonconst_matrix,
                                           axis=1,
                                           return_index=True)
    unique_columnlabels = [nonconst_columnlabels[idx] for idx in unique_idxs]
    logger.info(
        f"Workload characterization ~ final data size: {unique_matrix.shape}")
    n_rows, n_cols = unique_matrix.shape
    # Shuffle the matrix rows
    shuffle_indices = get_shuffle_indices(n_rows)
    shuffled_matrix: list = unique_matrix[shuffle_indices, :]
    #shuffled_matrix = RobustScaler().fit_transform(shuffled_matrix)
    shuffled_matrix = StandardScaler().fit_transform(shuffled_matrix)
    # shuffled_matrix = MinMaxScaler().fit_transform(shuffled_matrix)
    #FactorAnalysis
    fa_model = FactorAnalysis()
    fa_model.fit(shuffled_matrix, unique_columnlabels, n_components=5)
    # Components: metrics * factors
    components = fa_model.components_.T.copy()
    logger.info("Clustering mode : {}".format(args.cluster))
    # Clustering method : Gaussian Mixture Model(GMM)
    if args.cluster == 'gmm':
        cluster = GMMClustering(components)
        cluster.fit(components)
        pruned_metrics = cluster.get_closest_samples(unique_columnlabels)
        logger.info(f"Found optimal number of clusters: {cluster.optimK}")
    elif args.cluster == 'k-means':
        #KMeansClusters()
        kmeans_models = KMeansClusters()
        ##TODO: Check Those Options
        kmeans_models.fit(components,
                          min_cluster=1,
                          max_cluster=min(n_cols - 1, 20),
                          sample_labels=unique_columnlabels,
                          estimator_params={'n_init': 100})
        gapk = create_kselection_model("gap-statistic")
        gapk.fit(components, kmeans_models.cluster_map_)
        logger.info(
            f"Found optimal number of clusters: {gapk.optimal_num_clusters_}")
        # Get pruned metrics, cloest samples of each cluster center
        pruned_metrics = kmeans_models.cluster_map_[
            gapk.optimal_num_clusters_].get_closest_samples()
    # Clustering method : Mean Shift
    elif args.cluster == 'ms':
        ms = MeanShiftClustering(components)
        ms.fit(components)
        pruned_metrics = ms.get_closest_samples(unique_columnlabels)
        logger.info(f"Found optimal number of clusters: {len(ms.centroid)}")
    else:
        # Fail fast with a clear error instead of the NameError the old
        # fall-through produced on `return pruned_metrics`.
        raise ValueError(f"Unknown clustering method: {args.cluster}")
    return pruned_metrics
def main(opt: argparse, logger: logging, log_dir: str) -> Config:
    """
    Double-objective tuning pipeline (throughput + p99 latency): load data,
    prune metrics, rank knobs, then either pre-train DNN models or run the
    ATR random-forest + GA search.

    :param opt: parsed CLI options (target, persistence, rki, topk, atr, ...)
    :param logger: logger instance
    :param log_dir: directory where run logs live (recorded in results)
    :return: a Config for the DNN path; the ATR path may return 0 early
             (NOTE(review): inconsistent with the -> Config annotation)
    """
    # Target workload loading
    logger.info("====================== {} mode ====================\n".format(
        opt.persistence))
    logger.info("Target workload name is {}".format(opt.target))
    """
    load knob data and IM datas, EM datas.
    """
    ### data load ###
    knob_data, aggregated_IM_data, aggregated_ops_data, aggregated_latency_data, target_knob_data, ops_target_external_data, latency_target_external_data = data_preprocessing(
        opt.target, opt.persistence, logger)
    ### clustering ###
    logger.info(
        "====================== Metrics_Simplification ====================\n")
    pruned_metrics = metric_simplification(aggregated_IM_data, logger, opt)
    logger.info("Done pruning metrics for workload {} (# of pruned metrics: {}).\n\n""Pruned metrics: {}\n".format(
        opt.persistence, len(pruned_metrics), pruned_metrics))
    # Keep only the pruned-metric columns of the aggregated IM matrix.
    metric_idxs = [i for i, metric_name in enumerate(
        aggregated_IM_data['columnlabels']) if metric_name in pruned_metrics]
    ranked_metric_data = {
        'data': aggregated_IM_data['data'][:, metric_idxs],
        'rowlabels': copy.deepcopy(aggregated_IM_data['rowlabels']),
        'columnlabels': [aggregated_IM_data['columnlabels'][i] for i in metric_idxs]
    }
    """
    For example,
        pruned_metrics : ['allocator_rss_bytes', 'rss_overhead_bytes', 'used_memory_dataset', 'rdb_last_cow_size']
    """
    ### KNOBS RANKING STAGE ###
    # Deep copy so ranking cannot mutate knob_data, which is reused below.
    rank_knob_data = copy.deepcopy(knob_data)
    logger.info(
        "====================== Run_Knobs_Ranking ====================\n")
    logger.info("use mode = {}".format(opt.rki))
    ranked_knobs = knobs_ranking(knob_data=rank_knob_data,
                                 metric_data=ranked_metric_data,
                                 mode=opt.rki,
                                 logger=logger)
    logger.info("Done ranking knobs for workload {} (# ranked knobs: {}).\n\n"
                "Ranked knobs: {}\n".format(opt.persistence, len(ranked_knobs), ranked_knobs))
    # NOTE(review): annotated dict but opt.topk is used as a count — likely int.
    top_k: dict = opt.topk
    top_k_knobs = utils.get_ranked_knob_data(ranked_knobs, knob_data, top_k)
    target_knobs = utils.get_ranked_knob_data(
        ranked_knobs, target_knob_data, top_k)
    knob_save_path = utils.make_date_dir('./save_knobs')
    logger.info("Knob save path : {}".format(knob_save_path))
    logger.info("Choose Top {} knobs : {}".format(
        top_k, top_k_knobs['columnlabels']))
    np.save(os.path.join(knob_save_path, 'knobs_{}.npy'.format(top_k)),
            np.array(top_k_knobs['columnlabels']))
    # In double version: index 0 = throughput (Ops/sec), index 1 = p99 latency.
    aggregated_data = [aggregated_ops_data, aggregated_latency_data]
    target_external_data = [
        ops_target_external_data, latency_target_external_data]
    if not opt.atr:
        model, optimizer = set_model(opt)
        model_save_path = utils.make_date_dir("./model_save")
        logger.info("Model save path : {}".format(model_save_path))
        logger.info("Learning Rate : {}".format(opt.lr))
        best_epoch, best_loss, best_mae = defaultdict(
            int), defaultdict(float), defaultdict(float)
        columns = ['Totals_Ops/sec', 'Totals_p99_Latency']
        ### train dnn ###
        # One training pass per objective, same model/optimizer reused.
        for i in range(2):
            trainDataloader, valDataloader, testDataloader, scaler_y = prepareForTraining(
                opt, top_k_knobs, target_knobs, aggregated_data[i], target_external_data[i], i)
            logger.info(
                "====================== {} Pre-training Stage ====================\n".format(opt.model_mode))
            best_epoch[columns[i]], best_loss[columns[i]], best_mae[columns[i]] = train(
                model, trainDataloader, valDataloader, testDataloader, optimizer, scaler_y, opt, logger, model_save_path, i)
        for name in best_epoch.keys():
            logger.info("\n\n[{} Best Epoch {}] Best_Loss : {} Best_MAE : {}".format(
                name, best_epoch[name], best_loss[name], best_mae[name]))
        config = Config(opt.persistence, opt.db, opt.cluster, opt.rki, opt.topk,
                        opt.model_mode, opt.n_epochs, opt.lr)
        # NOTE(review): `name` here is the last key of the loop above —
        # presumably the latency column; confirm the intended pairing.
        config.save_double_results(opt.target, best_epoch['Totals_Ops/sec'], best_epoch[name],
                                   best_loss['Totals_Ops/sec'], best_loss[name],
                                   best_mae['Totals_Ops/sec'], best_mae[name],
                                   model_save_path, log_dir, knob_save_path)
        return config
    else:
        # ATR path: fit one random forest per objective, then GA search.
        models = set_rf_model()
        for i in range(2):
            X_tr, y_train = prepare_ATR_learning(
                opt, top_k_knobs, target_knobs, aggregated_data[i], target_external_data[i], i)
            models[i].fit(X_tr, y_train)
        pruned_configs, external_datas, defaults, scaler_X, scaler_ys = double_prepareForGA(
            opt, top_k_knobs['columnlabels'])
        current_solution_pools, targets = make_solution_pool(
            opt, pruned_configs, external_datas, defaults)
        fitness_function = RF_fitness
        n_configs = top_k_knobs['columnlabels'].shape[0]
        # set remain ratio
        n_pool_half = opt.n_pool//2
        # mutation ratio
        mutation = int(n_configs*0.5)
        GA_options = [n_configs, n_pool_half, mutation]
        top_k_config_path, name, connect = ATR_GA(
            opt, models, targets, top_k_knobs, current_solution_pools,
            fitness_function, GA_options, scaler_X, scaler_ys, logger)
        if connect:
            server_connection(opt, top_k_config_path, name)
        else:
            # NOTE(review): message typo "appednfsync" is preserved verbatim;
            # early return 0 does not match the declared Config return type.
            logger.info("Because appednfsync is 'always', Fin GA")
            return 0
        # Local import shadows any module-level datetime for this branch only.
        import datetime
        # save results: find the first free result_<persistence>-<date>-NN.csv.
        i = 0
        today = datetime.datetime.now()
        name = 'result_'+opt.persistence+'-'+today.strftime('%Y%m%d')+'-'+'%02d'%i+'.csv'
        while os.path.exists(os.path.join('./GA_config/', name)):
            i += 1
            name = 'result_'+opt.persistence+'-'+today.strftime('%Y%m%d')+'-'+'%02d'%i+'.csv'
        os.rename(f'./GA_config/result_{opt.persistence.lower()}_external_GA.csv', './GA_config/'+name)
        logger.info(name)
        df = pd.read_csv('./GA_config/'+name)
        logger.info(df["Totals_Ops/sec"])
        logger.info(df["Totals_p99_Latency"])
def register(dp: Dispatcher, logger: logging) -> bool:
    """Wire the bot's message handlers onto the dispatcher."""
    handler_specs = (
        dict(callback=start, commands=["start"]),
        dict(callback=result_rust, commands=["check"]),
        dict(callback=all, text="1"),
    )
    # Registration order matters to aiogram-style dispatchers; keep it fixed.
    for spec in handler_specs:
        dp.register_message_handler(**spec)
    logger.info("end reg")
    return True