Ejemplo n.º 1
0
def get_expired_certificates(acm_client: object,
                             logger: logging) -> defaultdict:
    """
    Retrieve all expired certificates from the ACM client based on a
    specified span of days.

    :param acm_client: ACM client
    :param logger: Logging object
    :return: defaultdict mapping certificate ARNs to their expiry dates
    """
    # defaultdict of all certificate ARNs and expiry dates
    certs = get_all_certificates(acm_client)
    expired_certs = defaultdict(default_value)
    for arn in certs:
        # BUG FIX: previously `expiry_date` was only assigned when the
        # entry held a real datetime, but was read unconditionally below,
        # raising UnboundLocalError for placeholder entries. Skip them.
        if certs[arn] == default_value:
            continue
        expiry_date = certs[arn]
        # Logs True if the certificate will expire within DAYS_EXP days
        logger.info(
            f'Certificate will expire in {DAYS_EXP} days:{time_interval >= expiry_date}'
        )
        if time_interval >= expiry_date:
            expired_certs[arn] = expiry_date
        else:
            logger.info(
                f'Certificate {arn} is still valid. The expiry date is {expiry_date}'
            )
    return expired_certs
Ejemplo n.º 2
0
def execute(log: logging, config: dict):
    """
    Download timestamped log files from an FTP server and concatenate the
    ones falling inside the [ts_from, ts_to) window into one local file,
    once per configured log type.

    :param log: logger object
    :param config: job configuration; expects 'params' (ftp host/user/
        password, dir and file templates, log_types) plus 'ts', 'ts_from',
        'ts_to'
    """
    params = config['params']
    with FTP(params['ftp_host']) as ftp:
        ts: datetime.datetime = config['ts']
        ts_from = config['ts_from']
        ts_to = config['ts_to']
        log_types = params['log_types']
        for log_type in log_types:
            log.info("connect success")
            # NOTE(review): login is re-issued on every loop iteration;
            # a single login before the loop would normally suffice —
            # confirm the server tolerates repeated logins before changing.
            ftp.login(params['ftp_user'], params['ftp_password'])
            log.info("auth success")
            # Expand the per-log-type directory templates with the run timestamp.
            in_path = ts.strftime(params['in_dir'].format(log_type))
            out_path = ts.strftime(params['out_dir'].format(log_type))
            chdir(ftp, in_path)
            files = ftp.nlst()
            bio = io.BytesIO()
            for in_file in files:
                # File names carry a digit run (presumably HHMMSS); keep
                # only files whose time falls inside [ts_from, ts_to).
                s = re.search(r"(\d+)", in_file)
                if s:
                    cur = datetime.datetime.strptime(
                        s.group(), '%H%M%S').replace(day=ts.day,
                                                     month=ts.month,
                                                     year=ts.year)
                    if (cur < ts_to) and (cur >= ts_from):
                        ftp.retrbinary('RETR {0}'.format(in_file), bio.write)
                        bio.write(b'\n')
            # NOTE(review): output name is built from 'in_file_template';
            # possibly 'out_file_template' was intended — verify.
            file_name = ts.strftime(params['in_file_template'])
            os.makedirs(out_path, exist_ok=True)
            out_file = os.path.join(out_path, file_name)
            with open(out_file, 'wb') as f:
                f.write(bio.getvalue())
Ejemplo n.º 3
0
def evalimage(net: Yolact,
              path: str,
              save_path: str = None,
              logger: logging = None,
              detections: Detections = None,
              image_id=None):
    """
    Run Yolact inference on a single image, optionally collect COCO JSON
    detections, and display or save the annotated result.

    :param net: Yolact network
    :param path: input image path
    :param save_path: where to write the annotated image; shown with
        matplotlib when None
    :param logger: optional logger for inference timing
    :param detections: optional Detections collector (used only when
        args.output_coco_json is set)
    :param image_id: image id used when adding detections
    """
    frame = torch.from_numpy(cv2.imread(path)).float()
    if args.cuda:
        frame = frame.cuda().float()
    batch = FastBaseTransform()(frame.unsqueeze(0))

    if cfg.flow.warp_mode != 'none':
        assert False, "Evaluating the image with a video-based model. If you believe this is a problem, please report a issue at GitHub, thanks."

    extras = {
        "backbone": "full",
        "interrupt": False,
        "keep_statistics": False,
        "moving_statistics": None
    }

    time_start = time.time()
    preds = net(batch, extras=extras)["pred_outs"]
    # BUG FIX: logger defaults to None, so guard before logging instead of
    # crashing with AttributeError when no logger is supplied.
    if logger is not None:
        logger.info('Inference cost: %.3fs' % (time.time() - time_start))

    img_numpy = prep_display(preds,
                             frame,
                             None,
                             None,
                             args,
                             undo_transform=False)

    if args.output_coco_json:
        with timer.env('Postprocess'):
            _, _, h, w = batch.size()
            classes, scores, boxes, masks = \
                postprocess(preds, w, h, crop_masks=args.crop, score_threshold=args.score_threshold)

        with timer.env('JSON Output'):
            boxes = boxes.cpu().numpy()
            masks = masks.view(-1, h, w).cpu().numpy()
            for i in range(masks.shape[0]):
                # Make sure that the bounding box actually makes sense and a mask was produced
                if (boxes[i, 3] - boxes[i, 1]) * (boxes[i, 2] -
                                                  boxes[i, 0]) > 0:
                    detections.add_bbox(image_id, classes[i], boxes[i, :],
                                        scores[i])
                    detections.add_mask(image_id, classes[i], masks[i, :, :],
                                        scores[i])

    if save_path is None:
        # BGR -> RGB for matplotlib display (the two duplicate
        # `save_path is None` checks are merged into one branch).
        img_numpy = img_numpy[:, :, (2, 1, 0)]
        plt.imshow(img_numpy)
        plt.title(path)
        plt.show()
    else:
        cv2.imwrite(save_path, img_numpy)
Ejemplo n.º 4
0
 def __init__(self, conn: sqlite3.Connection, logs: logging,
              schema_file: TextIO):
     """Initialize the database by executing the given schema script."""
     self.conn = conn
     self.logs = logs
     self.schema = schema_file.read()
     try:
         conn.executescript(self.schema)
     except sqlite3.Error:
         logs.error("Failed creating database from schema.sql")
     else:
         logs.info("Database initialized from schema.sql")
Ejemplo n.º 5
0
    def print_send_survey_command(logger: logging,
                                  chat_id: int,
                                  condition: int,
                                  survey_type: SurveyType) -> None:
        """
        Log information about a survey that was sent.

        :param logger: logger instance
        :param chat_id: chat id of the user
        :param condition: condition of the user
        :param survey_type: current survey type
        :return: None
        """
        message = "Send %s survey to %d with condition %d" % (
            survey_type.name, chat_id, condition)
        logger.info(message)
Ejemplo n.º 6
0
def check_link(
        match_tuple: MatchTuple,
        http_session: requests.Session,
        logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]:
    """
    Validate a single link: HTTP(S) links are checked over the network,
    anything else is treated as a local path.

    :param match_tuple: the link match to validate
    :param http_session: session used for URL checks
    :param logger: optional logger; falls back to print() when None
    :return: (match_tuple, ok flag, failure reason or None)
    """
    reason: Optional[str] = None
    if match_tuple.link.startswith('http'):
        result_ok, reason = check_url(match_tuple, http_session)
    else:
        result_ok = check_path(match_tuple)
    mark = '✓' if result_ok else '✗'
    report = f"  {mark} {match_tuple.link}"
    if logger is None:
        print(report)
    else:
        logger.info(report)
    return match_tuple, result_ok, reason
Ejemplo n.º 7
0
def yml_reader(yml_filepath: str, logger: logging = None):
    """
    Read a yaml file.

    :param yml_filepath: path to the yaml file
    :param logger: optional logger; defaults to this module's logger
    :return: dictionary of yaml file contents, or None when the file does
        not exist
    """
    logger = logger if logger is not None else logging.getLogger(__name__)
    if os.path.exists(yml_filepath):
        with open(yml_filepath) as stream:
            yml = YAML(typ="safe")
            yml_dict = yml.load(stream)
        return yml_dict
    # BUG FIX: corrected typo in the log message ("exisit" -> "exist")
    # and made the missing-file result an explicit None.
    logger.info(f"yml_filepath ({yml_filepath}) doesn't exist")
    return None
def execute(spark: SparkSession, log: logging, config: dict):
    """
    Extract call records from CSV, persist them to a partitioned Hive
    table, compute hourly call metrics and upsert them into Postgres.

    :param spark: active SparkSession
    :param log: logger object
    :param config: job configuration with 'params', 'postgres',
        'ts_from' and 'ts_to'
    """
    log.info("extract")
    params = config['params']
    ps_conf = config['postgres']

    ts: datetime.datetime = params['ts']
    in_path = ts.strftime(params['in_path'])
    ts_from = config['ts_from']
    ts_to = config['ts_to']
    df = spark.read.csv(in_path, header=True, sep=';')
    # BUG FIX: the select/withColumn chain was previously discarded
    # (its result was never assigned), so the raw frame without the TS
    # column was written. Assign the transformed frame back to df.
    df = df.select(
        F.col('FROM_PHONE_NUMBER'), F.col('TO_PHONE_NUMBER'),
        F.to_timestamp(df['START_TIME'], 'dd/MM/yyyy HH:mm:ss').alias('START_TIME'),
        F.col('CALL_DURATION').cast('long'), F.col('IMEI'), F.col('LOCATION')
    ).withColumn("TS", F.date_format(F.date_trunc("hour", "START_TIME"), "yyyy-MM-dd-HH"))
    df.write.partitionBy("TS").mode('append').format('hive').saveAsTable('task_02')
    # BUG FIX: TS is a string column ("yyyy-MM-dd-HH"), so the bounds must
    # be quoted string literals in the SQL text.
    df = spark.sql("select * from task_02 where TS >= '{}' AND TS < '{}'".format(ts_from, ts_to)).drop_duplicates()
    df.cache()
    ts = df.select("TS").rdd.map(lambda x: x[0]).first()
    # Number of calls, total call duration.
    num_call = df.count()
    total_call_duration = list(df.select(F.sum(df['CALL_DURATION'])).first().asDict().values())[0]

    # Number of calls in working hours (8am to 5pm).
    num_call_working_hour = df.filter("hour(START_TIME) >= 8 AND hour(START_TIME) <= 17").count()

    # Find the IMEI which makes the most calls.
    imei_most = df.groupBy('IMEI').count().sort(F.col("count").desc()).first().asDict()

    # Find the top 2 locations which make the most calls.
    locations = list(map(lambda x: x.asDict(), df.groupBy('LOCATION').count().sort(F.col("count").desc()).head(2)))

    rs = (ts, num_call, total_call_duration, num_call_working_hour, imei_most, locations)
    with get_postgres_cli(ps_conf) as ps_cli:
        with ps_cli.cursor() as cur:
            # BUG FIX: the EXCLUDED column list was missing the comma
            # between num_call_working_hour and imei_most.
            sql = """
            INSERT INTO metric_hour(
                ts, num_call, total_call_duration,
                num_call_working_hour, imei_most, locations
            ) VALUES(%s, %s, %s, %s, %s, %s)
            ON CONFLICT (ts)
            DO UPDATE SET(
                num_call, total_call_duration, num_call_working_hour, imei_most, locations) =
                (EXCLUDED.num_call, EXCLUDED.total_call_duration, EXCLUDED.num_call_working_hour,
                 EXCLUDED.imei_most, EXCLUDED.locations)
            """
            cur.execute(sql, rs)
Ejemplo n.º 9
0
def knobs_ranking(knob_data: dict, metric_data: dict, mode: str,
                  logger: logging) -> list:
    """
    Rank knobs by their importance with respect to the pruned metrics.

    :param knob_data: dict with 'data' and 'columnlabels'; will be ranked
    :param metric_data: pruned metric data (after metric simplification)
    :param mode: knob-identification method (e.g. lasso, xgb, rf)
    :param logger: logger object
    :return: consolidated list of ranked knob names
    """
    knob_matrix: list = knob_data['data']
    knob_columnlabels: list = knob_data['columnlabels']

    metric_matrix: list = metric_data['data']

    encoded_knob_columnlabels = knob_columnlabels
    encoded_knob_matrix = knob_matrix

    # standardize values in each column to N(0, 1)
    standardizer = StandardScaler()
    standardized_knob_matrix = standardizer.fit_transform(encoded_knob_matrix)
    standardized_metric_matrix = standardizer.fit_transform(metric_matrix)

    # shuffle rows (note: same shuffle applied to both knob and metric matrices)
    shuffle_indices = get_shuffle_indices(standardized_knob_matrix.shape[0],
                                          seed=17)
    shuffled_knob_matrix = standardized_knob_matrix[shuffle_indices, :]
    shuffled_metric_matrix = standardized_metric_matrix[shuffle_indices, :]

    model = Ranking(mode)
    model.fit(shuffled_knob_matrix, shuffled_metric_matrix,
              encoded_knob_columnlabels)
    encoded_knobs = model.get_ranked_features()
    feature_imp = model.get_ranked_importance()
    # Simplified: the former `if feature_imp is None: pass / else:` chain
    # inverted into a single positive check.
    if feature_imp is not None:
        logger.info('Feature importance')
        logger.info(feature_imp)

    consolidated_knobs = consolidate_columnlabels(encoded_knobs)

    return consolidated_knobs
Ejemplo n.º 10
0
def inject_link(html: str, href: str, page: Page, logger: logging) -> str:
    """Add a "PDF View" button to the navigation bar (material theme)."""
    def _pdf_icon():
        _ICON = '''
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512">
<path d="M128,0c-17.6,0-32,14.4-32,32v448c0,17.6,14.4,32,32,32h320c17.6,0,32-14.4,32-32V128L352,0H128z" fill="#E2E5E7"/>
<path d="m384 128h96l-128-128v96c0 17.6 14.4 32 32 32z" fill="#B0B7BD"/>
<polygon points="480 224 384 128 480 128" fill="#CAD1D8"/>
<path d="M416,416c0,8.8-7.2,16-16,16H48c-8.8,0-16-7.2-16-16V256c0-8.8,7.2-16,16-16h352c8.8,0,16,7.2,16,16  V416z" fill="#F15642"/>
<g fill="#fff">
<path d="m101.74 303.15c0-4.224 3.328-8.832 8.688-8.832h29.552c16.64 0 31.616 11.136 31.616 32.48 0 20.224-14.976 31.488-31.616 31.488h-21.36v16.896c0 5.632-3.584 8.816-8.192 8.816-4.224 0-8.688-3.184-8.688-8.816v-72.032zm16.88 7.28v31.872h21.36c8.576 0 15.36-7.568 15.36-15.504 0-8.944-6.784-16.368-15.36-16.368h-21.36z"/>
<path d="m196.66 384c-4.224 0-8.832-2.304-8.832-7.92v-72.672c0-4.592 4.608-7.936 8.832-7.936h29.296c58.464 0 57.184 88.528 1.152 88.528h-30.448zm8.064-72.912v57.312h21.232c34.544 0 36.08-57.312 0-57.312h-21.232z"/>
<path d="m303.87 312.11v20.336h32.624c4.608 0 9.216 4.608 9.216 9.072 0 4.224-4.608 7.68-9.216 7.68h-32.624v26.864c0 4.48-3.184 7.92-7.664 7.92-5.632 0-9.072-3.44-9.072-7.92v-72.672c0-4.592 3.456-7.936 9.072-7.936h44.912c5.632 0 8.96 3.344 8.96 7.936 0 4.096-3.328 8.704-8.96 8.704h-37.248v0.016z"/>
</g>
<path d="m400 432h-304v16h304c8.8 0 16-7.2 16-16v-16c0 8.8-7.2 16-16 16z" fill="#CAD1D8"/>
</svg>
'''  # noqa: E501
        return BeautifulSoup(_ICON, 'html.parser')

    logger.info('(hook on inject_link: %s)', page.title)
    document = BeautifulSoup(html, 'html.parser')

    # Material theme < 7.x uses this class on the header nav.
    nav = document.find(class_='md-header-nav')
    if not nav:
        # after 7.x
        nav = document.find('nav', class_='md-header__inner')
    if not nav:
        # No recognizable nav bar: leave the page untouched.
        return html

    button = document.new_tag('a',
                              href=href,
                              title='PDF',
                              **{'class': 'md-header-nav__button md-icon'})
    button.append(_pdf_icon())
    nav.append(button)
    return str(document)
Ejemplo n.º 11
0
def log_section(text: str, logger: logging) -> None:
    """
    Log a section banner: a separator line, the text, and a closing
    separator line.

    :param text: text to print
    :param logger: logger object
    """
    separator = "=============================================================="
    for line in (separator, text, separator):
        logger.info(line)
Ejemplo n.º 12
0
def execute(spark: SparkSession, log: logging, config: dict):
    """
    Normalize phone-number activation history and write it out as parquet
    partitioned by activation month.

    :param spark: active SparkSession
    :param log: logger object
    :param config: job configuration with params.in_path / params.out_path
    """
    log.info("extract")
    in_path = config['params']['in_path']
    out_path = config['params']['out_path']
    # Open-ended deactivations are filled with a far-future sentinel date.
    df = spark.read.csv(in_path, header=True).repartition(120, "PHONE_NUMBER").na.fill(
        {'DEACTIVATION_DATE': '9999-12-31'})

    log.info("transform")
    # Collect each number's activation/deactivation history (most recent
    # deactivation first) and derive the actual active date via the UDF.
    df_norm = df.sort(df.DEACTIVATION_DATE.desc()).groupby(
        ['PHONE_NUMBER']
    ).agg(
        F.collect_list(df['ACTIVATION_DATE']).alias('ACTIVATION_DATE'),
        F.collect_list(df['DEACTIVATION_DATE']).alias('DEACTIVATION_DATE')
    ).withColumn(
        'ACTUAL_ACTIVE_DATE',
        udf_actual_active_date(F.col('ACTIVATION_DATE'), F.col('DEACTIVATION_DATE'))
    ).select(['PHONE_NUMBER', 'ACTUAL_ACTIVE_DATE']).withColumn(
        "TS", F.date_format(F.date_trunc("month", "ACTUAL_ACTIVE_DATE"), "yyyy-MM"))

    log.info("load")
    df_norm.write.partitionBy("TS").parquet(out_path, mode="overwrite")
    # NOTE(review): the result of this read is discarded; presumably a
    # sanity check that the written parquet is readable — confirm intent.
    spark.read.parquet(out_path)
Ejemplo n.º 13
0
def data_preprocessing(target_num: int, persistence: str,
                       logger: logging
                       ) -> Tuple[dict, dict, dict, dict, dict, dict, dict]:
    """
    Load knob / metric data for every workload except the target one and
    aggregate them for training; the target workload's data is returned
    separately for evaluation. (Return annotation corrected: this function
    returns seven dicts, not four.)

    workload{2~18} = workload datas composed of different key(workload2, workload3, ...) [N of configs, N of columnlabels]
    columnlabels  = Internal Metric names
    rowlabels = Index for Workload data
    internal_metric_datas = {
        'workload{2~18} except target(1)'=array([[1,2,3,...], [2,3,4,...], ...[]])
        'columnlabels'=array(['IM_1', 'IM_2', ...]),
        'rowlabels'=array([1, 2, ..., 10000])}
    """
    """
    data = concat((workload2,...,workload18)) length = 10000 * N of workload
    columnlabels  = same as internal_metric_datas's columnlabels
    rowlabels = same as internal_metric_datas's rowlabels
    aggregated_IM_data = {
        'data'=array([[1,2,3,...], [2,3,4,...], ...[]])
        'columnlabels'=array(['IM_1', 'IM_2', ...]),
        'rowlabels'=array([1, 2, ..., 10000])}
    
    """

    knobs_path: str = os.path.join(DATA_PATH, "configs")
    # if persistence == "RDB":
    #     knob_data, _ = knobs.load_knobs(knobs_path)
    # elif persistence == "AOF":
    #     _, knob_data = knobs.load_knobs(knobs_path)

    # logger.info("Finish Load Knob Data")

    internal_metric_datas = defaultdict(list)
    ops_metric_datas = {}
    latency_metric_datas = {}
    knob_datas = {}

    # len()-1 because of configs dir
    for i in range(1, len(os.listdir(DATA_PATH))):
        if target_num == i:
            # Target workload: keep its knob and external metric data
            # separate for evaluation instead of aggregating it.
            ops_target_external_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f"result_{persistence.lower()}_external_{i}.csv"),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec'])
            latency_target_external_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f"result_{persistence.lower()}_external_{i}.csv"),
                knobs_path=knobs_path,
                metrics=['Totals_p99_Latency'])
            target_knob_data, _ = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )
        else:
            # Non-target workloads feed the aggregated training set.
            knob_data, internal_metric_data = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )

            ops_metric_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_external_{i}.csv'),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec'])
            latency_metric_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_external_{i}.csv'),
                knobs_path=knobs_path,
                metrics=['Totals_p99_Latency'])
            knob_datas[f'workload{i}'] = knob_data['data']
            internal_metric_datas[f'workload{i}'] = internal_metric_data[
                'data']
            ops_metric_datas[f'workload{i}'] = ops_metric_data['data']
            latency_metric_datas[f'workload{i}'] = latency_metric_data['data']
            internal_metric_datas['rowlabels'].extend(knob_data['rowlabels'])

    #for all train split
    #external_metric_datas[f'workload{target_num}'] = target_external_data['data']
    # NOTE(review): knob_data / internal_metric_data are the last loop
    # values; if the loop only ever hits the target branch these names are
    # unbound — confirm at least one non-target workload always exists.
    knob_datas['columnlabels'] = knob_data['columnlabels']
    internal_metric_datas['columnlabels'] = internal_metric_data[
        'columnlabels']
    ops_metric_datas['columnlabels'] = ['Totals_Ops/sec']
    latency_metric_datas['columnlabels'] = ['Totals_p99_Latency']
    logger.info("Finish Load Internal and External Metrics Data")

    aggregated_IM_data: dict = knobs.aggregate_datas(internal_metric_datas)
    aggregated_ops_data: dict = knobs.aggregate_datas(ops_metric_datas)
    aggregated_latency_data: dict = knobs.aggregate_datas(latency_metric_datas)
    aggregated_knob_data: dict = knobs.aggregate_datas(knob_datas)

    return aggregated_knob_data, aggregated_IM_data, aggregated_ops_data, aggregated_latency_data, target_knob_data, ops_target_external_data, latency_target_external_data
Ejemplo n.º 14
0
def log_and_print(message, logger: logging = None) -> None:
    """Emit *message* to the logger (at INFO, when one is given) and to stdout."""
    if logger is not None:
        logger.info(message)
    print(message)
Ejemplo n.º 15
0
def main(opt: argparse, logger: logging, log_dir: str) -> Config:
    """
    Run the full tuning pipeline: load data, prune metrics, rank knobs,
    pre-train the model, and persist the run's configuration and results.

    :param opt: parsed command-line options (target, persistence, rki,
        topk, model_mode, n_epochs, lr, ...)
    :param logger: logger object
    :param log_dir: directory where logs are written
    :return: Config object holding settings and saved results
    """
    # Target workload loading
    logger.info(
        f"====================== {opt.persistence} mode ====================\n"
    )

    logger.info(f"Target workload name is {opt.target}")

    knob_data, aggregated_IM_data, aggregated_EM_data, target_knob_data, target_external_data = data_preprocessing(
        opt.target, opt.persistence, logger)

    # Workload characterization: reduce the internal metrics to a pruned
    # representative subset.
    logger.info(
        "====================== Metrics_Simplification ====================\n")
    pruned_metrics = metric_simplification(aggregated_IM_data, logger, opt)
    logger.info(
        f"Done pruning metrics for workload {opt.persistence} (# of pruned metrics: {len(pruned_metrics)}).\n\n"
        f"Pruned metrics: {pruned_metrics}\n")
    # Keep only the columns of the aggregated IM data that survived pruning.
    metric_idxs = [
        i for i, metric_name in enumerate(aggregated_IM_data['columnlabels'])
        if metric_name in pruned_metrics
    ]
    ranked_metric_data = {
        'data':
        aggregated_IM_data['data'][:, metric_idxs],
        'rowlabels':
        copy.deepcopy(aggregated_IM_data['rowlabels']),
        'columnlabels':
        [aggregated_IM_data['columnlabels'][i] for i in metric_idxs]
    }

    ### KNOBS RANKING STAGE ###
    # Deep copy so ranking cannot mutate the knob data reused below.
    rank_knob_data = copy.deepcopy(knob_data)
    logger.info(
        "====================== Run_Knobs_Ranking ====================\n")
    logger.info(f"use mode = {opt.rki}")
    ranked_knobs = knobs_ranking(knob_data=rank_knob_data,
                                 metric_data=ranked_metric_data,
                                 mode=opt.rki,
                                 logger=logger)
    logger.info(
        f"Done ranking knobs for workload {opt.persistence} (# ranked knobs: {len(ranked_knobs)}).\n\n"
        f"Ranked knobs: {ranked_knobs}\n")

    # Select the top-k ranked knobs for both training and target data.
    top_k: int = opt.topk
    top_k_knobs = utils.get_ranked_knob_data(ranked_knobs, knob_data, top_k)
    target_knobs = utils.get_ranked_knob_data(ranked_knobs, target_knob_data,
                                              top_k)
    knob_save_path = utils.make_date_dir('./save_knobs')
    logger.info(f"Knob save path : {knob_save_path}")
    logger.info(f"Choose Top {top_k} knobs : {top_k_knobs['columnlabels']}")
    np.save(os.path.join(knob_save_path, f'knobs_{top_k}.npy'),
            np.array(top_k_knobs['columnlabels']))

    model, optimizer, trainDataloader, valDataloader, testDataloader, scaler_y = prepare_for_training(
        opt, top_k_knobs, target_knobs, aggregated_EM_data,
        target_external_data)

    logger.info(
        f"====================== {opt.model_mode} Pre-training Stage ====================\n"
    )

    best_epoch, best_th_loss, best_la_loss, best_th_mae_loss, best_la_mae_loss, model_path = train(
        model, trainDataloader, valDataloader, testDataloader, optimizer,
        scaler_y, opt, logger)
    logger.info(
        f"\n\n[Best Epoch {best_epoch}] Best_th_Loss : {best_th_loss} Best_la_Loss : {best_la_loss} Best_th_MAE : {best_th_mae_loss} Best_la_MAE : {best_la_mae_loss}"
    )

    # Persist the run configuration together with the training results.
    config = Config(opt.persistence, opt.db, opt.cluster, opt.rki, opt.topk,
                    opt.model_mode, opt.n_epochs, opt.lr)
    config.save_results(opt.target, best_epoch, best_th_loss, best_la_loss,
                        best_th_mae_loss, best_la_mae_loss, model_path,
                        log_dir, knob_save_path)

    return config
Ejemplo n.º 16
0
def processItem(c: Checker, l: logging):
    """
    Worker-thread loop: process the given checker forever, waiting between
    iterations. Never returns.

    :param c: checker to run (provides process(), wait(), websiteUrl)
    :param l: logger object
    """
    l.info('started thread {}'.format(c.websiteUrl))
    while True:
        c.process()
        c.wait()
Ejemplo n.º 17
0
def processItem(thread_id, writer: Writer, logger: logging):
    """
    Worker-thread loop: process the given writer forever. Never returns.

    :param thread_id: identifier used only for the start-up log line
    :param writer: writer to run (provides process())
    :param logger: logger object
    """
    logger.info('started thread {}'.format(thread_id))
    while True:
        writer.process()
Ejemplo n.º 18
0
def data_preprocessing(target_num: int, persistence: str,
                       logger: logging) -> Tuple[dict, dict, dict, dict, dict]:
    """
    Load knob and metric data for workloads 1..18, aggregating all
    non-target workloads for training and returning the target workload's
    data separately for evaluation.

    workload{2~18} = workload datas composed of different key(workload2, workload3, ...) [N of configs, N of columnlabels]
    columnlabels  = Internal Metric names
    rowlabels = Index for Workload data

    :param target_num: index of the target workload (excluded from training)
    :param persistence: persistence mode (e.g. "RDB"/"AOF"), used in file names
    :param logger: logger object
    :return: (aggregated_knob_data, aggregated_IM_data, aggregated_EM_data,
        target_knob_data, target_external_data)
    """
    target_DATA_PATH = "../data/redis_data/workload{}".format(target_num)

    knobs_path: str = os.path.join(DATA_PATH, "configs")

    internal_metric_datas = defaultdict(list)
    external_metric_datas = {}
    knob_datas = {}

    for i in range(1, 19):
        if target_num == i:
            # Target workload: keep its knob/external data out of training.
            target_external_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    target_DATA_PATH,
                    f"result_{persistence.lower()}_external_{i}.csv"),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec', 'Totals_p99_Latency'])
            target_knob_data, _ = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )
        else:
            knob_data, internal_metric_data = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_internal_{i}.csv'),
                knobs_path=knobs_path,
                persistence=persistence,
            )

            external_metric_data: dict = knobs.load_knob_metrics(
                metric_path=os.path.join(
                    DATA_PATH, f'workload{i}',
                    f'result_{persistence.lower()}_external_{i}.csv'),
                knobs_path=knobs_path,
                metrics=['Totals_Ops/sec', 'Totals_p99_Latency'])
            # NOTE(review): this assert compares the two 'data' payloads
            # with != as a sanity check that knob and external data are not
            # the same object/content — asserts vanish under -O; confirm
            # whether an explicit exception would be more appropriate.
            assert knob_data['data'] != external_metric_data['data'], (len(
                knob_data['data']), len(external_metric_data['data']))
            knob_datas[f'workload{i}'] = knob_data['data']
            internal_metric_datas[f'workload{i}'] = internal_metric_data[
                'data']
            external_metric_datas[f'workload{i}'] = external_metric_data[
                'data']
            internal_metric_datas['rowlabels'].extend(knob_data['rowlabels'])

    # Column labels are taken from the last non-target workload loaded.
    knob_datas['columnlabels'] = knob_data['columnlabels']
    internal_metric_datas['columnlabels'] = internal_metric_data[
        'columnlabels']
    external_metric_datas['columnlabels'] = [
        'Totals_Ops/sec', 'Totals_p99_Latency'
    ]
    logger.info("Finish Load knob and Internal and External Metrics Data")

    aggregated_IM_data: dict = knobs.aggregate_datas(internal_metric_datas)
    aggregated_EM_data: dict = knobs.aggregate_datas(external_metric_datas)
    aggregated_knob_data: dict = knobs.aggregate_datas(knob_datas)

    return aggregated_knob_data, aggregated_IM_data, aggregated_EM_data,\
     target_knob_data, target_external_data
Ejemplo n.º 19
0
def metric_simplification(metric_data: dict, logger: logging,
                          args: argparse) -> list:
    """
    Prune redundant internal metrics: drop constant and duplicate columns,
    run factor analysis, then cluster the factor components and keep one
    representative metric per cluster.

    :param metric_data: dict with 'data' (2-D array) and 'columnlabels'
    :param logger: logger object
    :param args: parsed options; args.cluster selects 'gmm', 'k-means' or 'ms'
    :return: list of pruned (representative) metric names
    :raises ValueError: if args.cluster is not a known clustering mode
    """
    matrix: list = metric_data['data']
    columnlabels: list = metric_data['columnlabels']

    # Remove any constant columns
    nonconst_matrix = []
    nonconst_columnlabels = []
    # Simplified: iterate columns and labels directly instead of the former
    # zip(matrix.T, enumerate(columnlabels)) with a discarded index.
    for col, label in zip(matrix.T, columnlabels):
        if np.any(col != col[0]):
            nonconst_matrix.append(col.reshape(-1, 1))
            nonconst_columnlabels.append(label)
    assert len(nonconst_matrix) > 0, "Need more data to train the model"

    nonconst_matrix = np.hstack(nonconst_matrix)
    logger.info(
        f"Workload characterization ~ nonconst data size: {nonconst_matrix.shape}"
    )

    # Remove any duplicate columns
    unique_matrix, unique_idxs = np.unique(nonconst_matrix,
                                           axis=1,
                                           return_index=True)
    unique_columnlabels = [nonconst_columnlabels[idx] for idx in unique_idxs]

    logger.info(
        f"Workload characterization ~ final data size: {unique_matrix.shape}")
    n_rows, n_cols = unique_matrix.shape

    # Shuffle the matrix rows
    shuffle_indices = get_shuffle_indices(n_rows)
    shuffled_matrix: list = unique_matrix[shuffle_indices, :]

    shuffled_matrix = StandardScaler().fit_transform(shuffled_matrix)

    # FactorAnalysis
    fa_model = FactorAnalysis()
    fa_model.fit(shuffled_matrix, unique_columnlabels, n_components=5)
    # Components: metrics * factors
    components = fa_model.components_.T.copy()

    logger.info("Clustering mode : {}".format(args.cluster))
    # Clustering method : Gaussian Mixture Model(GMM)
    if args.cluster == 'gmm':
        cluster = GMMClustering(components)
        cluster.fit(components)
        pruned_metrics = cluster.get_closest_samples(unique_columnlabels)
        logger.info(f"Found optimal number of clusters: {cluster.optimK}")
    elif args.cluster == 'k-means':
        #KMeansClusters()
        kmeans_models = KMeansClusters()
        ##TODO: Check Those Options
        kmeans_models.fit(components,
                          min_cluster=1,
                          max_cluster=min(n_cols - 1, 20),
                          sample_labels=unique_columnlabels,
                          estimator_params={'n_init': 100})
        gapk = create_kselection_model("gap-statistic")
        gapk.fit(components, kmeans_models.cluster_map_)

        logger.info(
            f"Found optimal number of clusters: {gapk.optimal_num_clusters_}")
        # Get pruned metrics, closest samples of each cluster center
        pruned_metrics = kmeans_models.cluster_map_[
            gapk.optimal_num_clusters_].get_closest_samples()

    # Clustering method : Mean Shift
    elif args.cluster == 'ms':
        ms = MeanShiftClustering(components)
        ms.fit(components)
        pruned_metrics = ms.get_closest_samples(unique_columnlabels)
        logger.info(f"Found optimal number of clusters: {len(ms.centroid)}")
    else:
        # BUG FIX: an unknown mode previously fell through and raised
        # NameError on `pruned_metrics` below; fail fast instead.
        raise ValueError(f"Unknown cluster mode: {args.cluster}")

    return pruned_metrics
Ejemplo n.º 20
0
def main(opt: argparse.Namespace, logger: logging.Logger, log_dir: str) -> Config:
    """Run the end-to-end knob-tuning pipeline for one target workload.

    Pipeline: load data -> prune internal metrics via clustering ->
    rank knobs -> select top-k knobs -> then one of two branches:

    * DNN branch (``not opt.atr``): pre-train one model per external
      metric (throughput, p99 latency) and return a populated ``Config``.
    * ATR branch (``opt.atr``): fit random-forest surrogates, run a GA
      search, optionally apply the best configuration on the server and
      log the resulting CSV. Returns 0 when no server connection is made;
      otherwise it falls through with no explicit return value.

    :param opt: parsed command-line options (target, persistence, cluster,
                rki, topk, model_mode, n_epochs, lr, atr, n_pool, ...)
    :param logger: logger used for progress reporting
    :param log_dir: log directory path, recorded alongside saved results
    :return: ``Config`` in the DNN branch; 0 or ``None`` in the ATR branch
    """
    # Target workload loading
    logger.info("====================== {} mode ====================\n".format(
        opt.persistence))
    logger.info("Target workload name is {}".format(opt.target))

    ### data load: knob data, internal metrics (IM), external metrics ###
    knob_data, aggregated_IM_data, aggregated_ops_data, aggregated_latency_data, target_knob_data, ops_target_external_data, latency_target_external_data = data_preprocessing(
        opt.target, opt.persistence, logger)

    ### clustering: keep one representative internal metric per cluster ###
    logger.info(
        "====================== Metrics_Simplification ====================\n")
    pruned_metrics = metric_simplification(aggregated_IM_data, logger, opt)
    logger.info("Done pruning metrics for workload {} (# of pruned metrics: {}).\n\n""Pruned metrics: {}\n".format(
        opt.persistence, len(pruned_metrics), pruned_metrics))
    metric_idxs = [i for i, metric_name in enumerate(
        aggregated_IM_data['columnlabels']) if metric_name in pruned_metrics]
    # Restrict the IM matrix to the columns of the pruned metrics only.
    ranked_metric_data = {
        'data': aggregated_IM_data['data'][:, metric_idxs],
        'rowlabels': copy.deepcopy(aggregated_IM_data['rowlabels']),
        'columnlabels': [aggregated_IM_data['columnlabels'][i] for i in metric_idxs]
    }
    # Example pruned_metrics: ['allocator_rss_bytes', 'rss_overhead_bytes',
    # 'used_memory_dataset', 'rdb_last_cow_size']

    ### KNOBS RANKING STAGE ###
    rank_knob_data = copy.deepcopy(knob_data)
    logger.info(
        "====================== Run_Knobs_Ranking ====================\n")
    logger.info("use mode = {}".format(opt.rki))
    ranked_knobs = knobs_ranking(knob_data=rank_knob_data,
                                 metric_data=ranked_metric_data,
                                 mode=opt.rki,
                                 logger=logger)
    logger.info("Done ranking knobs for workload {} (# ranked knobs: {}).\n\n"
                "Ranked knobs: {}\n".format(opt.persistence, len(ranked_knobs), ranked_knobs))

    # FIX: annotated `dict` in the original, but topk is used as a count.
    top_k: int = opt.topk
    top_k_knobs = utils.get_ranked_knob_data(ranked_knobs, knob_data, top_k)
    target_knobs = utils.get_ranked_knob_data(
        ranked_knobs, target_knob_data, top_k)
    knob_save_path = utils.make_date_dir('./save_knobs')
    logger.info("Knob save path : {}".format(knob_save_path))
    logger.info("Choose Top {} knobs : {}".format(
        top_k, top_k_knobs['columnlabels']))
    np.save(os.path.join(knob_save_path, 'knobs_{}.npy'.format(top_k)),
            np.array(top_k_knobs['columnlabels']))

    # Two external metrics are handled in lock-step ("double" version):
    # index 0 = throughput, index 1 = p99 latency.
    aggregated_data = [aggregated_ops_data, aggregated_latency_data]
    target_external_data = [
        ops_target_external_data, latency_target_external_data]
    if not opt.atr:
        ### DNN branch: pre-train one model per external metric ###
        model, optimizer = set_model(opt)
        model_save_path = utils.make_date_dir("./model_save")
        logger.info("Model save path : {}".format(model_save_path))
        logger.info("Learning Rate : {}".format(opt.lr))
        best_epoch, best_loss, best_mae = defaultdict(
            int), defaultdict(float), defaultdict(float)
        columns = ['Totals_Ops/sec', 'Totals_p99_Latency']

        ### train dnn ###
        for i in range(2):
            trainDataloader, valDataloader, testDataloader, scaler_y = prepareForTraining(
                opt, top_k_knobs, target_knobs, aggregated_data[i], target_external_data[i], i)
            logger.info(
                "====================== {} Pre-training Stage ====================\n".format(opt.model_mode))

            best_epoch[columns[i]], best_loss[columns[i]], best_mae[columns[i]] = train(
                model, trainDataloader, valDataloader, testDataloader, optimizer, scaler_y, opt, logger, model_save_path, i)

        for name in best_epoch.keys():
            logger.info("\n\n[{} Best Epoch {}] Best_Loss : {} Best_MAE : {}".format(
                name, best_epoch[name], best_loss[name], best_mae[name]))

        config = Config(opt.persistence, opt.db, opt.cluster, opt.rki,
                        opt.topk, opt.model_mode, opt.n_epochs, opt.lr)
        # BUG FIX: the original indexed the result dicts with the loop
        # variable `name` leaked from the loop above, which only equals the
        # latency column by dict insertion order. Use explicit keys instead.
        ops_key, lat_key = columns
        config.save_double_results(opt.target, best_epoch[ops_key], best_epoch[lat_key], best_loss[ops_key],
                                   best_loss[lat_key], best_mae[ops_key], best_mae[lat_key], model_save_path, log_dir, knob_save_path)
        return config
    else:
        ### ATR branch: random-forest surrogates + GA search ###
        models = set_rf_model()
        for i in range(2):
            X_tr, y_train = prepare_ATR_learning(
                opt, top_k_knobs, target_knobs, aggregated_data[i], target_external_data[i], i)
            models[i].fit(X_tr, y_train)

        pruned_configs, external_datas, defaults, scaler_X, scaler_ys = double_prepareForGA(
            opt, top_k_knobs['columnlabels'])
        current_solution_pools, targets = make_solution_pool(
            opt, pruned_configs, external_datas, defaults)
        fitness_function = RF_fitness

        n_configs = top_k_knobs['columnlabels'].shape[0]
        # Half the pool survives each generation; half the knobs mutate.
        n_pool_half = opt.n_pool // 2
        mutation = int(n_configs * 0.5)
        GA_options = [n_configs, n_pool_half, mutation]

        top_k_config_path, name, connect = ATR_GA(
            opt, models, targets, top_k_knobs, current_solution_pools,
            fitness_function, GA_options, scaler_X, scaler_ys, logger)

        if connect:
            server_connection(opt, top_k_config_path, name)
        else:
            # FIX: corrected typo "appednfsync" -> "appendfsync" (the Redis
            # persistence option this message refers to).
            logger.info("Because appendfsync is 'always', Fin GA")
            return 0

        # Save the GA result CSV under a unique, dated file name
        # (increment the two-digit suffix until no collision).
        import datetime
        i = 0
        today = datetime.datetime.now()
        name = 'result_' + opt.persistence + '-' + \
            today.strftime('%Y%m%d') + '-' + '%02d' % i + '.csv'
        while os.path.exists(os.path.join('./GA_config/', name)):
            i += 1
            name = 'result_' + opt.persistence + '-' + \
                today.strftime('%Y%m%d') + '-' + '%02d' % i + '.csv'
        os.rename(
            f'./GA_config/result_{opt.persistence.lower()}_external_GA.csv',
            './GA_config/' + name)
        logger.info(name)
        df = pd.read_csv('./GA_config/' + name)
        logger.info(df["Totals_Ops/sec"])
        logger.info(df["Totals_p99_Latency"])
def register(dp: Dispatcher, logger: logging) -> bool:
    """Attach the bot's message handlers to the dispatcher.

    Registers, in order: /start, /check (result_rust), and the text
    handler for "1". Logs completion and returns True.
    """
    registrations = (
        dict(callback=start, commands=["start"]),
        dict(callback=result_rust, commands=["check"]),
        dict(callback=all, text="1"),
    )
    for kwargs in registrations:
        dp.register_message_handler(**kwargs)
    logger.info("end reg")
    return True