Example 1
def batch_ip_scanner_single_threaded(start_ip_str,
                                     end_ip_str,
                                     num_worker_threads=100,
                                     ip_version='ipv4'):
    # ip_is_up() and queue_to_list() are helpers defined elsewhere in the module;
    # num_worker_threads is unused in this single-threaded variant.
    import ipaddress
    from queue import Queue
    from timeutils import Stopwatch
    sw = Stopwatch(start=True)
    IPAddress = ipaddress.IPv4Address
    if (ip_version == "ipv6"):
        IPAddress = ipaddress.IPv6Address

    def scan(ip_address):
        if ip_is_up(ip_address):
            res_queue.put((ip_address, "up"))
        else:
            res_queue.put((ip_address, "down"))

    res_queue = Queue()

    start_ip = IPAddress(start_ip_str)
    end_ip = IPAddress(end_ip_str)
    # note: range() stops before int(end_ip), so the end address itself is not scanned
    for ip_int in range(int(start_ip), int(end_ip)):
        ip_str = str(IPAddress(ip_int))
        scan(ip_str)

    print("time taken : {} seconds".format(sw.elapsed_seconds))
    return queue_to_list(res_queue)
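
Both scanner examples rely on two helpers, ip_is_up and queue_to_list, that are not shown here. A minimal sketch of what they might look like (the ping-based reachability check and its flags are assumptions, not the original implementation):

import subprocess


def ip_is_up(ip_address):
    # Hypothetical check: one ICMP ping with a 1-second timeout (Linux-style flags).
    return subprocess.call(['ping', '-c', '1', '-W', '1', ip_address],
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL) == 0


def queue_to_list(q):
    # Drain a Queue into a plain list of its (ip_address, status) items.
    items = []
    while not q.empty():
        items.append(q.get())
    return items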
Example 2
    def fit_model(self):
        def mape_eval(y_pred, dval):
            y_val = dval.get_label()
            mape_score = np.mean(np.abs(y_val - y_pred) / y_val)
            return 'mape_score', mape_score

        def f1_eval(y_pred, dval):
            y_true = dval.get_label()
            f1score = f1_score(y_true, np.round(y_pred))
            return 'f1score', f1score

        X_train, y_train, X_val, y_val = self.prep_data()

        dtrain = xgb.DMatrix(data=X_train.values, label=y_train.values)
        dval = xgb.DMatrix(data=X_val.values, label=y_val.values)

        watch_list = [(dval, 'test')]

        if self.model_name == 'clf':
            self.params['scale_pos_weight'] = load_param_json(
                get_params_dir('imb_ratio.json'))['imb_ratio']

        func_mapping = {'reg': mape_eval, 'clf': f1_eval}

        f_eval = func_mapping[self.model_name]

        old_stdout = sys.stdout

        sw = Stopwatch(start=True)

        logger.info('Training {} model...'.format(self.model_name))

        sys.stdout = open(str(get_model_dir(self.model_name + '.log')), 'w')
        sys.stdout = FlushFile(sys.stdout)

        model = xgb.train(params=self.params,
                          dtrain=dtrain,
                          num_boost_round=self.params['num_round'],
                          evals=watch_list,
                          feval=f_eval,
                          maximize=(self.model_name == 'clf'),
                          early_stopping_rounds=100,
                          verbose_eval=True)

        sys.stdout = old_stdout

        logger.info('best_ntree_limit: {}'.format(model.best_ntree_limit))

        logger.info('Elapsed time of training {} model: {}'.format(
            self.model_name, sw.elapsed.human_str()))

        y_pred, metric_1, metric_2, metric_3, metric_4 = self.validation(
            X_val, y_val, model)

        dump(model, get_model_dir(self.model_name + '-model'))

        return y_pred, metric_1, metric_2, metric_3, metric_4
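
Example 2 swaps sys.stdout for a log file wrapped in FlushFile so that xgboost's verbose evaluation output lands in the file as it is produced. FlushFile is not shown; a minimal sketch of such a wrapper (an assumption, not the original class):

class FlushFile:
    # File-like wrapper that flushes after every write, so progress lines
    # appear in the log immediately instead of being buffered.
    def __init__(self, f):
        self.f = f

    def write(self, s):
        self.f.write(s)
        self.f.flush()

    def flush(self):
        self.f.flush()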
Example 3
    def fit(self):

        # Transform data into sparse matrix
        hits_matrix, item_dict = self.transform_data()

        # Create a model from the input data
        self.model = self.get_model()

        self.app_logger.info(msg='Training the nearest neighbors model')

        sw = Stopwatch(start=True)

        self.model.fit(hits_matrix, show_progress=True)

        self.app_logger.info(msg='Elapsed time of model training: {}'.format(
            sw.elapsed.human_str()))

        return self.model.similarity, item_dict
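
The fit(hits_matrix, show_progress=True) call and the model.similarity attribute used here (and again in Example 7) match the nearest-neighbours recommenders of the implicit library; get_model is not shown, but a hypothetical version could be as simple as:

from implicit.nearest_neighbours import CosineRecommender


def get_model(self):
    # Hypothetical factory: an item-item recommender based on cosine similarity.
    # The original get_model may choose a different model or parameters.
    return CosineRecommender(K=20)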
Example 4
    def brx_data_prep(self, chunk_size=50000):

        sw = Stopwatch(start=True)

        date_dataset = pd.DataFrame()

        if self.ext:
            brx_dataset, ads_dataset, _ = BrxRet(self.start_date, self.end_date, self.ext, self.non_adj).ret()
        else:
            brx_dataset, ads_dataset, date_dataset = BrxRet(self.start_date, self.end_date, self.ext, self.non_adj).ret()

        ads_dataset = BrxPrep.ads_prep(ads_dataset)

        if self.ext:
            brx_dataset.drop(columns=['conversion_po'], inplace=True)

        user_gen = BrxPrep.user_gens(brx_dataset, chunk_size)

        brx_feats = pd.DataFrame()

        count = 0

        for Ids in user_gen:

            count += 1

            logger.info('The number of users in sublist {}: {}'.format(count, len(Ids)))

            brx_pt_subset = brx_dataset.pipe(lambda x: x[x.ID.isin(Ids)])

            brx_pt_subset = self.cat_feats(brx_pt_subset)

            # DataFrame.append was deprecated and removed in pandas 2.0; pd.concat is the replacement
            brx_feats = brx_feats.append(brx_pt_subset, sort=False)

        brx_feats = BrxPrep.post_feats(brx_feats)

        if self.ext:
            logger.info('Elapsed time of brx ETL (pt): {}'.format(sw.elapsed.human_str()))
        else:
            logger.info('Elapsed time of brx ETL (po): {}'.format(sw.elapsed.human_str()))

        return brx_feats, ads_dataset, date_dataset
Example 5
    def trx_data_prep(self):

        sw = Stopwatch(start=True)

        trx_dataset = TrxRet(self.start_date, self.end_date, self.ext).ret()

        cart_feats = self.data_cart(trx_dataset)

        trx_dataset_net = trx_dataset[trx_dataset.nNet > 0].copy()

        net_item_feats = self.data_net_item(trx_dataset_net)

        net_cart_feats = self.data_net_cart(trx_dataset_net)

        ret_dataset = trx_dataset[trx_dataset.nR > 0].copy()

        return_feats = self.data_return(ret_dataset)

        res_dataset = TrxPrep.flag_res(trx_dataset)

        dict_df = {
            'cart_feats': cart_feats,
            'net_item_feats': net_item_feats,
            'net_cart_feats': net_cart_feats,
            'return_feats': return_feats,
            'resellers_flag': res_dataset
        }

        trx_feats = TrxPrep.merge_feats(dict_df)

        trx_feats = TrxPrep.post_res(trx_feats)

        logger.info('trx shape: {}'.format(trx_feats.shape))

        if self.ext:
            logger.info('Elapsed time of trx ETL (pt): {}'.format(
                sw.elapsed.human_str()))
        else:
            logger.info('Elapsed time of trx ETL (po): {}'.format(
                sw.elapsed.human_str()))

        return trx_feats
Example 6
def batch_ip_scanner_multi_threaded(start_ip_str,
                                    end_ip_str,
                                    num_worker_threads=200,
                                    ip_version='ipv4'):
    # ip_is_up() and queue_to_list() are helpers defined elsewhere in the module.
    import ipaddress
    from queue import Queue
    from threading import Thread
    from timeutils import Stopwatch
    sw = Stopwatch(start=True)
    IPAddress = ipaddress.IPv4Address
    if (ip_version == "ipv6"):
        IPAddress = ipaddress.IPv6Address

    def scan(ip_address):
        if ip_is_up(ip_address):
            res_queue.put((ip_address, "up"))
        else:
            res_queue.put((ip_address, "down"))

    def worker():
        while True:
            item = task_queue.get()
            scan(item)
            task_queue.task_done()

    task_queue = Queue()
    res_queue = Queue()
    for i in range(num_worker_threads):
        t = Thread(target=worker)
        t.daemon = True
        t.start()

    start_ip = IPAddress(start_ip_str)
    end_ip = IPAddress(end_ip_str)
    # as in Example 1, range() excludes the end address itself
    for ip_int in range(int(start_ip), int(end_ip)):
        ip_str = str(IPAddress(ip_int))
        task_queue.put(ip_str)

    task_queue.join()
    print("time taken : {} seconds".format(sw.elapsed_seconds))
    return queue_to_list(res_queue)
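
A hypothetical call of the multi-threaded scanner over a small private range (the addresses and thread count are placeholders):

results = batch_ip_scanner_multi_threaded('192.168.0.1', '192.168.0.254',
                                          num_worker_threads=50)
for ip, status in results:
    print(ip, status)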
Example 7
    def train(
        self,
        hits_data: Optional[DataFrame] = None,  # Optional assumed imported from typing
    ) -> Model:
        self.hits_data = self._col_transform(hits_data)

        # Transform data into sparse matrix
        hits_matrix, item_dict, user_dict = self._data_mapping()

        # Create a model from the input data
        model = self._get_model()

        self.app_logger.info("Training model")

        sw = Stopwatch(start=True)

        # Train the model
        model.fit(hits_matrix, show_progress=True)

        self.app_logger.info("Elapsed time of model training: {}".format(
            sw.elapsed.human_str()))

        return RecPred(model, model.similarity, item_dict, user_dict)
Example 8
    def tuning(self):

        s3_bucket, id, secret = s3_aws_engine(name=self.aws_env)

        s3_path = ModelTune._aws_s3_path(s3_bucket)

        boto_sess = ModelTune._boto_session(id, secret)

        logger.info('Getting algorithm image URI...')

        container = get_image_uri(boto_sess.region_name,
                                  'xgboost',
                                  repo_version='0.90-1')

        logger.info('Creating sagemaker session...')

        sage_sess = sagemaker.Session(boto_sess)

        s3_input_train, s3_input_val = self.fetch_data(s3_path)

        logger.info(
            'Creating sagemaker estimator to train using the supplied {} model...'
            .format(self.model_name))

        if self.model_name == 'clf':
            train_instance_type = 'ml.m5.4xlarge'
        else:
            train_instance_type = 'ml.m5.2xlarge'

        est = Estimator(container,
                        role=self.role,
                        train_instance_count=1,
                        train_instance_type=train_instance_type,
                        output_path=s3_path + 'tuning_' + self.model_name + '/',
                        sagemaker_session=sage_sess,
                        base_job_name=self.model_name + '-tuning-job')

        logger.info('Setting hyper-parameters...')

        hyperparameter_ranges = {
            'num_round': IntegerParameter(1, 4000),
            'eta': ContinuousParameter(0, 0.5),
            'max_depth': IntegerParameter(1, 10),
            'min_child_weight': ContinuousParameter(0, 120),
            'subsample': ContinuousParameter(0.5, 1),
            'colsample_bytree': ContinuousParameter(0.5, 1),
            'gamma': ContinuousParameter(0, 5),
            'lambda': ContinuousParameter(0, 1000),
            'alpha': ContinuousParameter(0, 1000)
        }

        if self.model_name == 'clf':
            est.set_hyperparameters(
                objective='reg:logistic',
                scale_pos_weight=self._get_imb_ratio()['imb_ratio'])
            objective_metric_name = 'validation:f1'
            objective_type = 'Maximize'
        else:
            est.set_hyperparameters(objective='reg:linear')
            objective_metric_name = 'validation:rmse'
            objective_type = 'Minimize'

        if est.hyperparam_dict is None:
            raise ValueError('Hyper-parameters are missing')
        else:
            logger.info(est.hyperparam_dict)

        tuner = HyperparameterTuner(
            estimator=est,
            objective_metric_name=objective_metric_name,
            hyperparameter_ranges=hyperparameter_ranges,
            objective_type=objective_type,
            max_jobs=100,
            max_parallel_jobs=10)

        sw = Stopwatch(start=True)

        tuner.fit({'train': s3_input_train, 'validation': s3_input_val})

        self.post_tune(sage_sess, tuner)

        logger.info('Elapsed time of tuning: {}'.format(
            sw.elapsed.human_str()))
Example 9
                        type=str,
                        default='ssense-cltv-qa')

    parser.add_argument('--model',
                        action='store',
                        help="Name of model",
                        dest='model',
                        type=str,
                        default='reg')

    return parser.parse_args()


if __name__ == '__main__':

    sw = Stopwatch(start=True)

    args = get_args()

    data_ext = DataExt(last_n_weeks=args.last_n_weeks,
                       aws_env=args.aws_env,
                       calib=True)

    data_ext.extract_transform_load()

    ml_tune = ModelTune(model_name=args.model, aws_env=args.aws_env)

    ml_tune.tuning()

    logger.info('Total elapsed time: {}'.format(sw.elapsed.human_str()))
Example 10
    def prep(self, calib):

        sw = Stopwatch(start=True)

        clf_train = pd.DataFrame()
        clf_val = pd.DataFrame()
        reg_train = pd.DataFrame()
        reg_val = pd.DataFrame()

        X_pred = self.po_x()

        if not calib:
            pres_cols = load(get_data_dir('features.pkl'))
            diff_cols = np.setdiff1d(pres_cols, X_pred.columns.values)
            for col in diff_cols:
                X_pred[col] = np.nan
        else:
            # placeholder only; overwritten by the column intersection in the calib branch below
            pres_cols = np.zeros(shape=X_pred.shape[1])

        if calib:

            X = self.pt_x()

            y_label, y_value = self.po_y(margin_val=X[['marginCAD_sum_cart']],
                                         index_val=X.index)

            pres_cols = X.columns.intersection(X_pred.columns)

            logger.info('Columns Intersection: {}'.format(pres_cols.shape[0]))

            X = X[pres_cols]

            X.fillna(value=-9999, inplace=True)

            logger.info('X shape: {}'.format(X.shape))

            np.savetxt(get_data_dir('features.txt'),
                       X.columns.ravel(),
                       fmt='%s')

            DataPrep.dump_data(X.columns.ravel(), 'features.pkl')

            logger.info('Building train and test splits per each model...')

            # Build train and test sets

            indP = (y_value['LTV_52W'].values >= np.log1p(20))

            X_clf_train, X_clf_val, y_clf_train, y_clf_val = train_test_split(
                X,
                y_label,
                test_size=self.test_size,
                random_state=42,
                stratify=y_label)

            clf_train = DataPrep.concat_datasets(X_clf_train, y_clf_train)
            clf_val = DataPrep.concat_datasets(X_clf_val, y_clf_val)

            imb_ratio = float(
                np.sum(y_clf_train == 0) / np.sum(y_clf_train == 1))

            param_dict = dict()
            param_dict['imb_ratio'] = imb_ratio

            dump_param_json(param_dict, get_params_dir('imb_ratio.json'))

            logger.info(
                'The balance of positive and negative rates: {}'.format(
                    param_dict['imb_ratio']))

            logger.info('X_clf_train and X_clf_val shapes: {}, {}'.format(
                X_clf_train.shape, X_clf_val.shape))
            logger.info('y_clf_train and y_clf_val shapes: {}, {}'.format(
                y_clf_train.shape, y_clf_val.shape))

            # DataPrep.dump_data(clf_train, 'clf_train.pkl')
            # DataPrep.dump_data(clf_val, 'clf_val.pkl')

            # DataPrep.dump_data(clf_train, 'clf_train.csv', pkl_format=False)
            # DataPrep.dump_data(clf_val, 'clf_val.csv', pkl_format=False, train_dir=False)

            X_reg_train, y_reg_train = DataPrep.reg_prep(
                X, y_value, indP, X_clf_train, y_clf_train)
            X_reg_val, y_reg_val = DataPrep.reg_prep(X, y_value, indP,
                                                     X_clf_val, y_clf_val)

            reg_train = DataPrep.concat_datasets(X_reg_train, y_reg_train)
            reg_val = DataPrep.concat_datasets(X_reg_val, y_reg_val)

            logger.info('X_reg_train and X_reg_val shapes: {}, {}'.format(
                X_reg_train.shape, X_reg_val.shape))
            logger.info('y_reg_train and y_reg_val shapes: {}, {}'.format(
                y_reg_train.shape, y_reg_val.shape))

            # DataPrep.dump_data(reg_train, 'reg_train.pkl')
            # DataPrep.dump_data(reg_val, 'reg_val.pkl')

            # DataPrep.dump_data(reg_train, 'reg_train.csv', pkl_format=False)
            # DataPrep.dump_data(reg_val, 'reg_val.csv', pkl_format=False, train_dir=False)

            # self._push_to_s3(local_path=str(S3_DIR)+'/')

            DataPrep.dump_data(self.ads_pt, 'ads_pt.pkl')

        X_pred = X_pred[pres_cols]

        X_pred.fillna(value=-9999, inplace=True)

        logger.info('X_pred shape: {}'.format(X_pred.shape))

        # dump(X_pred, get_data_dir('X_pred.pkl'))

        DataPrep.dump_data(self.ads_po, 'ads_po.pkl')

        DataPrep.dump_data(self.date_po, 'date_po.pkl')

        logger.info('Elapsed time of preparing data: {}'.format(
            sw.elapsed.human_str()))

        return clf_train, clf_val, reg_train, reg_val, X_pred
Example 11
    def train(self):

        s3_bucket, id, secret = s3_aws_engine(name=self.aws_env)

        s3_path = RemoteTrain._aws_s3_path(s3_bucket)

        boto_sess = RemoteTrain._boto_session(id, secret)

        logger.info('Getting algorithm image URI...')

        container = get_image_uri(boto_sess.region_name,
                                  'xgboost',
                                  repo_version='0.90-1')

        logger.info('Creating sagemaker session...')

        sage_sess = sagemaker.Session(boto_sess)

        s3_input_train, s3_input_val = self.load_data(s3_path)

        logger.info(
            'Creating sagemaker estimator to train using the supplied {} model...'
            .format(self.model_name))

        if self.model_name == 'clf':
            train_instance_type = 'ml.m5.4xlarge'
        else:
            train_instance_type = 'ml.m5.2xlarge'

        est = Estimator(container,
                        role=self.role,
                        train_instance_count=1,
                        train_instance_type=train_instance_type,
                        output_path=s3_path + 'model_' + self.model_name + '/',
                        sagemaker_session=sage_sess,
                        base_job_name=self.model_name + '-job')

        logger.info('Setting hyper-parameters...')

        est.set_hyperparameters(**self.params)

        if self.model_name == 'clf':
            est.set_hyperparameters(
                scale_pos_weight=self._get_imb_ratio()['imb_ratio'])

        if est.hyperparam_dict is None:
            raise ValueError('Hyper-parameters are missing')
        else:
            logger.info(est.hyperparam_dict)

        sw = Stopwatch(start=True)

        est.fit({'train': s3_input_train, 'validation': s3_input_val})

        # The following method is inconsistent with newer versions of xgboost
        try:
            est.training_job_analytics.export_csv(
                get_model_dir(self.model_name + '_aws_metrics.csv'))
        except Exception:
            pass

        logger.info('Elapsed time of training: {}'.format(
            sw.elapsed.human_str()))

        job_name = est.latest_training_job.job_name

        self.dump_model(boto_sess, s3_bucket, job_name)

        self.extract_model()

        self._validation()
Example 12
    def extract_transform_load(self,
                               brx_threshold=0.01,
                               trx_threshold=0.01,
                               ext=True):

        sw = Stopwatch(start=True)

        trx_pt = pd.DataFrame()
        brx_pt = pd.DataFrame()
        ads_pt = pd.DataFrame()

        logger.info('Extracting invoice data: {} to {}...'.format(
            self.start_po, self.end_po))

        DataRet(self.start_po,
                self.end_po).invoice_ext(table_id='_invoices_po')

        if self.calib:

            logger.info('Extracting training data: {} to {}...'.format(
                self.start_pt, self.end_pt))

            DataRet(self.start_pt,
                    self.end_pt).invoice_ext(table_id='_invoices')

            trx_prep = TrxPrep(self.start_pt,
                               self.end_pt,
                               trx_threshold,
                               ext=ext)
            trx_pt = trx_prep.trx_data_prep()

            brx_prep = BrxPrep(self.start_pt,
                               self.end_pt,
                               brx_threshold,
                               ext=ext,
                               non_adj=self.non_adj)
            brx_pt, ads_pt, _ = brx_prep.brx_data_prep()

        logger.info('Extracting observation data: {} to {}...'.format(
            self.start_po, self.end_po))

        trx_prep = TrxPrep(self.start_po,
                           self.end_po,
                           trx_threshold,
                           ext=False)
        trx_po = trx_prep.trx_data_prep()

        brx_prep = BrxPrep(self.start_po,
                           self.end_po,
                           brx_threshold,
                           ext=False,
                           non_adj=self.non_adj)
        brx_po, ads_po, date_po = brx_prep.brx_data_prep()

        data_prep = DataPrep(trx_pt,
                             brx_pt,
                             ads_pt,
                             trx_po,
                             brx_po,
                             ads_po,
                             date_po,
                             test_size=0.20,
                             aws_env=self.aws_env)
        clf_train, clf_val, reg_train, reg_val, X_pred = data_prep.prep(
            self.calib)

        logger.info('Elapsed time of ETL job: {}'.format(
            sw.elapsed.human_str()))

        return clf_train, clf_val, reg_train, reg_val, X_pred
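
Every example above times its work with timeutils.Stopwatch. Based only on the calls these examples make, the usage pattern is (run_workload is a placeholder):

from timeutils import Stopwatch

sw = Stopwatch(start=True)       # start timing immediately
run_workload()                   # placeholder for the code being measured
print(sw.elapsed_seconds)        # elapsed time in seconds (Examples 1 and 6)
print(sw.elapsed.human_str())    # human-readable elapsed time (the other examples)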