def batch_ip_scanner_single_threaded(start_ip_str, end_ip_str, num_worker_threads=100, ip_version='ipv4'):
    """Scan every IP address in [start_ip_str, end_ip_str] sequentially.

    Args:
        start_ip_str: first address of the range (inclusive).
        end_ip_str: last address of the range (inclusive).
        num_worker_threads: unused here; kept only for signature parity
            with batch_ip_scanner_multi_threaded.
        ip_version: 'ipv4' (default) or 'ipv6'.

    Returns:
        List of (ip_string, 'up' | 'down') tuples, one per scanned address.
    """
    from timeutils import Stopwatch
    sw = Stopwatch(start=True)

    IPAddress = ipaddress.IPv6Address if ip_version == "ipv6" else ipaddress.IPv4Address

    res_queue = Queue()

    def scan(ip_address):
        # ip_is_up is expected to be defined elsewhere in this module.
        status = "up" if ip_is_up(ip_address) else "down"
        res_queue.put((ip_address, status))

    start_ip = IPAddress(start_ip_str)
    end_ip = IPAddress(end_ip_str)
    # +1 so the end address itself is scanned: the original range() call
    # stopped one short of end_ip (off-by-one for an inclusive range).
    for ip_int in range(int(start_ip), int(end_ip) + 1):
        scan(str(IPAddress(ip_int)))

    print("time taken : {} seconds".format(sw.elapsed_seconds))
    return queue_to_list(res_queue)
def fit_model(self):
    """Train an xgboost model ('reg' or 'clf'), validate it, and persist it.

    Returns:
        Tuple (y_pred, metric_1, metric_2, metric_3, metric_4) as produced
        by self.validation().
    """
    def mape_eval(y_pred, dval):
        # Mean absolute percentage error (lower is better) for the regressor.
        y_val = dval.get_label()
        mape_score = np.mean(np.abs(y_val - y_pred) / y_val)
        return 'mape_score', mape_score

    def f1_eval(y_pred, dval):
        # F1 on rounded predicted probabilities (higher is better) for the classifier.
        y_true = dval.get_label()
        f1score = f1_score(y_true, np.round(y_pred))
        return 'f1score', f1score

    X_train, y_train, X_val, y_val = self.prep_data()
    dtrain = xgb.DMatrix(data=X_train.values, label=y_train.values)
    dval = xgb.DMatrix(data=X_val.values, label=y_val.values)
    watch_list = [(dval, 'test')]

    if self.model_name == 'clf':
        # Counter class imbalance using the ratio computed at data-prep time.
        self.params['scale_pos_weight'] = load_param_json(
            get_params_dir('imb_ratio.json'))['imb_ratio']

    func_mapping = {'reg': mape_eval, 'clf': f1_eval}
    f_eval = func_mapping[self.model_name]

    old_stdout = sys.stdout
    sw = Stopwatch(start=True)
    logger.info('Training {} model...'.format(self.model_name))

    # Redirect stdout to a log file so xgboost's verbose output is captured.
    # try/finally guarantees stdout is restored and the handle is closed even
    # if training raises (the original leaked the file and the redirect).
    log_file = open(str(get_model_dir(self.model_name + '.log')), 'w')
    sys.stdout = FlushFile(log_file)
    try:
        model = xgb.train(params=self.params,
                          dtrain=dtrain,
                          num_boost_round=self.params['num_round'],
                          evals=watch_list,
                          feval=f_eval,
                          # F1 is maximized; MAPE is minimized.
                          maximize=(self.model_name == 'clf'),
                          early_stopping_rounds=100,
                          verbose_eval=True)
    finally:
        sys.stdout = old_stdout
        log_file.close()

    logger.info('best_ntree_limit: {}'.format(model.best_ntree_limit))
    logger.info('Elapsed time of training {} model: {}'.format(
        self.model_name, sw.elapsed.human_str()))
    y_pred, metric_1, metric_2, metric_3, metric_4 = self.validation(
        X_val, y_val, model)
    dump(model, get_model_dir(self.model_name + '-model'))
    return y_pred, metric_1, metric_2, metric_3, metric_4
def fit(self):
    """Fit the nearest-neighbors model on the interaction matrix.

    Returns:
        Tuple of (similarity matrix of the fitted model, item dictionary).
    """
    # Build the sparse hits matrix and the item lookup table.
    matrix, items = self.transform_data()
    # Instantiate a fresh model and keep it on the instance.
    self.model = self.get_model()
    self.app_logger.info(msg='Training the nearest neighbors model')
    timer = Stopwatch(start=True)
    self.model.fit(matrix, show_progress=True)
    self.app_logger.info(msg='Elapsed time of model training: {}'.format(
        timer.elapsed.human_str()))
    return self.model.similarity, items
def brx_data_prep(self, chunk_size=50000):
    """Extract and engineer browsing (brx) features in user chunks.

    Args:
        chunk_size: number of users processed per chunk (default 50000).

    Returns:
        Tuple (brx_feats, ads_dataset, date_dataset); date_dataset is an
        empty DataFrame when self.ext is truthy.
    """
    sw = Stopwatch(start=True)
    date_dataset = pd.DataFrame()
    if self.ext:
        brx_dataset, ads_dataset, _ = BrxRet(self.start_date, self.end_date,
                                             self.ext, self.non_adj).ret()
    else:
        brx_dataset, ads_dataset, date_dataset = BrxRet(
            self.start_date, self.end_date, self.ext, self.non_adj).ret()
    ads_dataset = BrxPrep.ads_prep(ads_dataset)
    if self.ext:
        # conversion_po only applies to the observation window.
        brx_dataset.drop(columns=['conversion_po'], inplace=True)
    user_gen = BrxPrep.user_gens(brx_dataset, chunk_size)
    # Collect per-chunk frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and appending inside a
    # loop is quadratic in the number of chunks.
    chunks = []
    for count, Ids in enumerate(user_gen, start=1):
        logger.info('The number of users in sublist {}: {}'.format(count, len(Ids)))
        brx_pt_subset = brx_dataset.pipe(lambda x: x[x.ID.isin(Ids)])
        chunks.append(self.cat_feats(brx_pt_subset))
    brx_feats = pd.concat(chunks, sort=False) if chunks else pd.DataFrame()
    brx_feats = BrxPrep.post_feats(brx_feats)
    if self.ext:
        logger.info('Elapsed time of brx ETL (pt): {}'.format(sw.elapsed.human_str()))
    else:
        logger.info('Elapsed time of brx ETL (po): {}'.format(sw.elapsed.human_str()))
    return brx_feats, ads_dataset, date_dataset
def trx_data_prep(self):
    """Build transactional (trx) features from raw invoice data.

    Returns:
        DataFrame of merged transactional features.
    """
    sw = Stopwatch(start=True)
    trx_dataset = TrxRet(self.start_date, self.end_date, self.ext).ret()

    # Feature groups derived from different slices of the transactions.
    cart_feats = self.data_cart(trx_dataset)

    net_sales = trx_dataset[trx_dataset.nNet > 0].copy()
    net_item_feats = self.data_net_item(net_sales)
    net_cart_feats = self.data_net_cart(net_sales)

    returns = trx_dataset[trx_dataset.nR > 0].copy()
    return_feats = self.data_return(returns)

    res_dataset = TrxPrep.flag_res(trx_dataset)

    # Merge all feature groups into one frame, then post-process resellers.
    trx_feats = TrxPrep.merge_feats({
        'cart_feats': cart_feats,
        'net_item_feats': net_item_feats,
        'net_cart_feats': net_cart_feats,
        'return_feats': return_feats,
        'resellers_flag': res_dataset,
    })
    trx_feats = TrxPrep.post_res(trx_feats)
    logger.info('trx shape: {}'.format(trx_feats.shape))
    if self.ext:
        logger.info('Elapsed time of trx ETL (pt): {}'.format(
            sw.elapsed.human_str()))
    else:
        logger.info('Elapsed time of trx ETL (po): {}'.format(
            sw.elapsed.human_str()))
    return trx_feats
def batch_ip_scanner_multi_threaded(start_ip_str, end_ip_str, num_worker_threads=200, ip_version='ipv4'):
    """Scan every IP address in [start_ip_str, end_ip_str] with worker threads.

    Args:
        start_ip_str: first address of the range (inclusive).
        end_ip_str: last address of the range (inclusive).
        num_worker_threads: size of the daemon worker-thread pool.
        ip_version: 'ipv4' (default) or 'ipv6'.

    Returns:
        List of (ip_string, 'up' | 'down') tuples, one per scanned address.
    """
    from timeutils import Stopwatch
    sw = Stopwatch(start=True)

    IPAddress = ipaddress.IPv6Address if ip_version == "ipv6" else ipaddress.IPv4Address

    task_queue = Queue()
    res_queue = Queue()

    def scan(ip_address):
        # ip_is_up is expected to be defined elsewhere in this module.
        status = "up" if ip_is_up(ip_address) else "down"
        res_queue.put((ip_address, status))

    def worker():
        # Daemon workers loop forever; process exit reaps them after join().
        while True:
            item = task_queue.get()
            scan(item)
            task_queue.task_done()

    for _ in range(num_worker_threads):
        t = Thread(target=worker)
        t.daemon = True
        t.start()

    start_ip = IPAddress(start_ip_str)
    end_ip = IPAddress(end_ip_str)
    # +1 so the end address itself is scanned: the original range() call
    # stopped one short of end_ip (off-by-one for an inclusive range).
    for ip_int in range(int(start_ip), int(end_ip) + 1):
        task_queue.put(str(IPAddress(ip_int)))

    task_queue.join()
    print("time taken : {} seconds".format(sw.elapsed_seconds))
    return queue_to_list(res_queue)
def train(
    self,
    hits_data: DataFrame = None,
) -> Model:
    """Train the recommender on the supplied hits data.

    Args:
        hits_data: raw interaction DataFrame; column-transformed and stored
            on the instance before mapping.

    Returns:
        RecPred wrapping the fitted model, its similarity matrix, and the
        item/user lookup dictionaries.
    """
    self.hits_data = self._col_transform(hits_data)
    # Sparse interaction matrix plus item and user lookup tables.
    matrix, items, users = self._data_mapping()
    fitted = self._get_model()
    self.app_logger.info("Training model")
    timer = Stopwatch(start=True)
    fitted.fit(matrix, show_progress=True)
    self.app_logger.info("Elapsed time of model training: {}".format(
        timer.elapsed.human_str()))
    return RecPred(fitted, fitted.similarity, items, users)
def tuning(self):
    """Run a SageMaker hyper-parameter tuning job for the configured model.

    Builds an xgboost Estimator pointed at this environment's S3 bucket,
    defines the search space, and launches a HyperparameterTuner, then
    hands the finished tuner to self.post_tune().
    """
    # AWS credentials and target bucket for the configured environment.
    s3_bucket, id, secret = s3_aws_engine(name=self.aws_env)
    s3_path = ModelTune._aws_s3_path(s3_bucket)
    boto_sess = ModelTune._boto_session(id, secret)
    logger.info('Getting algorithm image URI...')
    # Pinned to the managed xgboost 0.90-1 container image.
    container = get_image_uri(boto_sess.region_name, 'xgboost',
                              repo_version='0.90-1')
    logger.info('Creating sagemaker session...')
    sage_sess = sagemaker.Session(boto_sess)
    s3_input_train, s3_input_val = self.fetch_data(s3_path)
    logger.info(
        'Creating sagemaker estimator to train using the supplied {} model...'
        .format(self.model_name))
    # The classifier gets a larger instance; presumably its dataset/search
    # is heavier — TODO confirm sizing rationale.
    if self.model_name == 'clf':
        train_instance_type = 'ml.m5.4xlarge'
    else:
        train_instance_type = 'ml.m5.2xlarge'
    est = Estimator(container,
                    role=self.role,
                    train_instance_count=1,
                    train_instance_type=train_instance_type,
                    output_path=s3_path + 'tuning_' + self.model_name + '/',
                    sagemaker_session=sage_sess,
                    base_job_name=self.model_name + '-tuning-job')
    logger.info('Setting hyper-parameters...')
    # Search space passed to the tuner ('lambda'/'alpha' are xgboost's L2/L1
    # regularization terms).
    hyperparameter_ranges = {
        'num_round': IntegerParameter(1, 4000),
        'eta': ContinuousParameter(0, 0.5),
        'max_depth': IntegerParameter(1, 10),
        'min_child_weight': ContinuousParameter(0, 120),
        'subsample': ContinuousParameter(0.5, 1),
        'colsample_bytree': ContinuousParameter(0.5, 1),
        'gamma': ContinuousParameter(0, 5),
        'lambda': ContinuousParameter(0, 1000),
        'alpha': ContinuousParameter(0, 1000)
    }
    if self.model_name == 'clf':
        # Classifier: logistic objective with imbalance weighting; tuner
        # maximizes validation F1.
        est.set_hyperparameters(
            objective='reg:logistic',
            scale_pos_weight=self._get_imb_ratio()['imb_ratio'])
        objective_metric_name = 'validation:f1'
        objective_type = 'Maximize'
    else:
        # Regressor: linear objective; tuner minimizes validation RMSE.
        est.set_hyperparameters(objective='reg:linear')
        objective_metric_name = 'validation:rmse'
        objective_type = 'Minimize'
    if est.hyperparam_dict is None:
        raise ValueError('Hyper-parameters are missing')
    else:
        logger.info(est.hyperparam_dict)
    tuner = HyperparameterTuner(
        estimator=est,
        objective_metric_name=objective_metric_name,
        hyperparameter_ranges=hyperparameter_ranges,
        objective_type=objective_type,
        max_jobs=100,
        max_parallel_jobs=10)
    sw = Stopwatch(start=True)
    tuner.fit({'train': s3_input_train, 'validation': s3_input_val})
    self.post_tune(sage_sess, tuner)
    logger.info('Elapsed time of tuning: {}'.format(
        sw.elapsed.human_str()))
                        type=str, default='ssense-cltv-qa')
    # NOTE(review): this chunk begins mid-way through get_args(); the opening
    # of the function and the preceding parser.add_argument(...) call are
    # outside this view.
    parser.add_argument('--model', action='store', help="Name of model",
                        dest='model', type=str, default='reg')
    return parser.parse_args()


if __name__ == '__main__':
    # Time the full calibration + tuning pipeline end to end.
    sw = Stopwatch(start=True)
    args = get_args()
    # Run the ETL in calibration mode before launching hyper-parameter tuning.
    data_ext = DataExt(last_n_weeks=args.last_n_weeks, aws_env=args.aws_env,
                       calib=True)
    data_ext.extract_transform_load()
    ml_tune = ModelTune(model_name=args.model, aws_env=args.aws_env)
    ml_tune.tuning()
    logger.info('Total elapsed time: {}'.format(sw.elapsed.human_str()))
def prep(self, calib):
    """Prepare train/validation splits and the prediction matrix.

    Args:
        calib: when True, build classifier and regressor train/val splits
            from the training-period data and persist the feature list;
            when False, only align X_pred to the previously saved features.

    Returns:
        Tuple (clf_train, clf_val, reg_train, reg_val, X_pred); the four
        split frames are empty DataFrames when calib is False.

    NOTE(review): indentation was reconstructed from a flattened source;
    the exact extent of the `if calib:` block should be verified against
    the original file.
    """
    sw = Stopwatch(start=True)
    # Defaults so the return statement is valid in prediction-only mode.
    clf_train = pd.DataFrame()
    clf_val = pd.DataFrame()
    reg_train = pd.DataFrame()
    reg_val = pd.DataFrame()
    X_pred = self.po_x()
    if not calib:
        # Align the prediction matrix to the feature set saved at
        # calibration time; missing columns are added as NaN.
        pres_cols = load(get_data_dir('features.pkl'))
        diff_cols = np.setdiff1d(pres_cols, X_pred.columns.values)
        for col in diff_cols:
            X_pred[col] = np.nan
    else:
        # Placeholder; overwritten below with the real column intersection.
        pres_cols = np.zeros(shape=X_pred.shape[1])
    if calib:
        X = self.pt_x()
        y_label, y_value = self.po_y(margin_val=X[['marginCAD_sum_cart']],
                                     index_val=X.index)
        # Keep only features present in both training and prediction data.
        pres_cols = X.columns.intersection(X_pred.columns)
        logger.info('Columns Intersection: {}'.format(pres_cols.shape[0]))
        X = X[pres_cols]
        # Sentinel fill for missing values (xgboost-friendly).
        X.fillna(value=-9999, inplace=True)
        logger.info('X shape: {}'.format(X.shape))
        # Persist the feature list for later prediction-only runs.
        np.savetxt(get_data_dir('features.txt'), X.columns.ravel(), fmt='%s')
        DataPrep.dump_data(X.columns.ravel(), 'features.pkl')
        logger.info('Building train and test splits per each model...')
        # Positive-LTV indicator used to subset rows for the regressor.
        indP = (y_value['LTV_52W'].values >= np.log1p(20))
        X_clf_train, X_clf_val, y_clf_train, y_clf_val = train_test_split(
            X, y_label, test_size=self.test_size, random_state=42,
            stratify=y_label)
        clf_train = DataPrep.concat_datasets(X_clf_train, y_clf_train)
        clf_val = DataPrep.concat_datasets(X_clf_val, y_clf_val)
        # Negative/positive ratio, consumed as xgboost scale_pos_weight.
        imb_ratio = float(
            np.sum(y_clf_train == 0) / np.sum(y_clf_train == 1))
        param_dict = dict()
        param_dict['imb_ratio'] = imb_ratio
        dump_param_json(param_dict, get_params_dir('imb_ratio.json'))
        logger.info(
            'The balance of positive and negative rates: {}'.format(
                param_dict['imb_ratio']))
        logger.info('X_clf_train and X_clf_val shapes: {}, {}'.format(
            X_clf_train.shape, X_clf_val.shape))
        logger.info('y_clf_train and y_clf_val shapes: {}, {}'.format(
            y_clf_train.shape, y_clf_val.shape))
        # DataPrep.dump_data(clf_train, 'clf_train.pkl')
        # DataPrep.dump_data(clf_val, 'clf_val.pkl')
        # DataPrep.dump_data(clf_train, 'clf_train.csv', pkl_format=False)
        # DataPrep.dump_data(clf_val, 'clf_val.csv', pkl_format=False, train_dir=False)
        X_reg_train, y_reg_train = DataPrep.reg_prep(
            X, y_value, indP, X_clf_train, y_clf_train)
        X_reg_val, y_reg_val = DataPrep.reg_prep(X, y_value, indP,
                                                 X_clf_val, y_clf_val)
        reg_train = DataPrep.concat_datasets(X_reg_train, y_reg_train)
        reg_val = DataPrep.concat_datasets(X_reg_val, y_reg_val)
        logger.info('X_reg_train and X_reg_val shapes: {}, {}'.format(
            X_reg_train.shape, X_reg_val.shape))
        logger.info('y_reg_train and y_reg_val shapes: {}, {}'.format(
            y_reg_train.shape, y_reg_val.shape))
        # DataPrep.dump_data(reg_train, 'reg_train.pkl')
        # DataPrep.dump_data(reg_val, 'reg_val.pkl')
        # DataPrep.dump_data(reg_train, 'reg_train.csv', pkl_format=False)
        # DataPrep.dump_data(reg_val, 'reg_val.csv', pkl_format=False, train_dir=False)
        # self._push_to_s3(local_path=str(S3_DIR)+'/')
        DataPrep.dump_data(self.ads_pt, 'ads_pt.pkl')
    # Align and fill the prediction matrix regardless of mode.
    X_pred = X_pred[pres_cols]
    X_pred.fillna(value=-9999, inplace=True)
    logger.info('X_pred shape: {}'.format(X_pred.shape))
    # dump(X_pred, get_data_dir('X_pred.pkl'))
    DataPrep.dump_data(self.ads_po, 'ads_po.pkl')
    DataPrep.dump_data(self.date_po, 'date_po.pkl')
    logger.info('Elapsed time of preparing data: {}'.format(
        sw.elapsed.human_str()))
    return clf_train, clf_val, reg_train, reg_val, X_pred
def train(self):
    """Launch a SageMaker training job for the configured model, then
    download, extract, and validate the trained artifact.

    Raises:
        ValueError: if the estimator ends up with no hyper-parameters set.
    """
    # AWS credentials and target bucket for the configured environment.
    s3_bucket, id, secret = s3_aws_engine(name=self.aws_env)
    s3_path = RemoteTrain._aws_s3_path(s3_bucket)
    boto_sess = RemoteTrain._boto_session(id, secret)
    logger.info('Getting algorithm image URI...')
    container = get_image_uri(boto_sess.region_name, 'xgboost',
                              repo_version='0.90-1')
    logger.info('Creating sagemaker session...')
    sage_sess = sagemaker.Session(boto_sess)
    s3_input_train, s3_input_val = self.load_data(s3_path)
    logger.info(
        'Creating sagemaker estimator to train using the supplied {} model...'
        .format(self.model_name))
    if self.model_name == 'clf':
        train_instance_type = 'ml.m5.4xlarge'
    else:
        train_instance_type = 'ml.m5.2xlarge'
    est = Estimator(container,
                    role=self.role,
                    train_instance_count=1,
                    train_instance_type=train_instance_type,
                    output_path=s3_path + 'model_' + self.model_name + '/',
                    sagemaker_session=sage_sess,
                    base_job_name=self.model_name + '-job')
    logger.info('Setting hyper-parameters...')
    est.set_hyperparameters(**self.params)
    if self.model_name == 'clf':
        # Weight positives by the imbalance ratio computed at data prep.
        est.set_hyperparameters(
            scale_pos_weight=self._get_imb_ratio()['imb_ratio'])
    if est.hyperparam_dict is None:
        raise ValueError('Hyper-parameters are missing')
    else:
        logger.info(est.hyperparam_dict)
    sw = Stopwatch(start=True)
    est.fit({'train': s3_input_train, 'validation': s3_input_val})
    # Metrics export is best-effort: the API is inconsistent across newer
    # xgboost versions. Catch Exception (not a bare except, which would also
    # swallow KeyboardInterrupt/SystemExit) and log instead of hiding it.
    try:
        est.training_job_analytics.export_csv(
            get_model_dir(self.model_name + '_aws_metrics.csv'))
    except Exception as exc:
        logger.warning('Could not export training job metrics: {}'.format(exc))
    logger.info('Elapsed time of training: {}'.format(
        sw.elapsed.human_str()))
    job_name = est.latest_training_job.job_name
    self.dump_model(boto_sess, s3_bucket, job_name)
    self.extract_model()
    self._validation()
def extract_transform_load(self, brx_threshold=0.01, trx_threshold=0.01, ext=True):
    """Run the full ETL pipeline: extract invoices, build trx/brx features,
    and prepare model-ready splits.

    Args:
        brx_threshold: threshold forwarded to BrxPrep (default 0.01).
        trx_threshold: threshold forwarded to TrxPrep (default 0.01).
        ext: forwarded to the training-period feature preps (default True).

    Returns:
        Tuple (clf_train, clf_val, reg_train, reg_val, X_pred) from
        DataPrep.prep().
    """
    timer = Stopwatch(start=True)
    # Training-period frames stay empty unless calibration is requested.
    trx_pt = pd.DataFrame()
    brx_pt = pd.DataFrame()
    ads_pt = pd.DataFrame()

    logger.info('Extracting invoice data: {} to {}...'.format(
        self.start_po, self.end_po))
    DataRet(self.start_po, self.end_po).invoice_ext(table_id='_invoices_po')

    if self.calib:
        logger.info('Extracting training data: {} to {}...'.format(
            self.start_pt, self.end_pt))
        DataRet(self.start_pt, self.end_pt).invoice_ext(table_id='_invoices')
        trx_pt = TrxPrep(self.start_pt, self.end_pt, trx_threshold,
                         ext=ext).trx_data_prep()
        brx_pt, ads_pt, _ = BrxPrep(self.start_pt, self.end_pt, brx_threshold,
                                    ext=ext,
                                    non_adj=self.non_adj).brx_data_prep()

    logger.info('Extracting observation data: {} to {}...'.format(
        self.start_po, self.end_po))
    trx_po = TrxPrep(self.start_po, self.end_po, trx_threshold,
                     ext=False).trx_data_prep()
    brx_po, ads_po, date_po = BrxPrep(self.start_po, self.end_po,
                                      brx_threshold, ext=False,
                                      non_adj=self.non_adj).brx_data_prep()

    data_prep = DataPrep(trx_pt, brx_pt, ads_pt, trx_po, brx_po, ads_po,
                         date_po, test_size=0.20, aws_env=self.aws_env)
    clf_train, clf_val, reg_train, reg_val, X_pred = data_prep.prep(
        self.calib)

    logger.info('Elapsed time of ETL job: {}'.format(
        timer.elapsed.human_str()))
    return clf_train, clf_val, reg_train, reg_val, X_pred