def __init__(self, model_deciding=None):
    get_time()
    self.data_path = data_path if model_deciding == 'all' else features_data_path
    self.data = get_data(path=self.data_path, is_for_model=False)
    self.columns = list(self.data.columns)
    self.features = decide_feature_name(feature_path)
    self.model_deciding = model_deciding
def model_train(self):
    print("Auto Encoder is initialized!!")
    get_time()
    self.train_test_split()
    self.get_x_values(is_for_prediction=False)
    self.auto_encoder()
    if len(self.gpu_devices) != 0:
        # build the session config before creating the session so that
        # device placement logging is actually applied
        config = tf.ConfigProto(log_device_placement=True)
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            self.model_ae.fit(self.X, self.X,
                              epochs=int(self.params['epochs']),
                              batch_size=int(self.params['batch_size']),
                              validation_split=0.2, shuffle=True)
    else:
        self.model_ae.fit(self.X, self.X,
                          epochs=int(self.params['epochs']),
                          batch_size=int(self.params['batch_size']),
                          validation_split=0.2, shuffle=True)
    self.model_from_to_json(auto_encoder_model_paths['ae'], self.model_ae, is_writing=True)
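# A minimal sketch (not part of this repo) of how a trained autoencoder is
# commonly turned into an anomaly score: reconstruct X and rank rows by
# reconstruction error. The names `model_ae` and `X` mirror the attributes
# above; the threshold choice (a high quantile) is an assumption.
import numpy as np

def reconstruction_scores(model_ae, X, quantile=0.99):
    X_hat = model_ae.predict(X)                      # reconstructed inputs
    errors = np.mean(np.square(X - X_hat), axis=1)   # per-row MSE
    threshold = np.quantile(errors, quantile)        # flag the top 1% as anomalies
    return errors, errors > threshold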
def __init__(self, hyper_parameters=None, model_deciding=None,
             last_day_predictor=None, params=None):
    get_time()
    # data that was created at feature engineering
    self.data = get_data(main_data_path + params['args']['data'], True)
    self.features = list(
        decide_feature_name(main_data_path + params['args']['feature_set']).keys())
    self.params = hyper_parameters  # hyper parameters for the model: hyper_parameters.json
    self.model_params = params
    self.train, self.test = None, None
    self.X = None
    self.optimum_cluster_centroids = None
    self.centroids = None
    self.po_data = None  # possible-outlier transactions data
    self.model_dbscan = None
    self.m_s, self.eps = [], []
    self.o_min_sample = None
    self.o_epsilon = None
    self.o_devision = None
    self.last_day_predictor = last_day_predictor  # data-splitting indicator
    self.uids = None
def prediction_iso_f(self):
    print("Isolation Forest Prediction Process Initialized!")
    get_time()
    self.train_test_split()
    self.model_iso = self.model_from_to_pickle(is_writing=False)
    self.get_x_values(is_for_prediction=True)
    self.model_iso.n_jobs = -1
    self.test[self.model_params['args']['pred_field']] = self.model_iso.predict(self.X)
    print("Isolation Forest Prediction Process Done!")
def main(args):
    logger.get_time()
    if is_local_run:
        args = sample_args
    sys.stdout = logger.Logger()
    print("*" * 3, " args :", args)
    if len(args) != 0:
        if args[1] == 'feature_engineering':
            """
            run from terminal: python main.py feature_engineering all
            all: create all features which are at features.json
            Ex: 'python main.py feature_engineering c_m_ratios'
            creates only 'c_m_ratios' and adds it to the feature set.
            """
            create_feature = CreateFeatures(model_deciding=args[2])
            create_feature.compute_features()
        if args[1] == 'train_process':  # TODO: description must be updated
            """
            run from terminal: python main.py train_process 0
            0/1: 0; test data splits from date
                 1; test data is the last day of each customer
            Models: Isolation Forest and AutoEncoder, for multivariate and univariate models
            """
            train = trainModel(args=args)
            train.process()
        if args[1] == 'prediction':  # TODO: description must be updated
            """
            run from terminal: python main.py prediction 0
            0/1: 0; test data splits from date
                 1; test data is the last day of each customer
            Prediction values for each transaction are added to the raw data set.
            """
            prediction = trainModel(args=args, is_prediction=True)
            prediction.process()
        if args[1] == 'dashboard':  # TODO: description must be updated
            """
            run from terminal: python main.py dashboard 0      # 10.20.10.196:3030
            run from terminal: python main.py dashboard 0 uni  # 10.20.10.196:3031
            0/1: 0; test data splits from date
                 1; test data is the last day of each customer
            uni: creates dashboards only for univariate models. In order to run
            the multivariate dashboard, assign null.
            Dashboard for multi/uni models is created.
            """
            # TODO: get prediction data from predicted .csv file
            model = ModelTrainIsolationForest(last_day_predictor=int(args[2]))
            model.train_test_split()
            create_dahboard(model.train, get_data(pred_data_path, True))
    logger.get_time()
def learning_process_prediction_ext_iso_f(self):
    print("Extended Isolation Forest train process is initialized!!")
    get_time()
    self.get_x_values()
    self.model_e_iso_f = iso.iForest(self.X,
                                     ntrees=self.params['num_of_trees'],
                                     sample_size=self.params['sample_size'],
                                     ExtensionLevel=len(self.features) - 1)
    # anomaly score per transaction, computed from average path lengths
    self.data[self.model_params['args']['pred_field']] = \
        self.model_e_iso_f.compute_paths(X_in=self.X)
    self.train_test_split()
    print("Extended Isolation Forest Model Train Process Done!")
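# A standalone sketch of the `eif` API used above on synthetic data (assuming
# `iso` is the eif package, https://github.com/sahandha/eif). ExtensionLevel=0
# reproduces the classic axis-parallel Isolation Forest; n_features - 1, as in
# this repo, is the fully extended variant.
import numpy as np
import eif as iso

X = np.random.randn(1000, 3)                 # 3 features -> ExtensionLevel up to 2
forest = iso.iForest(X, ntrees=100, sample_size=256, ExtensionLevel=2)
scores = forest.compute_paths(X_in=X)        # higher score = more anomalous
print(scores[np.argsort(scores)[-5:]])       # five most anomalous rows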
def __init__(self, hyper_parameters=None, last_day_predictor=None, params=None):
    get_time()
    # data that was created at feature engineering
    self.data = get_data(features_data_path, True)
    # TODO: get specific features for the specific model.
    self.features = list(decide_feature_name(feature_path).keys())
    self.params = hyper_parameters  # hyper parameters for the model: hyper_parameters.json
    self.model_params = params
    self.train, self.test = None, None
    self.X = None
    self.model_e_iso_f = None
    self.last_day_predictor = last_day_predictor  # data-splitting indicator
def learning_process_iso_f(self):
    print("Isolation Forest train process is initialized!!")
    get_time()
    self.train_test_split()
    self.get_x_values(is_for_prediction=False)
    self.model_iso = IsolationForest(n_estimators=self.params['num_of_trees'],
                                     max_samples='auto',
                                     contamination=self.params['contamination'],
                                     bootstrap=False,
                                     n_jobs=-1,
                                     random_state=42,
                                     verbose=1).fit(self.X)
    self.model_from_to_pickle(True)
    print("Isolation Forest Model Train Process Done!")
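# For reference, a minimal sklearn example showing what the fitted model above
# returns: `predict` yields 1 for inliers and -1 for outliers, and
# `decision_function` gives a continuous score (lower = more anomalous).
import numpy as np
from sklearn.ensemble import IsolationForest

X = np.random.randn(500, 4)
clf = IsolationForest(n_estimators=100, contamination=0.01,
                      random_state=42).fit(X)
labels = clf.predict(X)             # array of 1 / -1
scores = clf.decision_function(X)   # continuous anomaly score
print((labels == -1).sum(), "flagged outliers")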
def insert_image_record(*args, **kwargs):
    db_connection = connect_to_database()
    db = db_connection.cursor()
    name = kwargs.get('name')
    image_location = kwargs.get('image_location')
    thumb_location = kwargs.get('thumb_location')
    date_added = get_time()
    date_taken = kwargs.get('date_taken')
    caption = kwargs.get('caption')
    width = kwargs.get('width')
    height = kwargs.get('height')
    try:
        db.execute('INSERT INTO images VALUES (?,?,?,?,?,?,?,?,?)',
                   (None, name, image_location, thumb_location, date_added,
                    date_taken, caption, width, height))
        last_row = db.lastrowid
        db_connection.commit()
        db_connection.close()
        return last_row
    except Exception as err:
        tryToCloseDB(db_connection)
        for error in err.args:
            log("Database: Unable to insert image record - " + str(error),
                "DATABASE", "MEDIUM")
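# The `?` placeholders above are sqlite3-style parameter binding; here is a
# self-contained equivalent (assuming the database behind connect_to_database
# is sqlite3, as the placeholder style and `lastrowid` usage suggest):
import sqlite3

conn = sqlite3.connect(':memory:')
cur = conn.cursor()
cur.execute('CREATE TABLE images (id INTEGER PRIMARY KEY, name TEXT)')
cur.execute('INSERT INTO images VALUES (?, ?)', (None, 'example.jpg'))
print(cur.lastrowid)   # id assigned to the inserted row
conn.commit()
conn.close()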
def compute_features(self):
    print("*" * 20, "Feature Engineering Process", "*" * 20)
    get_time()
    self.deciding_computing_features()
    self.features_data_arrange()
    for f in self.features:
        print("Feature :", f)
        if self.features[f]['args']['num_of_transaction_removing']:
            self.data = self.features[f]['args']['noisy_data_remover'](
                self.data,
                self.features[f]['args']['num_of_transaction_removing'],
                self.features[f]['args']['num_of_days_removing'])
        self.data = self.features[f]['calling'](self.data, f)
        print("data sample size :", len(self.data))
    self.assign_last_day_label()
    write_to_csv(self.data, features_data_path)
    print("*" * 20, "Feature Engineering Process Is Done", "*" * 20)
def __init__(self, hyper_parameters=None, last_day_predictor=None, params=None):
    get_time()
    self.data = get_data(features_data_path, True)
    self.features = list(decide_feature_name(feature_path).keys())
    self.params = hyper_parameters
    self.last_day_predictor = last_day_predictor
    self.model_params = params
    self.train, self.test = None, None
    self.X, self.y_pred, self.y = None, None, None
    self.input, self.fr_output = None, None
    self.model_ae, self.model_ae_l, self.model_u = None, None, None
    self.gpu_devices = [d for d in device_lib.list_local_devices()
                        if d.device_type == "GPU"] if run_gpu else []
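# The GPU probe above relies on TF1's device_lib; a runnable check in
# isolation (assumes TensorFlow 1.x, consistent with the tf.Session usage
# elsewhere in this repo):
from tensorflow.python.client import device_lib

gpu_devices = [d.name for d in device_lib.list_local_devices()
               if d.device_type == "GPU"]
print("GPUs found:", gpu_devices or "none - training falls back to CPU")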
def compute_features(self):
    get_time()
    self.features_data_arrange()
    for f in self.features:
        print("Feature :", f)
        self.check_features_existed(self.features[f]['args']['feature'],
                                    self.features[f]['args']['related_columns'])
        if self.features[f]['args']['num_of_transaction_removing']:
            self.data = self.features[f]['args']['noisy_data_remover'](
                self.data,
                self.features[f]['args']['num_of_transaction_removing'],
                self.features[f]['args']['num_of_days_removing'])
        self.data = self.features[f]['calling'](self.data, f)
        self.labeling_anormalities(f)
        print("data sample size :", len(self.data))
    self.assign_target_variable()
    self.assign_last_day_label()
    write_to_csv(self.data, features_data_path)
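# The loop above is driven by a dict resolved from features.json via
# decide_feature_name. A hypothetical, runnable illustration of the keys it
# reads (the real schema may differ); stub callables stand in for the repo's
# feature functions:
def compute_c_m_ratios(data, name):                  # stub feature function
    data[name] = 0.0
    return data

def remove_noisy_transactions(data, n_tr, n_days):   # stub noise remover
    return data

features = {
    'c_m_ratios': {
        'calling': compute_c_m_ratios,
        'args': {
            'feature': 'c_m_ratios',
            'related_columns': ['customer_id', 'merchant_id', 'amount'],
            'noisy_data_remover': remove_noisy_transactions,
            'num_of_transaction_removing': 5,        # falsy value skips the remover
            'num_of_days_removing': 30,
        },
    },
}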
def process(self):
    header = "Prediction Process" if self.is_pred else "Train Process"
    print("*" * 20, header, "*" * 20)
    get_time()
    self.define_train_args()
    for m in self.models:
        if self.models[m]['args']['py_file'] in self.files:
            if self.params['run_model'] in ('all', m):
                print("Model :", self.models[m]['name'])
                _file_path = join(dirname(__file__), self.models[m]['args']['py_file'])
                model_py = callfunc(_file_path)
                # pick the model class out of the imported module by name
                model = [o[1] for o in getmembers(model_py)
                         if o[0] == self.models[m]['args']['calling']][0]
                model = model(hyper_parameters=self.hyper_parameters[m],
                              last_day_predictor=self.params['is_training_with_c_of_last_transactions'],
                              params=self.models[m])
                _module = (self.models[m]['args']['prediction']
                           if self.is_pred else self.models[m]['args']['train'])
                model_process = [o[1] for o in getmembers(model) if o[0] == _module][0]
                model_process()
                if self.is_pred:
                    # on the prediction env., concat outputs into prediction_data
                    self.get_pred_concat(m, model.test)
        else:
            print("Please add a .py file for model :", m)
    if self.is_pred:
        # export merged prediction data
        self.pred_data.to_csv(pred_data_path, index=False)
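# `callfunc` loads a module from a file path and getmembers picks out the
# class by name. A self-contained version of that pattern using only the
# standard library (importlib + inspect), assuming callfunc does the same:
import importlib.util
from inspect import getmembers, isclass

def load_class(py_path, class_name):
    spec = importlib.util.spec_from_file_location("model_module", py_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return [obj for name, obj in getmembers(module, isclass)
            if name == class_name][0]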
import sys

from feature_engineering import CreateFeatures
from configs import is_local_run, sample_args, pred_data_path
from model_train_iso_f import ModelTrainIsolationForest
from dashboard import create_dahboard
import logger
from data_access import model_from_to_json, get_data
from model_processor import trainModel

if __name__ == "__main__":
    logger.get_time()
    if is_local_run:
        sys.argv = sample_args
    sys.stdout = logger.Logger()
    print("*" * 3, " args :", sys.argv)
    if len(sys.argv) != 0:
        if sys.argv[1] == 'feature_engineering':
            """
            run from terminal: python main.py feature_engineering all
            all: create all features which are at features.json
            Ex: 'python main.py feature_engineering c_m_ratios'
            creates only 'c_m_ratios' and adds it to the feature set.
            """
            create_feature = CreateFeatures(model_deciding=sys.argv[2])
            create_feature.compute_features()
        if sys.argv[1] == 'train_process':
            """
            run from terminal: python main.py train_process 0
            0/1: 0; test data splits from date
                 1; test data is the last day of each customer
def create_test_data():
    db_connection = connect_to_database()
    db = db_connection.cursor()
    isave = locations.image_save_location()
    tsave = locations.thumbnail_save_location()
    # Build the data to insert
    imageData = [
        (None, 'Callie Hanging out', os.path.join(isave, '1.jpg'),
         os.path.join(tsave, '1.jpg'), get_time(), get_time(), "Callie hanging out"),
        (None, 'Callie Christmas', os.path.join(isave, '2.jpg'),
         os.path.join(tsave, '2.jpg'), get_time(), get_time(), ""),
        (None, 'Boop', os.path.join(isave, '3.jpg'),
         os.path.join(tsave, '3.jpg'), get_time(), get_time(), ""),
        (None, 'Squeak', os.path.join(isave, '4.jpg'),
         os.path.join(tsave, '4.jpg'), get_time(), get_time(), ""),
    ]
    tagData = [
        (None, 0, "Kids"), (None, 0, "Josh"), (None, 0, "Linz"),
        (None, 0, "Family"), (None, 0, "Holidays"), (None, 0, "Friends")
    ]
    subTagData = [
        (None, 0, "Kids", "Callie"),
        (None, 0, "Josh", "Portraits"), (None, 0, "Josh", "With Callie"),
        (None, 0, "Josh", "Photography"), (None, 0, "Josh", "Birthdays"),
        (None, 0, "Linz", "Portraits"), (None, 0, "Linz", "With Callie"),
        (None, 0, "Linz", "Photography"), (None, 0, "Linz", "Birthdays"),
        (None, 0, "Family", "Johnson"), (None, 0, "Family", "Zamudio"),
        (None, 0, "Family", "Brownell"), (None, 0, "Family", "Murello"),
        (None, 0, "Family", "Williams"), (None, 0, "Family", "Puppies"),
        (None, 0, "Holidays", "Christmas"), (None, 0, "Holidays", "New Years"),
        (None, 0, "Holidays", "Easter"), (None, 0, "Holidays", "Valentines Day"),
        (None, 0, "Holidays", "Dragon Day"), (None, 0, "Holidays", "Thanksgiving"),
        (None, 0, "Holidays", "4th of July"), (None, 0, "Holidays", "Halloween"),
        (None, 0, "Friends", "Childhood"), (None, 0, "Friends", "Adult"),
    ]
    eventTagData = [
        (None, 0, "Kids", "Callie", "Growing Girl"),
        (None, 0, "Kids", "Callie", "Birthdays"),
        (None, 0, "Kids", "Callie", "Silly"),
        (None, 0, "Kids", "Callie", "Portraits"),
        (None, 0, "Family", "Zamudio", "Brandon and Dakota"),
        (None, 0, "Family", "Zamudio", "Roger"),
        (None, 0, "Family", "Zamudio", "Richie and Angie"),
        (None, 0, "Family", "Johnson", "I I I"),
        (None, 0, "Family", "Johnson", "Larry and Annette"),
        (None, 0, "Family", "Johnson", "Misty"),
        (None, 0, "Family", "Johnson", "Anthony and Latonya"),
        (None, 0, "Family", "Williams", "Teresa"),
        (None, 0, "Family", "Brownell", "Christine"),
        (None, 0, "Holidays", "Halloween", "2013"),
        (None, 0, "Holidays", "Halloween", "2014"),
        (None, 0, "Holidays", "Christmas", "2013"),
        (None, 0, "Holidays", "Christmas", "2014"),
        (None, 0, "Holidays", "New Years", "2013"),
        (None, 0, "Holidays", "New Years", "2014"),
        (None, 0, "Holidays", "Easter", "2013"),
        (None, 0, "Holidays", "Easter", "2014"),
        (None, 0, "Holidays", "Valentines Day", "2013"),
        (None, 0, "Holidays", "Valentines Day", "2014"),
        (None, 0, "Holidays", "Dragon Day", "2013"),
        (None, 0, "Holidays", "Dragon Day", "2014"),
        (None, 0, "Holidays", "Thanksgiving", "2013"),
        (None, 0, "Holidays", "Thanksgiving", "2014"),
        (None, 0, "Holidays", "4th of July", "2013"),
        (None, 0, "Holidays", "4th of July", "2014")
    ]
    alertData = [
        (None, "Outage Coming up on the 4th", "active", time.time(), time.time() + 2),
        (None, "Callies Birthday Coming up!!", "active", time.time(), time.time() + 0.5)
    ]
    # db.executemany('INSERT INTO images VALUES (?,?,?,?,?,?,?)', imageData)
    db.executemany('INSERT INTO tags VALUES (?,?,?)', tagData)
    db.executemany('INSERT INTO sub_tags VALUES (?,?,?,?)', subTagData)
    db.executemany('INSERT INTO event_tags VALUES (?,?,?,?,?)', eventTagData)
    # db.executemany('INSERT INTO alerts VALUES (?,?,?,?,?)', alertData)
    try:
        db_connection.commit()
        db_connection.close()
        log("DataBase: Inserted test data into db", "DATABASE", "INFO")
    except Exception as err:
        tryToCloseDB(db_connection)
        for error in err.args:
            log("DataBase: Unable to insert test data - " + str(error),
                "DATABASE", "INFO")
def learning_process_dbscan(self):
    print("DBSCAN train process is initialized!!")
    get_time()
    print("KMeans Finding Best Centroids process is started!!")
    self.find_optimum_centroids_with_kmeans()
    print("Parameter Tuning For Epsilon and Min_Samples!!")
    self.optimum_min_samples()
    self.optimum_epsilon()
    print("number of data for DBSCAN :", len(self.po_data))
    print({'eps': self.o_epsilon,
           'min_samples': self.o_min_sample,
           'centroids': {c for c in self.centroids}})
    print("Optimum Centroid Division is Initialized!!!")
    cal_divs = []
    for div in range(2, self.params['centroid_divide_range']):
        print("divide :", div)
        self.get_x_values(div)
        print(len(self.po_data) - self.o_min_sample)
        self.po_data['label_dbscan'] = DBSCAN(
            eps=self.o_epsilon,
            min_samples=len(self.po_data) - self.o_min_sample,
            n_jobs=-1).fit(self.X).labels_
        # score each division: mean absolute gap between the farthest
        # clustered point and the nearest outlier
        cal_divs.append({
            "cal": np.mean(np.abs(np.sum([
                self.get_distance_of_outliers("label_dbscan != -1", max),
                np.multiply(self.get_distance_of_outliers("label_dbscan == -1", min), -1)
            ]))),
            "div": div
        })
    print("optimum centroid distance to outliers results :")
    print(cal_divs)
    self.o_devision = list(
        pd.DataFrame(cal_divs).sort_values(by='cal', ascending=False)['div'])[0]
    print("optimum ", self.o_devision)
    print({'eps': self.o_epsilon,
           'min_samples': self.o_min_sample,
           'centroids': {c for c in self.centroids},
           "div": self.o_devision})
    model_from_to_json(
        main_data_path + self.model_params['args']['model_file'],
        {'eps': self.o_epsilon,
         'min_samples': self.o_min_sample,
         'centroids': {c[0]: c[1] for c in zip(self.features, self.centroids)},
         'optimum_divison': self.o_devision},
        True)
    print("DBSCAN Train Process Done!")
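# For reference: DBSCAN marks noise points with label -1, which is what the
# "label_dbscan == -1" / "label_dbscan != -1" queries above rely on. A minimal
# sklearn example:
import numpy as np
from sklearn.cluster import DBSCAN

X = np.vstack([np.random.randn(200, 2), [[8.0, 8.0]]])   # one far-away point
labels = DBSCAN(eps=0.5, min_samples=5).fit(X).labels_
print("noise points :", np.sum(labels == -1))            # the outlier lands in -1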