def get_connection(self):
    config = conf('config')
    if self.data_source in ['postgresql', 'awsredshift', 'mysql']:
        server, db, user, pw, port = str(config['db_connection']['server']), str(config['db_connection']['db']), \
                                     str(config['db_connection']['user']), str(config['db_connection']['password']), \
                                     int(config['db_connection']['port'])
        if self.data_source == 'mysql':
            from mysql import connector
            self.conn = connector.connect(host=server, database=db, user=user, password=pw)
        if self.data_source in ['postgresql', 'awsredshift']:
            import psycopg2
            self.conn = psycopg2.connect(user=user, password=pw, host=server, port=port, database=db)
    if self.data_source == 'googlebigquery':
        from google.cloud.bigquery.client import Client
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = join(conf('data_main_path'), "", config['db_connection']['db'])
        self.conn = Client()
    print("db connection is done!")

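# Standalone sketch (separate from the class above) of the PostgreSQL/Redshift branch, assuming a
# dict shaped like the db_connection section of configs.yaml; the values and function name are
# illustrative, not part of this project.
import psycopg2

def open_postgres_connection(db_connection):
    # db_connection: dict with 'server', 'db', 'user', 'password', 'port' keys
    return psycopg2.connect(host=db_connection['server'],
                            port=int(db_connection['port']),
                            dbname=db_connection['db'],
                            user=db_connection['user'],
                            password=db_connection['password'])
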
def down_web(self):
    try:
        from configs import conf
    except Exception as e:
        from .configs import conf
    request_url(url='http://' + conf('web_host') + ':' + str(conf('web_port')) + '/shutdown')

def job_init():
    exception = ''
    req = dict(request.form)
    jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
    process = read_yaml(conf('log_main_path'), 'process.yaml')
    job_names = list(jobs.keys())
    dates = {j: jobs[j]['job_start_date'] for j in job_names}
    current_active_status = {j: jobs[j]['active'] for j in job_names}
    log_infos = get_logs(job_names, dates, current_active_status, process)
    log_infos = update_logs(jobs, log_infos, req)
    return render_template(
        "ml_execute.html",
        train_process=log_infos['train']['process'],
        prediction_process=log_infos['prediction']['process'],
        tuning_process=log_infos['parameter_tuning']['process'],
        train_status=log_infos['train']['status'],
        prediction_status=log_infos['prediction']['status'],
        tuning_status=log_infos['parameter_tuning']['status'],
        train_precent=str(log_infos['train']['percent']),
        prediction_precent=str(log_infos['prediction']['percent']),
        tuning_percent=str(log_infos['parameter_tuning']['percent']),
        train_date=str(log_infos['train']['start_time'])[0:16],
        prediction_date=str(log_infos['prediction']['start_time'])[0:16],
        tuning_date=str(log_infos['parameter_tuning']['start_time'])[0:16],
        exception=exception)

def show_dash():
    jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
    connection = check_available_data_for_dashboard(jobs)
    return render_template('dashboard.html',
                           connection=connection,
                           dash_url="http://" + conf('web_host') + ":" + str(conf('config')['web_port']) + "/dash/")

def start_job(job):
    Logger('ml_execute_' + job)
    print("received :", {'job': job, 'process': 'start'}, " time :", get_time())
    j = CreateJobs(read_yaml(conf('docs_main_path'), 'ml_execute.yaml'), job)
    if j.job['day'] == 'only once':
        j.job_that_executes_once()
        return 'done!!!!'
    if j.job['day'] in ['Monthly', 'Every 2 Weeks']:
        while True:
            jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
            j.job_that_executes_monthly_weekly()
            print("job is working - ", job)
            if jobs[job]['active'] is False:
                print("job is stopped !!")
                break
            time.sleep(60)
    elif j.job['day'] not in ['only once', 'Monthly', 'Every 2 Weeks']:
        j.job_schedule()
        while True:
            schedule.run_pending()
            jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
            print("job is working - ", job)
            if jobs[job]['active'] is False:
                print("job is stopped !!")
                break
            time.sleep(10)

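# Minimal sketch of the schedule-based branch above using the `schedule` package directly.
# `run_training` and the 10-minute interval are illustrative placeholders; in the real loop the
# break condition comes from the job's `active` flag in ml_execute.yaml.
import time
import schedule

def run_training():
    print("training triggered at", time.strftime('%Y-%m-%d %H:%M:%S'))

schedule.every(10).minutes.do(run_training)
while True:
    schedule.run_pending()
    time.sleep(10)
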
def start_job_and_update_job_active(jobs, job):
    jobs[job]['active'] = True
    write_yaml(conf('docs_main_path'), "ml_execute.yaml", jobs)
    ml_execute_api = read_yaml(conf('docs_main_path'), 'apis.yaml')['ml_execute']
    url = get_api_url(ml_execute_api['host'], ml_execute_api['port'], ml_execute_api['api_name'])
    request_url(url, {'job': job})

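# A rough equivalent of get_api_url + request_url using `requests` directly. The
# 'http://<host>:<port>/<api_name>' layout follows the pattern used elsewhere in this module,
# but how request_url actually sends the payload is an assumption; GET with query params is
# only one possibility.
import requests

def trigger_job(host, port, api_name, job):
    url = 'http://{}:{}/{}'.format(host, port, api_name)
    return requests.get(url, params={'job': job})
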
def get_results(date_col):
    results = []
    for f in listdir(dirname(join(conf('data_main_path'), ""))):
        f_splits = f.split(conf('result_file'))
        if f_splits[0] == "":  # file name starts with the configured result-file prefix
            results += pd.read_csv(join(conf('data_main_path'), "", f)).to_dict('records')
    results = pd.DataFrame(results)
    if len(results) >= 1000:
        results = results.sort_values(by=date_col, ascending=True)[-1000:]
    return results

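# Minimal sketch of the result-file scan above with explicit pandas calls; 'data/' and the
# 'results_' prefix are assumed, illustrative values.
from os import listdir
from os.path import join
import pandas as pd

def collect_results(directory="data/", prefix="results_", date_col="date", limit=1000):
    frames = [pd.read_csv(join(directory, f)) for f in listdir(directory) if f.startswith(prefix)]
    results = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if len(results) > limit:
        results = results.sort_values(by=date_col, ascending=True)[-limit:]
    return results
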
def home():
    jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
    configs = read_yaml(conf('docs_main_path'), 'configs.yaml')
    model_configuration = read_yaml(conf('model_main_path'), 'model_configuration.yaml')
    process = read_yaml(conf('log_main_path'), 'process.yaml')
    req = dict(request.form)
    if 'save' in req.keys():
        if bool(req['save']):
            ml_execute_reset(jobs)
            db_connection_reset(configs)
            models_reset(model_configuration)
            logs_reset(process)
    return render_template("home.html", reset_script=reset_script)

def get_time():
    try:
        print("browser time: ", request.args['time'])
        print("server time : ", time.strftime('%A %B, %d %Y %H:%M:%S'))
        jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
        for j in jobs:
            jobs[j]['browser_time'] = str(
                datetime.datetime.strptime(" ".join(request.args['time'].split()[0:5]),
                                           "%a %b %d %Y %H:%M:%S"))[0:13]
        write_yaml(conf('docs_main_path'), "ml_execute.yaml", jobs)
    except Exception as e:
        print(e)
    return "Done"

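# Sketch of the browser-time parsing above: a JavaScript Date string is trimmed to its first
# five tokens before strptime; the sample string is illustrative.
import datetime

browser_time = "Mon Jan 06 2025 14:30:00 GMT+0300 (GMT+03:00)"
parsed = datetime.datetime.strptime(" ".join(browser_time.split()[0:5]), "%a %b %d %Y %H:%M:%S")
print(str(parsed)[0:13])  # '2025-01-06 14' -> date plus hour, as stored in ml_execute.yaml
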
def get_filters(data):
    jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
    # model_conf = read_yaml(conf('model_main_path'), 'model_configuration.yaml')
    model_infos = jobs[list(jobs.keys())[0]]['execute'][0]['params']
    groups, date_col, feature = split_groups(model_infos['groups']), model_infos['time_indicator'], model_infos['feature']
    # t_dimensions = model_conf['infos']['time_groups'].split("*")
    # data = check_for_time_part_groups_on_data(data, t_dimensions, date_col)
    # groups += t_dimensions
    if groups not in ['None', None, []]:
        if len(groups) > 3:
            groups = random.sample(groups, 3)
    else:
        groups = list(set(list(data.columns)) - set([date_col, feature, 'Unnamed: 0.1', 'Unnamed: 0']))
        if len(groups) >= 4:
            groups = list(filter(lambda col: type(col) == str, groups))
            if len(groups) >= 4:
                groups = random.sample(groups, 3) if len(groups) > 3 else [date_col]
    print("filters :", groups)
    num_f_p = len(groups)
    filter_datas = []
    for g in groups:
        # data[g] == data[g] filters out NaN values before collecting unique filter options
        filter_datas.append(list(data[data[g] == data[g]][g].unique()) + ['ALL'])
    filter_ids = groups
    filter_sizes = [30] * num_f_p
    multiple_selection = [False] * num_f_p
    values = ['ALL'] * num_f_p
    filters = list(zip(filter_ids, filter_datas, filter_sizes, multiple_selection, values))
    hover_data = [{date_col: min(data[model_infos['time_indicator']])}] * 3
    return num_f_p, filters, hover_data, groups, filter_ids, date_col, feature, data

def ml_execute_update(**update_dict):
    keys = ['jobs', 'description', 'data_query_path', 'data_source', 'groups', 'dates',
            'time_indicator', 'feature', 'days']
    infos = {k: update_dict.get(k, None) for k in keys}
    jobs = infos['jobs']
    for j in jobs:
        jobs[j]['description'] = infos['description']
        jobs[j]['day'] = infos['days'][j] if infos['days'] else None
        jobs[j]['job_start_date'] = str(infos['dates'][j][0])[0:16] if infos['dates'] else None
        jobs[j]['job_end_date'] = str(infos['dates'][j][1])[0:16] if infos['dates'] and infos['dates'][j][1] else None
        e2 = []
        for e in jobs[j]['execute']:
            for p in e['params']:
                if p in infos.keys():
                    e['params'][p] = str(infos[p])
                if p == 'time_period':
                    e['params'][p] = infos['days'][j] if infos['days'] else None
            e2.append(e)
        jobs[j]['execute'] = e2
    print("ml_execute.yaml is updated!!")
    write_yaml(conf('docs_main_path'), "ml_execute.yaml", jobs)

def db_connection_update(**args):
    configs = read_yaml(conf('docs_main_path'), 'configs.yaml')
    configs['db_connection']['data_source'] = args['data_source']
    configs['db_connection']['is_from_db'] = args['data_source'] not in ['csv', 'json', 'pickle']
    infos = {'db': args.get('db_name', None),
             'password': args.get('pw', None),
             'port': args.get('port', None),
             'server': args.get('host', None),
             'user': args.get('user', None)}
    for i in infos:
        configs['db_connection'][i] = infos[i]
    write_yaml(conf('docs_main_path'), "configs.yaml", configs)

def create_task():
    exception = ""
    job = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
    params = get_model_arguments(job)
    data, cols, connection = get_sample_data(params, connection=True, create_sample_data=False)
    if bool(request.args['messages']) and params['data_source'] and connection:
        if dict(request.form) != {}:
            update_dict = get_request_values(job, params, request)
            if update_dict['feature'] and update_dict['time_indicator'] and dict(request.form)['date1_prediction'] != '':
                ml_execute_update(**update_dict)
            else:
                exception = "Please make sure you have entered the Anomaly Feature and Date Indicator!"
            if get_dash(request):
                return redirect(url_for('show_dash'))
            else:
                return render_template("configs_data2.html", cols=cols, exception=exception)
        else:
            return render_template("configs_data2.html", cols=cols, exception=exception)
    else:
        return redirect(url_for('get_data'))

def date_dimension_deciding(self):
    if self.job != 'prediction':
        self.calculate_date_parts()
        info = {'infos': {'min_date': str(min(list(self.data[self.time_indicator])))[0:19],
                          'max_date': str(max(list(self.data[self.time_indicator])))[0:19],
                          'time_groups': "*".join(self.time_groups)}}
        write_yaml(conf('model_main_path'), 'model_configuration.yaml', info)
    else:
        self.time_groups = read_yaml(conf('model_main_path'), "model_configuration.yaml")['infos']['time_groups'].split("*")
        for t_dimension in self.time_groups:
            if t_dimension not in self.groups:
                self.data[t_dimension] = self.data[self.date].apply(lambda x: date_part(x, t_dimension))
    if self.time_groups != ['']:
        self.groups += self.time_groups
    print("time parts : ", self.time_groups)

def save_model_configurations(job, data, time_indicator, time_groups):
    info = {}
    if job == 'prediction':
        info = {'infos': {'min_date': min(list(data[time_indicator])),
                          'max_date': max(list(data[time_indicator])),
                          'time_groups': time_groups}}
    with open(join(conf('model_main_path'), "model_configuration.yaml"), 'w') as file:
        yaml.dump(info, file)

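# Round-trip sketch for the model_configuration.yaml layout written above; the dates and
# time groups are illustrative values, not taken from real data.
import yaml

info = {'infos': {'min_date': '2025-01-01 00:00:00',
                  'max_date': '2025-01-31 23:00:00',
                  'time_groups': 'hour*day'}}
with open('model_configuration.yaml', 'w') as f:
    yaml.dump(info, f)
with open('model_configuration.yaml') as f:
    assert yaml.safe_load(f)['infos']['time_groups'].split('*') == ['hour', 'day']
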
def query_data_source(self):
    self.check_data_with_filtering()
    # import data via pandas
    if self.data_source in ['mysql', 'postgresql', 'awsredshift']:
        self.get_connection()
        self.data = pd.read_sql(self.query + " LIMIT " + str(self.nrows) if self.nrows else self.query, self.conn)
    # import data via google
    if self.data_source == 'googlebigquery':
        self.get_connection()
        self.data = self.conn.query(self.query + " LIMIT " + str(self.nrows) if self.nrows else self.query).to_dataframe()
    # import via pandas
    if self.data_source == 'csv':
        try:
            for sep in [',', ';', ':']:
                self.data = pd.read_csv(filepath_or_buffer=join(conf('data_main_path'), self.data_query_path),
                                        error_bad_lines=False,
                                        encoding="ISO-8859-1",
                                        sep=sep,
                                        nrows=self.nrows)
                if len(self.data.columns) > 1:
                    break
        except Exception as e:
            print(e)
    if self.data_source == 'json':
        self.data = read_write_to_json(conf('directory'), self.data_query_path, None, is_writing=False)
    if self.data_source == 'yaml':
        self.data = read_yaml(conf('data_main_path'), self.data_query_path)
    if self.data_source in ('json', 'yaml', 'csv'):
        self.data = self.query(self.data)

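# The CSV branch above tries separators one by one; csv.Sniffer is an alternative way to detect
# the delimiter before handing the file to pandas. Standalone sketch with an assumed file path;
# Sniffer raises csv.Error when it cannot decide.
import csv
import pandas as pd

def read_csv_with_sniffed_sep(path, nrows=None):
    with open(path, encoding="ISO-8859-1") as f:
        dialect = csv.Sniffer().sniff(f.read(4096), delimiters=",;:")
    return pd.read_csv(path, sep=dialect.delimiter, encoding="ISO-8859-1", nrows=nrows)
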
def anomaly_prediction(self):
    self.model = model_from_to_json(path=join(conf('model_main_path'),
                                              model_path(self.comb, self.groups, 'lstm')))
    self.prediction = self.scale.inverse_transform(self.model.predict(self.prediction))
    self.result = self.result[(len(self.result) - len(self.prediction)):]
    self.result['predict'] = self.prediction.reshape(1, len(self.prediction)).tolist()[0]

def get_reset_script():
    jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
    configs = read_yaml(conf('docs_main_path'), 'configs.yaml')
    reset_script = ""
    if configs['db_connection']['data_source'] not in ['', None]:
        reset_script += "You have a data source connection from " + configs['db_connection']['data_source'] + "."
    active_jobs = []
    for j in jobs:
        if jobs[j]['active'] is True:
            active_jobs.append([j, jobs[j]['day']])
    if len(active_jobs) == 1:
        reset_script += " You also have an active job, " + active_jobs[0][0] + ", running " + active_jobs[0][1] + "."
    if len(active_jobs) > 1:
        reset_script += " You also have active jobs, " + ", ".join([j[0] for j in active_jobs]) + \
                        ", running " + ", ".join([j[1] for j in active_jobs]) + ". "
    reset_script += " Would you like to reset them all?"
    return reset_script

def parameter_tuning(self):
    if len(self.levels) == 0:
        self.optimized_parameters = self.parameter_tuning_threading(has_comb=False)
    else:
        for self.comb in self.levels:
            self.optimized_parameters[self.get_param_key()] = self.parameter_tuning_threading()
            if not check_request_stoped(self.job):
                break
    print("updating model parameters")
    pt_config = read_yaml(conf('docs_main_path'), 'parameter_tunning.yaml')
    pt_config['has_param_tuning_first_run']['lstm'] = True
    _key = 'hyper_parameters' if len(self.levels) == 0 else 'combination_params'
    pt_config[_key]['lstm'] = self.optimized_parameters
    write_yaml(conf('docs_main_path'), "parameter_tunning.yaml", pt_config, ignoring_aliases=True)
    self.params = hyper_conf('lstm')
    self.combination_params = hyper_conf('lstm_cp')

def __init__(self, jobs, job_name):
    self.job_name = job_name
    self.jobs_yaml = jobs
    self.job = self.jobs_yaml[job_name]
    self.api_infos = read_yaml(conf('docs_main_path'), 'apis.yaml')
    self.api_info = None
    self.url = None
    self.logger = LoggerProcess(job=self.job_name)
    self.browser_time, self.diff, self.start_time, self.time = ml_execute_times(self.job)
    self.schedule = None
    self.total_minutes_in_month = 30 * 24 * 60
    self.total_minutes_in_2_weeks = 15 * 24 * 60

def stop_job(self, request=True):
    if self.job['active'] is True:  # if there is an active job, update ml_execute.yaml
        self.logger.regenerate_file()
        self.jobs_yaml[self.job_name]['active'] = False
        write_yaml(conf('docs_main_path'), "ml_execute.yaml", self.jobs_yaml)
        for j in self.job['execute']:
            self.api_info = self.api_infos['model_' + j['params']['model']]
            self.url = get_api_url(host=self.api_info['host'],
                                   port=self.api_info['port'],
                                   api_name=self.api_info['api_name'])
            if request:
                request_url(self.url, self.job['stop_job'])

def train_model(self, save_model=True):
    self.model = IsolationForest(n_estimators=self._p['num_of_trees'],
                                 max_samples='auto',
                                 contamination=self._p['contamination'],
                                 bootstrap=False,
                                 n_jobs=-1,
                                 random_state=42,
                                 verbose=1).fit(self.train[[self.feature]].values)
    if save_model:
        model_from_to_pkl(directory=conf('model_main_path'),
                          path=model_path(self.comb, self.groups, 'iso_f'),
                          model=self.model,
                          is_writing=True)

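# Self-contained sketch of the Isolation Forest step above on synthetic one-dimensional data;
# hyper-parameter values are illustrative, not the ones stored in this project's configs.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
feature = np.concatenate([rng.normal(0, 1, 500), [8.0, 9.5, -7.2]]).reshape(-1, 1)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.01,
                        bootstrap=False, n_jobs=-1, random_state=42).fit(feature)
labels = model.predict(feature)            # -1 marks anomalies, 1 marks normal points
scores = model.decision_function(feature)  # lower scores mean more anomalous
print(int((labels == -1).sum()), "anomalies flagged")
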
def run_platform(self):
    self.platform = BuildPlatform(conf=self.conf, environment=self.env, master_node=self.conf.master_node)
    self.platform.initialize()
    if self.conf.master_node:
        try:
            from configs import conf
        except Exception as e:
            from .configs import conf
        self.web_port = conf('web_port')
        self.web_host = conf('web_host')
        print("platform is up!!!")
        print("*" * 5, " WEB APPLICATION ", "*" * 5)
        print('Running on ', 'http://' + self.web_host + ':' + str(self.web_port) + '/')
    else:
        print("platform is up!!!")
        print("Running Services:")
        for api, info in self.platform.api_file.items():
            print(api, " :", 'http://' + info['host'] + ':' + str(info['port']) + '/' + info['api_name'] + '/')

def learning_process(self, save_model=True):
    self.model.fit(self.train['x_train'], self.train['y_train'],
                   batch_size=self._p['batch_size'],
                   epochs=self._p['epochs'],
                   verbose=0,
                   validation_data=(self.train['x_test'], self.train['y_test']),
                   shuffle=False)
    if save_model:
        model_from_to_json(path=join(conf('model_main_path'), model_path(self.comb, self.groups, 'lstm')),
                           model=self.model,
                           is_writing=True)

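# Minimal sketch of the Keras training call above with an explicitly built LSTM; the network
# shape, window size, and hyper-parameters are illustrative, not this project's tuned values.
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

x = np.random.rand(200, 10, 1)   # (samples, timesteps, features)
y = np.random.rand(200, 1)

model = Sequential([LSTM(16, input_shape=(10, 1)), Dense(1)])
model.compile(optimizer='adam', loss='mse')
model.fit(x[:160], y[:160],
          batch_size=32, epochs=2, verbose=0,
          validation_data=(x[160:], y[160:]), shuffle=False)
print(model.to_json()[:60], "...")  # presumably the JSON form that model_from_to_json serialises
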
def ml_execute_reset(jobs):
    for j in jobs:
        jobs[j]['description'] = None
        jobs[j]['day'] = None
        jobs[j]['job_start_date'] = None
        jobs[j]['job_end_date'] = None
        e2 = []
        for e in jobs[j]['execute']:
            for p in e['params']:
                if p not in ['model', 'job']:
                    e['params'][p] = None
            e2.append(e)
        jobs[j]['execute'] = e2
    print("reset ml-execute.yaml !!!")
    write_yaml(conf('docs_main_path'), "ml_execute.yaml", jobs)

def data_source():
    jobs = read_yaml(conf('docs_main_path'), 'ml_execute.yaml')
    model_infos = jobs[list(jobs.keys())[0]]['execute'][0]['params']
    try:
        source = GetData(data_query_path="sample_data.csv",
                         data_source="csv",
                         time_indicator=model_infos['time_indicator'],
                         feature=model_infos['feature'],
                         test=1000)
        source.query_data_source()
        source.convert_feature()
        data = source.data
    except Exception as e:
        data = pd.DataFrame()
        print("no data is available")
    return data

def update_api_file(self, apis=None):
    try:
        from configs import conf
    except Exception as e:
        from .configs import conf
    self.api_file = read_yaml(conf('docs_main_path'), "apis.yaml")
    if apis is not None:
        if not self.master_node:
            self.api_file = {a: self.api_file[a] for a in apis}
        if type(apis) == dict:
            for a in apis:
                for p in apis[a]:
                    self.api_file[a][p] = apis[a][p]
        else:
            self.api_file = self.api_file[apis]

def get_sample_data(params, connection, create_sample_data=True):
    data, cols = None, None
    try:
        sample_size = 10 if not create_sample_data else 1000
        d = GetData(data_query_path=params['data_query_path'],
                    data_source=params['data_source'],
                    test=sample_size)
        d.query_data_source()
        cols = d.data.columns.values
        # data = d.data.to_html(classes=["table table-bordered table-striped table-hover table-sm"])
        if create_sample_data:
            d.data.to_csv(join(conf('data_main_path'), 'sample_data.csv'))
    except Exception as e:
        print(e)
        connection = False
    return data, cols, connection

def prediction_execute(self):
    for self.comb in self.levels:
        print("*" * 4, "ISO FOREST - ",
              self.get_query().replace(" and ", "; ").replace(" == ", " - "), "*" * 4)
        if check_model_exists(model_path(self.comb, self.groups, 'iso_f'), conf('model_main_path')):
            self.f_w_data = self.data.query(self.get_query()).sort_values(by=self.date)
            self.split_data(is_prediction=True)
            print("prediction size :", len(self.prediction))
            self.detect_anomalies()
            self.logger.counter()
        if not check_request_stoped(self.job):
            break
    self.anomaly = DataFrame(self.anomaly)

def check_for_ports(self, service_count):
    """
    Checks for available ports. It picks ports from the range between 6000 - 7000.
    :param service_count: number of services specifically assigned for this configuration.
                          By default, an available port is found for each service.
    """
    try:
        from configs import conf
    except Exception as e:
        from .configs import conf
    if self.cd.check_for_directory():
        count = 0
        available_ports = conf('available_ports')
        while len(self.ports) != service_count:
            if not is_port_in_use(available_ports[count]):
                self.ports.append(int(available_ports[count]))
            count += 1  # move to the next candidate port whether or not the current one was free

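# One plausible implementation of the is_port_in_use helper used above, based on the standard
# socket module; the helper's real signature in this repo may differ.
import socket

def is_port_in_use(port, host="127.0.0.1"):
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex((host, port)) == 0  # 0 means something already accepted the connection
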