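# Imports required by the fragments below. The standard-library and
# third-party imports follow directly from the code; the project-local
# module paths (DataBase, Table, Modeler, KafkaConnector) are assumptions
# and should be adjusted to the actual package layout.
import os
import pickle
from os.path import isfile
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# from database import DataBase, Table        # assumed local module
# from modeler import Modeler                 # assumed local module
# from kafka_connector import KafkaConnector  # assumed local module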
def upload_simple_data():
    db = DataBase('BaseDB.db')
    res = {}
    for sub in subjects:  # `subjects` and `targets` are module-level globals
        sub_df = db.read(sub + '_source').get_data()[-150:]
        for target in targets:
            res[sub + '_' + target] = sub_df[['Время', target]]
            res[sub + '_' + target].columns = ['time', 'actual']
    return res
def upload_clust_data():
    db = DataBase('BaseDB.db')
    res = {}
    for sub in subjects:
        # Need to cut off the first 10,
        res[sub] = db.read('Analitics_' + 'clust_' + sub).get_data()[-150:]
        print('Clust ', sub)
        print("@@@")
        print(res[sub][:50])
        print('|||')
        print(res[sub][-50:])
    return res
def upload_separate_data():
    db = DataBase('BaseDB.db')
    res = {}
    for sub in subjects:
        for target in targets:
            table_name = 'Analitics_' + 'sep_' + sub + '_' + target
            if db.get_table_length(table_name) == -1:
                print('Table ' + table_name + " doesn't exist yet!")
            else:
                res[sub + '_' + target] = db.read(table_name).get_data()[-150:]
                print(res[sub + '_' + target])
    return res
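# --- Usage sketch (not from the original code) ---
# A minimal example, under the assumption that `subjects` and `targets` are
# module-level globals, of how the three upload helpers above might feed a
# dashboard refresh step. The function name is hypothetical.
def refresh_dashboard_data():
    raw = upload_simple_data()         # {subject_target: [time, actual]}
    clust = upload_clust_data()        # {subject: clustering analytics}
    separate = upload_separate_data()  # {subject_target: forecast analytics}
    return raw, clust, separate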
class CoreSeparated:

    def __init__(self, subject, targets, time_cols, source_db, result_db,
                 n_predict, volume, interval):
        self.subject = subject
        self.targets = targets
        self.time_cols = time_cols
        self.n_predict = n_predict
        self.volume = volume
        self.interval = interval
        self.source_db = DataBase(source_db)
        self.result_db = DataBase(result_db)
        self.modelers = {}
        self.models_path = './models/separated/model_' + self.subject + '_'

    def fit_core(self, first=False):
        data = self.source_db.read(self.subject + '_source').get_data()
        print(data.iloc[:, :])
        for target in self.targets:
            modeler = Modeler(model_path=self.models_path + target + '.txt',
                              n_predict=self.n_predict,
                              volume=self.volume,
                              scaler='std',
                              drop_null=True)
            print(data[target])
            print(data[self.time_cols + ['Время']])
            X, y = modeler.prepare_data(X=data[self.time_cols + ['Время']],
                                        y=data[target],
                                        predict_phase=True)
            n_folds = 6  # extra rows required beyond the forecast horizon
            if len(X) <= self.n_predict + n_folds:
                print('Warning: model was not trained! Not enough clean data for ' + target)
                continue
            # Needed to know when the data arrived
            time_col = pd.to_datetime(X['Время'])
            # True if a saved model already exists, i.e. this is NOT the first start
            model_exists = os.path.isfile(self.models_path + target + '.txt')
            X.drop(['Время'], axis=1, inplace=True)
            modeler.fit(X.iloc[:-self.n_predict, :], y[:-self.n_predict])
            self.modelers[self.subject + '_' + target] = modeler
            if not model_exists:
                # First start: backfill historical predictions and anomalies
                predictions, lower, upper = modeler.predict(X, interval=True)
                y_anom_search = y[:-self.n_predict].copy()
                anomalies = CoreSeparated.find_anomalies(
                    y_anom_search, lower[:-self.n_predict],
                    upper[:-self.n_predict])
                anom_dummy = np.full((self.n_predict,), np.NaN)
                anomalies_res = np.concatenate([anomalies, anom_dummy])
                y_res = np.concatenate([y_anom_search, anom_dummy])
                # Interpolate timestamps for the predicted periods
                # (`closed=` is the pre-pandas-2.0 argument, later renamed `inclusive=`)
                time_dummy = pd.date_range(
                    start=time_col.iloc[-self.n_predict - 1],
                    periods=self.n_predict + 1,
                    closed='right',
                    freq=self.interval)
                time_col = pd.concat(
                    [time_col.iloc[:-self.n_predict], pd.Series(time_dummy)],
                    axis=0).astype(str)
                # Prepend the first values of the metric, which have no predictions
                time_first = data['Время'][:self.volume]
                y_first = data[target][:self.volume]
                dummy_first = np.array([None] * self.volume)
                time_col = pd.concat([pd.Series(time_first), time_col],
                                     axis=0).astype(str)
                y_res = np.concatenate([y_first, y_res], axis=0)
                predictions = np.concatenate([y_first, predictions], axis=0)
                lower = np.concatenate([dummy_first, lower], axis=0)
                upper = np.concatenate([dummy_first, upper], axis=0)
                anomalies_res = np.concatenate([dummy_first, anomalies_res],
                                               axis=0)
                df_result = pd.DataFrame(
                    np.vstack([time_col, y_res, predictions, lower, upper,
                               anomalies_res]).T,
                    columns=['time', 'actual', 'predictions', 'lower',
                             'upper', 'anomalies'])
                self.result_db.write_from_df(
                    Table('Analitics_' + 'sep_' + self.subject + '_' + target,
                          df_result),
                    method='replace')

    @staticmethod
    def find_anomalies(y_true, lower, upper):
        anomalies = np.array([np.NaN] * len(y_true))
        anomalies[y_true < lower] = y_true[y_true < lower]
        anomalies[y_true > upper] = y_true[y_true > upper]
        return anomalies

    def load_models(self):
        """Load a saved Modeler per target; store None if the model doesn't exist yet."""
        for target in self.targets:
            if os.path.isfile(self.models_path + target + '.txt'):
                modeler = Modeler(model_path=self.models_path + target + '.txt',
                                  n_predict=self.n_predict,
                                  volume=self.volume,
                                  scaler='std',
                                  drop_null=True)
                modeler.load_model_params()
            else:
                # The model doesn't exist yet
                modeler = None
            self.modelers[self.subject + '_' + target] = modeler

    def make_predictions(self):
        for target in self.targets:
            modeler = self.modelers[self.subject + '_' + target]
            if modeler is None:
                # The model doesn't exist yet
                continue
            volume = modeler.get_volume()
            batch_volume = 1
            data = self.source_db.read(
                self.subject + '_source',
                volume=volume + self.n_predict).get_data()
            X, y = modeler.prepare_data(X=data[self.time_cols + ['Время']],
                                        y=data[target],
                                        predict_phase=True,
                                        drop_null=False)
            time = pd.Series(pd.to_datetime(X['Время']))
            start_time = time.iloc[-self.n_predict - batch_volume]
            time = pd.Series(
                pd.date_range(start=start_time,
                              periods=self.n_predict + batch_volume,
                              closed='left',
                              freq=self.interval))
            time = time.astype(str)
            X.drop(['Время'], axis=1, inplace=True)
            predictions, lower, upper = modeler.predict(X, interval=True)
            anomalies = CoreSeparated.find_anomalies(y, lower, upper)
            # Send the data to the outer messenger
            actual = data[target][-batch_volume:]
            actual_dummy = np.array(
                [np.NaN for i in range(len(predictions) - batch_volume)])
            actual = np.concatenate([actual, actual_dummy])
            df_result = pd.DataFrame(
                np.vstack([time, actual, predictions, lower, upper,
                           anomalies]).T,
                columns=['time', 'actual', 'predictions', 'lower', 'upper',
                         'anomalies'])
            self.result_db.write_from_df(
                Table('Analitics_' + 'sep_' + self.subject + '_' + target,
                      df_result),
                method='replace last')
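# --- Usage sketch (not from the original code) ---
# One hypothetical CoreSeparated lifecycle. The subject, target and feature
# column names, and all parameter values, are illustrative assumptions; only
# the constructor signature and method names come from the class above.
def example_separated_cycle():
    core = CoreSeparated(subject='Server1',            # hypothetical subject
                         targets=['ConnectionCount'],  # hypothetical target
                         time_cols=['sin_hour', 'cos_hour', 'is_weekend'],
                         source_db='BaseDB.db',
                         result_db='BaseDB.db',
                         n_predict=12,    # forecast horizon, in periods
                         volume=288,      # training window length
                         interval='300S')
    core.fit_core(first=True)   # train and backfill the analytics table
    core.load_models()          # reload the saved Modeler per target
    core.make_predictions()     # write the latest forecast batch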
class DataPreparator:

    def __init__(self, destination_db, use_cols, data_path, time_col,
                 cols_to_average, entities_to_use, group_by,
                 avg_window='300S', decode_UNIX_time=False,
                 prepare_time=False, trig_prepare_time=False,
                 mode='load', from_outer=False, time_to_proc=None):
        self.destination_db = DataBase(destination_db)
        self.use_cols = use_cols
        self.data_path = data_path
        self.from_outer = from_outer
        self.time_to_proc = time_to_proc
        self.mode = mode
        self.time_col = time_col
        self.cols_to_average = cols_to_average
        self.group_by = group_by
        self.avg_window = avg_window
        self.entities_to_use = entities_to_use
        self.decode_UNIX_time = decode_UNIX_time
        self.prepare_time = prepare_time
        self.trig_prepare_time = trig_prepare_time
        self.data = pd.DataFrame()

    def upload_data(self):
        if self.from_outer:
            # External data load happens here (e.g. reading from Kafka); an earlier
            # variant read CSVs:
            # df = pd.read_csv('./Source data/' + self.data_path, sep=',', usecols=self.use_cols)
            kafka_conn = KafkaConnector()
            df = kafka_conn.load_data()
        else:
            df = self.destination_db.read(self.data_path).get_data()
        if len(df) == 0:
            df = pd.DataFrame([], columns=self.use_cols)
        if self.decode_UNIX_time:
            # Convert UNIX timestamps to a readable datetime format
            df[self.time_col] = df[self.time_col].apply(
                lambda x: datetime.fromtimestamp(x))
        else:
            df[self.time_col] = pd.to_datetime(df[self.time_col])
        if self.time_to_proc and self.mode == 'operate':
            delta = timedelta(seconds=int(self.avg_window[:-1]))
            thr_low = self.time_to_proc - delta <= df[self.time_col]
            thr_up = self.time_to_proc > df[self.time_col]
            print('CURRENT TIME')
            print(self.time_to_proc)
            df = df[thr_low & thr_up]
            # If necessary, delete everything earlier than this from the source here
        self.data = df

    def preprocess_data(self):
        """
        df - source data (only the required columns)
        time_col - the time column
        cols_to_average - columns for which a per-period mean is needed
            (all other columns get a per-period sum)
        entities_to_use - values of the source column that should be processed
        group_by - the column holding the source of each row
        avg_window - the period the data is aggregated down to
        decode_UNIX_time - whether the time must be decoded from the UNIX format
        prepare_time - whether to add hour, weekday and is-weekend features
        trig_prepare_time - sin/cos encoding of the time features
        return - dict - {source: data}
               - list - names of the time-feature columns
        """
        self.upload_data()
        df = self.data
        # Add a column of ones so that, after resampling, we know the number of
        # connections in each period
        df['ConnectionCount'] = np.ones(len(df), dtype=int)
        # Dictionary for the results
        result = {}
        for entity in self.entities_to_use:
            print(df[df[self.group_by] == entity])
            df_res = df[df[self.group_by] == entity]
            print(self.group_by)
            print(entity)
            print(df)
            if len(df_res) == 0:
                if not self.time_to_proc:
                    raise Exception(
                        'Start-up error: the system was started without data '
                        'in the buffer and without specifying time_to_proc')
                # Substitute a single zero row stamped one window before time_to_proc
                a = np.zeros(shape=(1, len(df.columns)), dtype=int)
                df_res = pd.DataFrame(a, columns=df_res.columns, index=[0])
                df_res.loc[0, self.time_col] = (
                    pd.to_datetime(self.time_to_proc)
                    - timedelta(seconds=int(self.avg_window[:-1])))
            df_res.drop([self.group_by], axis=1, inplace=True)
            print(df_res[[self.time_col]])
            df_res = df_res.resample(self.avg_window, on=self.time_col).sum()
            df_res[self.time_col] = df_res.index
            df_res.reset_index(drop=True, inplace=True)
            if self.prepare_time or self.trig_prepare_time:
                # datetime features
                df_res["hour"] = df_res[self.time_col].apply(lambda x: x.hour)
                df_res["weekday"] = df_res[self.time_col].apply(lambda x: x.weekday())
                df_res['is_weekend'] = df_res[self.time_col].apply(
                    lambda x: x.weekday()).isin([5, 6]) * 1
            if self.trig_prepare_time:
                hours_in_day = 24
                weekdays_in_week = 7
                df_res['sin_hour'] = np.sin(2 * np.pi * df_res["hour"] / hours_in_day)
                df_res['cos_hour'] = np.cos(2 * np.pi * df_res["hour"] / hours_in_day)
                df_res['sin_weekday'] = np.sin(2 * np.pi * df_res["weekday"] / weekdays_in_week)
                df_res['cos_weekday'] = np.cos(2 * np.pi * df_res["weekday"] / weekdays_in_week)
                df_res.drop(['hour'], axis=1, inplace=True)
                df_res.drop(['weekday'], axis=1, inplace=True)
            for col in self.cols_to_average:
                print('!!!!!!!')
                print(df_res['ConnectionCount'])
                df_res[col] = df_res[col] / df_res['ConnectionCount']
                df_res[[col]] = df_res[[col]].fillna(value=0)
            result[entity] = df_res
        if self.trig_prepare_time:
            result_time_cols = [self.time_col, 'sin_hour', 'cos_hour',
                                'sin_weekday', 'cos_weekday', 'is_weekend']
        elif self.prepare_time:
            result_time_cols = [self.time_col, 'hour', 'weekday', 'is_weekend']
        else:
            result_time_cols = [self.time_col]
        return result, result_time_cols

    def load_data_to_db(self, result_dict):
        for subject in list(result_dict.keys()):
            print(result_dict[subject])
            self.destination_db.write_from_df(
                Table(subject + '_source', result_dict[subject]),
                method='append')
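# --- Usage sketch (not from the original code) ---
# A hypothetical batch run of DataPreparator. Column, table and entity names
# are illustrative assumptions; the keyword arguments mirror the constructor
# above.
def example_prepare_batch():
    preparator = DataPreparator(destination_db='BaseDB.db',
                                use_cols=['Время', 'Source', 'BytesSent'],
                                data_path='raw_buffer',  # hypothetical buffer table
                                time_col='Время',
                                cols_to_average=['BytesSent'],
                                entities_to_use=['Server1'],
                                group_by='Source',
                                avg_window='300S',
                                trig_prepare_time=True,
                                mode='load')
    result, time_cols = preparator.preprocess_data()
    preparator.load_data_to_db(result)  # writes one '<entity>_source' table each
    return time_cols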
class CoreClust:

    def __init__(self, subject, targets, time_cols, source_db, result_db):
        self.subject = subject
        self.targets = targets
        self.time_cols = time_cols
        self.source_db = DataBase(source_db)
        self.result_db = DataBase(result_db)
        self.scaler = StandardScaler()
        self.modelers = {}
        self.model_path = './models/clust/'

    def fit_core(self, first=False):
        data = self.source_db.read(self.subject + '_source').get_data()
        time_col = data['Время']
        data = data[self.targets]
        data_scaled = self.scaler.fit_transform(data)
        isolation_forest = IsolationForest(n_estimators=160,
                                           contamination=0.05,
                                           max_features=1.0,
                                           bootstrap=True,
                                           random_state=42)
        isolation_forest.fit(data_scaled)
        self.modelers['IsolationForest'] = isolation_forest
        self.save_models()
        if first:
            isolation_outliers = isolation_forest.predict(data_scaled)
            # IsolationForest returns -1 for outliers; map to 1 (outlier) / 0 (normal)
            isolation_outliers = np.array(
                [1 if label == -1 else 0 for label in isolation_outliers])
            df_result = pd.DataFrame(
                np.vstack([time_col, isolation_outliers]).T,
                columns=['time', 'outliers'])
            self.result_db.write_from_df(
                Table('Analitics_' + 'clust_' + self.subject, df_result),
                method='replace')

    def load_models(self):
        if isfile(self.model_path + 'model_' + self.subject
                  + '_IsolationForest' + '.txt'):
            with open(self.model_path + 'model_' + self.subject
                      + '_IsolationForest' + '.txt', 'rb') as file:
                self.modelers['IsolationForest'] = pickle.load(file)
            with open(self.model_path + 'model_' + self.subject
                      + '_IsolationForest' + '_params.txt', 'rb') as file:
                params = pickle.load(file)
            self.scaler = params['scaler']

    def save_models(self):
        with open(self.model_path + 'model_' + self.subject
                  + '_IsolationForest' + '.txt', 'wb') as file:
            pickle.dump(self.modelers['IsolationForest'], file)
        params = {'scaler': self.scaler}
        with open(self.model_path + 'model_' + self.subject
                  + '_IsolationForest' + '_params.txt', 'wb') as file:
            pickle.dump(params, file)

    def make_predictions(self):
        isolation_forest = self.modelers['IsolationForest']
        data = self.source_db.read(self.subject + '_source', volume=1).get_data()
        time_col = data['Время']
        data = data[self.targets]
        data_scaled = self.scaler.transform(data)
        isolation_outliers = isolation_forest.predict(data_scaled)
        isolation_outliers = np.array(
            [1 if label == -1 else 0 for label in isolation_outliers])
        df_result = pd.DataFrame(
            np.vstack([time_col, isolation_outliers]).T,
            columns=['time', 'outliers'])
        self.result_db.write_from_df(
            Table('Analitics_' + 'clust_' + self.subject, df_result),
            method='append')
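# --- Usage sketch (not from the original code) ---
# A hypothetical CoreClust cycle; the subject, targets and time columns are
# illustrative assumptions, the method names come from the class above.
def example_clust_cycle():
    clust = CoreClust(subject='Server1',
                      targets=['ConnectionCount', 'BytesSent'],
                      time_cols=['sin_hour', 'cos_hour'],
                      source_db='BaseDB.db',
                      result_db='BaseDB.db')
    clust.fit_core(first=True)   # fit the IsolationForest, write initial outliers
    clust.load_models()          # restore the pickled forest and scaler
    clust.make_predictions()     # label the newest row and append the result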