def initial_train(self):
    """Train the model from scratch on whatever is in the Mongo train_data collection.

    Loads all documents from ``otc.train_data``, separates the 'Break Owner'
    target column, one-hot encodes the features, and runs a training report.

    Returns:
        bool: True if training data was found and training ran, False otherwise.
    """
    log.info('Performing initial training of the model.')
    client = mongoUtil.get_client()
    db = client.otc
    train_data = db.train_data
    if_trained = False
    log.info('Established connection to Mongo Database.')
    old_data_x_frame = pd.DataFrame(list(train_data.find()))
    if not old_data_x_frame.empty:
        if_trained = True
        log.info('Initial training sequence started.')
        # Target vector, coerced to str so mixed/NaN owners become uniform labels.
        old_data_y_frame = old_data_x_frame['Break Owner'].astype(str)
        # Drop Mongo bookkeeping fields and the target/reason columns in one pass.
        old_data_x_frame.drop(
            columns=['_id', 'timestamp', 'Break Owner', 'Break Reason'],
            inplace=True)
        # Keep the raw (pre-encoding) frame for later re-use (e.g. flagging).
        self.training_data = old_data_x_frame
        train_x = self.one_hot(old_data_x_frame)
        # Remember the encoded column layout so prediction inputs can be aligned.
        self.training_frame = train_x.columns.values
        train_y = list(old_data_y_frame)
        self.current_report = self.execute_train_report(train_x, train_y)
    else:
        log.info(
            'Cannot find training data in Mongo Database. Abort initial training sequence.'
        )
    return if_trained
def load_purging():
    """Flask view: on POST, purge the selected date range and retrain the model.

    GET renders the loading page; POST performs the purge stored in the
    session (``start``/``end``), retrains on the remaining data, and
    redirects to the purge result page.
    """
    if request.method == 'POST':
        mongoUtil.purge_model(session['start'], session['end'])
        # Log BEFORE kicking off training so the message order matches reality
        # (the original logged "begun" only after initial_train() had finished).
        log.info('Model training on purged model begun.')
        # Retrain the model on the dataset without the purged data.
        model.initial_train()
        client = mongoUtil.get_client()
        db = client.otc
        train_data = db.train_data
        # NOTE(review): Collection.count() is deprecated and removed in
        # pymongo 4 — migrate to count_documents({}) when upgrading the driver.
        if train_data.count() == 0:
            # Everything was purged: there is nothing to train on, reset state.
            model.current_report = None
            model.reset_model()
        log.info('Model training on purged model completed.')
        return redirect(url_for('purge_result'))
    return render_template('loading_purge.html')
def purge_model():
    """Flask view: collect a purge date range, validate it, and dispatch.

    On POST, validates the 'Start Date'/'End Date' form fields, stores the
    parsed dates in the session, and redirects either straight to the result
    page (nothing to purge / nothing left to train) or to the loading page
    that performs the purge + retrain.
    """
    try:
        log.info('Model data is being purged.')
        # Get the training data collection from the database.
        client = mongoUtil.get_client()
        db = client.otc
        train_data = db.train_data
        if request.method == 'POST':
            log.info('POST: waiting for purge time frame.')
            start_date = request.values["Start Date"]
            end_date = request.values["End Date"]
            # Reject missing or inverted ranges (ISO yyyy-mm-dd strings
            # compare correctly lexicographically).
            if (end_date < start_date) or ((start_date == "") or (end_date == "")):
                errorString = 'Invalid dates detected. Please select both start and end date.'
                log.error(errorString)
                session['errorOc'] = True
                session['errorInfo'] = errorString
                return redirect(url_for('purge_result'))
            else:
                # Parse the date strings into datetime objects for Mongo queries.
                start_date = datetime.strptime(start_date, '%Y-%m-%d')
                end_date = datetime.strptime(end_date, '%Y-%m-%d')
                session['start'] = start_date
                session['end'] = end_date
                log.info('Purge start date: ' + str(start_date) +
                         '. Purge end date: ' + str(end_date))
                # If no documents fall inside the range there is nothing to
                # purge, so skip retraining entirely.
                # NOTE(review): Collection.count(filter) is removed in pymongo 4;
                # migrate to count_documents(filter) when upgrading the driver.
                if train_data.count(
                        {"timestamp": {
                            '$gte': start_date,
                            '$lte': end_date
                        }}) == 0:
                    return redirect(url_for('purge_result'))
                if train_data.count() != 0:
                    return redirect(url_for('load_purging'))
        return render_template('modify_model.html')
    except Exception:
        # Previously a bare `except:` that swallowed the traceback and fell
        # off the end returning None (a Flask error). Log the full traceback
        # and surface the failure the same way the validation path does.
        errorString = 'Unknown error occurred.'
        log.exception(errorString)
        session['errorOc'] = True
        session['errorInfo'] = errorString
        return redirect(url_for('purge_result'))
def flag(self, df):
    """Mark rows of *df* whose values were never seen in the training data.

    A row gets ``FLAG == 'x'`` as soon as any non-numeric/non-bookkeeping
    column holds a value absent from the corresponding training column.

    Args:
        df: DataFrame of unanalyzed rows. NOTE: it is flagged IN PLACE
            (``df_new`` aliases ``df``) and also returned.

    Returns:
        The same DataFrame with a 'FLAG' column added.
    """
    df_new = df
    df_new['FLAG'] = None
    client = mongoUtil.get_client()
    db = client.otc
    train_data = pd.DataFrame(list(db.train_data.find({})))
    # Columns whose values should never trigger a flag (numeric fields and
    # Mongo bookkeeping); hoisted out of the loop instead of being rebuilt
    # on every row/column iteration.
    skip_cols = {'NOTIONAL', 'NOTIONAL_2', 'STRIKE_PRICE', 'DIFF_AGE',
                 '_id', 'timestamp'}
    # Known values per training column, built ONCE as sets for O(1) lookups.
    # (The original rebuilt set(dict(enumerate(...)).values()) per row.)
    known_values = {col: set(train_data[col].unique())
                    for col in train_data.columns}
    for index, row in df_new.iterrows():
        for col in train_data.columns.values:
            if col not in skip_cols and row[col] not in known_values[col]:
                df_new.loc[index, 'FLAG'] = 'x'
                break  # one unseen value is enough to flag the row
    return df_new
def re_train(self, filename):
    """Retrain the model on a new CSV, merged with data already stored in Mongo.

    Reads ``train_data/<filename>``, aligns its columns with the existing
    training data (dropping extras, zero-filling missing ones), concatenates
    old and new data, then one-hot encodes and retrains.

    Args:
        filename: Name of the CSV file inside the ``train_data/`` folder.

    Returns:
        The training report produced by ``execute_train_report``.
    """
    log.info('Training started.')
    start = time.time()
    # Grab new data from data folder.
    csv_path = 'train_data/' + str(filename)
    df = pd.read_csv(csv_path)
    new_data_x_frame = df.drop(columns=['Break Owner', 'Break Reason'])
    new_data_y_frame = df['Break Owner'].astype(str)
    # 2-D target array, used only when no prior training data exists.
    new_data_y = df[['Break Owner']].values
    # Grab the original training data from the database.
    log.info('Establishing connection to Mongo Database.')
    client = mongoUtil.get_client()
    db = client.otc
    train_data = db.train_data
    log.info('Connection established.')
    old_data_x_frame = pd.DataFrame(list(train_data.find()))
    if not old_data_x_frame.empty:
        log.info('Previous training data was found.')
        old_data_y_frame = old_data_x_frame['Break Owner'].astype(str)
        # Drop Mongo bookkeeping fields and target/reason columns in one pass.
        old_data_x_frame.drop(
            columns=['_id', 'timestamp', 'Break Owner', 'Break Reason'],
            inplace=True)
        log.info('Performing data clean up.')
        new_cols = set(new_data_x_frame.columns)
        old_cols = set(old_data_x_frame.columns)
        # Remove columns the stored training data doesn't have.
        for col in new_cols - old_cols:
            new_data_x_frame = new_data_x_frame.drop(col, axis=1)
        # Zero-fill columns the new data lacks but the stored data has.
        for col in old_cols - new_cols:
            new_data_x_frame[col] = 0
        log.info('Merging previous data with new data.')
        train_x = pd.concat([old_data_x_frame, new_data_x_frame],
                            ignore_index=True)
        train_y = pd.concat([old_data_y_frame, new_data_y_frame],
                            ignore_index=True)
        train_y = list(train_y)
        self.training_data = train_x
        train_x = self.one_hot(train_x)
        self.training_frame = train_x.columns.values
        self.current_report = self.execute_train_report(train_x, train_y)
    else:
        log.info('No previous training data found.')
        self.training_data = new_data_x_frame
        new_data_x_frame = self.one_hot(new_data_x_frame)
        self.training_frame = new_data_x_frame.columns.values
        new_data_x = new_data_x_frame.values
        self.current_report = self.execute_train_report(
            new_data_x,
            new_data_y,
            test_size=config.getfloat('Misc', 'TEST_SIZE_2'),
            random_state=config.getint('Misc', 'RSTATE_1'))
    end = time.time()
    log.info('Total training time: ' + str(end - start))
    return self.current_report