def do_first_job(file_location: str):
    experiments = get_experiments(file_location)
    did_job = False
    idx = 0
    for cmd, state in experiments.items():
        idx = idx + 1
        if state == EXPERIMENT_NOT_DONE:
            # Claim this job before running it
            experiments[cmd] = EXPERIMENT_BUSY
            upload_experiments(file_location, experiments)
            did_job = True

            exit_code = do_job(cmd + (' -g 16' if io.get('use_gpus') else ' -g 0'))
            if exit_code != 0:
                error('An error occurred while executing command', cmd, 'giving exit code', exit_code)
                try_notify('A command failed')
                upload_experiments(file_location, experiments, is_error=True, error_code=exit_code)
                sys.exit(1)
            else:
                experiments[cmd] = EXPERIMENT_DONE
                logline('Done with job', cmd)
                try_notify('Done with job ' + str(idx))
                upload_experiments(file_location, experiments)
            break
    return did_job
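# For context: get_experiments is not part of this excerpt. Judging by upload_experiments,
# the experiments file is a JSON object mapping command strings to state constants, so the
# loader presumably looks roughly like the sketch below. The constant values and the exact
# implementation are assumptions, not the project's actual code.
import json

EXPERIMENT_NOT_DONE = 0  # assumed value
EXPERIMENT_BUSY = 1      # assumed value
EXPERIMENT_DONE = 2      # assumed value


def get_experiments(file_location: str) -> dict:
    """Hypothetical counterpart of upload_experiments: read the JSON
    experiments file and return the command -> state mapping."""
    with open(file_location, 'r') as experiments_file:
        return json.loads(experiments_file.read())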
def get_features():
    file = get_pd_file()
    logline('Length before filtering is', len(file))
    f = filter_users(file)
    logline('Length after filtering is', len(f))
    rows = len(f)
    f = group_pd_file(f)
    gen_features(f, rows)
def get_pd_file() -> pd.DataFrame:
    logline('Opening file')
    dataset_name = get_dataset_name()
    return pd.read_hdf(io.get('input_file'), dataset_name,
                       start=0, stop=calc_rows_amount(), chunksize=1000)
def upload_experiments(file_location: str, experiments: Dict[str, int],
                       is_error=False, error_code=0):
    with open(file_location, 'w+') as experiments_file:
        if is_error:
            logline('Set state to error state with code', error_code)
            experiments["error_code"] = error_code
        else:
            logline('Updated experiments file')
        experiments_file.write(json.dumps(experiments))
def main():
    if not io.run:
        return

    start_time = time.time()
    logline("Gathering features for", str(io.get('dataset_percentage')) + "% of rows",
            "using a batch size of", BATCH_SIZE)
    get_features()
    # get_features_iter()
    logline('Total runtime is', Timer.stringify_time(Timer.format_time(time.time() - start_time)))
    sys.exit()
def split_dataframe(f: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Try to get close to the target split
    training_set = list()
    test_set = list()

    index = 10
    logline('Splitting dataframes')
    grouped = f.groupby(np.arange(len(f)) // (len(f) / 10))
    for g, dataframe in grouped:
        if index <= TRAINING_SET_PERCENTAGE:
            training_set.append(dataframe)
        else:
            test_set.append(dataframe)
        index += 10

    # noinspection PyTypeChecker
    return pd.concat(training_set), pd.concat(test_set)
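# Illustrative usage of split_dataframe, assuming TRAINING_SET_PERCENTAGE is 70 (the actual
# constant is defined elsewhere in the project): a 100-row frame is cut into ten 10-row
# chunks, the first seven of which land in the training set and the last three in the test set.
import numpy as np
import pandas as pd

df = pd.DataFrame({'value': np.arange(100)})
train, test = split_dataframe(df)
print(len(train), len(test))  # 70 30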
def output_data(users_list: List[Dict[str, Union[str, Dict[str, List[List[float]]]]]]):
    if io.get('output_file') == 'stdout':
        logline('Outputting to stdout')
        sys.stdout.write(json.dumps(users_list))
    else:
        logline('Outputting data to file', io.get('output_file'))
        with open(io.get('output_file'), 'wb') as output:
            try:
                pickle.dump(users_list, output, protocol=4)
            except Exception:
                try:
                    logline("Using JSON instead")
                    # The file is opened in binary mode, so the JSON string must be encoded
                    output.write(json.dumps(users_list).encode('utf-8'))
                except Exception:
                    error('Outputting to console instead')
                    print(json.dumps(users_list))
                    raise
                # Re-raise the original pickling error so the caller still sees the failure
                raise
        logline('Done outputting data to file')
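# Based on output_data's type annotation, each entry in users_list is a dict mixing string
# fields with a nested mapping of dataset names to lists of feature rows. The field names
# below are assumptions used purely to illustrate the expected shape.
example_user = {
    'user_name': 'U12',                      # assumed field name
    'datasets': {                            # assumed field name
        'training': [[0.1, 0.2], [0.3, 0.4]],
        'test': [[0.5, 0.6]],
    },
}
output_data([example_user])  # pickles (or JSON-falls back) to io.get('output_file')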
def filter_users(f: pd.DataFrame) -> pd.DataFrame:
    logline('Generating anonymous users filter')
    anonymous_users_filter = ~(f['source_user'].str.contains('ANONYMOUS') &
                               f['source_user'].str.contains('LOGON'))
    if io.get('users_only'):
        debug('Skipping all computer users')
        logline('Generating computer users filter')
        computer_users_filter = ~(f['source_user'].str.startswith('C') &
                                  f['source_user'].str.endswith('$'))
        full_filter = anonymous_users_filter & computer_users_filter
    else:
        full_filter = anonymous_users_filter

    logline('Applying filters')
    return f[full_filter]
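# A small illustration of what the two filters remove. The sample source_user values are
# simplified and purely illustrative; they are not taken from the project's data.
import pandas as pd

sample = pd.DataFrame({'source_user': ['U12', 'ANONYMOUS LOGON', 'C625$']})
anonymous = ~(sample['source_user'].str.contains('ANONYMOUS') &
              sample['source_user'].str.contains('LOGON'))
computers = ~(sample['source_user'].str.startswith('C') &
              sample['source_user'].str.endswith('$'))
print(sample[anonymous & computers]['source_user'].tolist())  # ['U12']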
def gen_features(f: pd.DataFrame, row_amount: int):
    users_list = list()

    logline('Calculating amount of groups...')
    users = len(f)
    logline('There are', users, 'users and', row_amount, 'rows matching your filter type',
            'no computer users or anonymous users' if io.get('users_only') else 'no anonymous users')

    rows = 0
    max_users = users
    if not DO_ROWS_PERCENTAGE:
        max_users = int(math.ceil(users * 0.01 * io.get('dataset_percentage')))
        logline('Max amount of users is', max_users)

    logline('Setting timer for', int(math.ceil(row_amount * 0.01 * io.get('dataset_percentage'))), 'rows')
    timer = Timer(int(math.ceil(row_amount * 0.01 * io.get('dataset_percentage'))))

    logline('Creating iterator')
    dataset_iterator = DFIterator(f)

    next_report = REPORT_SIZE
    if not SKIP_MAIN:
        try:
            # Create groups of approx 1000 users big
            if io.get('cpus') == 1:
                logline('Only using a single CPU')
                logline('Starting feature generation')
                for name, group in f:
                    completed_result, group_len = strip_group_length(gen_features_for_user((name, group)))
                    timer.add_to_current(group_len)
                    rows += group_len
                    if completed_result is not None:
                        users_list.append(completed_result)
                    if rows > next_report or REPORT_EVERY_USER:
                        next_report = next_report + REPORT_SIZE
                        logline('At row ', str(rows), '/~', str(row_amount),
                                ' - ETA is: ' + timer.get_eta(), spaces_between=False)
                        logline('At user ', len(users_list), '/~', max_users, spaces_between=False)
                    if len(users_list) >= max_users:
                        break
            else:
                logline('Using', io.get('cpus'), 'cpus')
                for i in range(round(math.ceil(max_users / PROCESSING_GROUP_SIZE))):
                    dataset_iterator.set_max((i + 1) * PROCESSING_GROUP_SIZE)
                    if i == 0:
                        logline('Starting feature generation')
                    with multiprocessing.Pool(io.get('cpus')) as p:
                        for completed_result in p.imap_unordered(gen_features_for_user,
                                                                 dataset_iterator, chunksize=100):
                            completed_result, group_len = strip_group_length(completed_result)
                            timer.add_to_current(group_len)
                            rows += group_len
                            if completed_result is not None:
                                users_list.append(completed_result)
                            if rows > next_report or REPORT_EVERY_USER:
                                next_report = next_report + REPORT_SIZE
                                logline('At row ', str(rows), '/~', str(row_amount),
                                        ' - ETA is: ' + timer.get_eta(), spaces_between=False)
                                logline('At user', len(users_list), '/~', max_users, spaces_between=False)
        except KeyboardInterrupt:
            logline('User cancelled execution, wrapping up')
            debug('Cancelled early at', len(users_list), 'instead of', users)
            debug('You skipped a total of', users - len(users_list), 'users, or',
                  100 - ((len(users_list) / users) * 100), '%')
        except Exception:
            error('An error occurred during execution', traceback.format_exc())
            debug('Salvaging all remaining users')
        finally:
            debug('Runtime is', timer.report_total_time())
            logline("Did a total of", len(users_list), "users")
            logline('Done gathering data')
            logline('Closing file...')
            output_data(users_list)
    else:
        debug('SKIPPING MAIN, DO NOT ENABLE IN PRODUCTION')
        logline('Closing file')
        output_data([])
def extract_features(rows):
    users_list = list()
    users = len(rows)
    rows_amount = 0
    logline('There are', users, 'users and', len(rows), 'rows matching your filter type',
            'no computer users or anonymous users' if io.get('users_only') else 'no anonymous users')

    rows_max = get_dict_inner_length(rows)
    logline('Setting timer for', rows_max, 'rows')
    timer = Timer(rows_max)

    # Process every user in the dict; report progress every REPORT_SIZE rows
    max_users = users
    next_report = REPORT_SIZE
    try:
        for name, group in rows.items():
            completed_result, group_len = strip_group_length(gen_features_for_user((name, group)))
            timer.add_to_current(group_len)
            rows_amount += group_len
            if completed_result is not None:
                users_list.append(completed_result)
            if rows_amount > next_report or REPORT_EVERY_USER:
                next_report = next_report + REPORT_SIZE
                logline('At row ', str(rows_amount), '/~', str(rows_max),
                        ' - ETA is: ' + timer.get_eta(), spaces_between=False)
                logline('At user ', len(users_list), '/~', max_users, spaces_between=False)
            if len(users_list) >= max_users:
                break
    except KeyboardInterrupt:
        logline('User cancelled execution, wrapping up')
        debug('Cancelled early at', len(users_list), 'instead of', users)
        debug('You skipped a total of', users - len(users_list), 'users, or',
              100 - ((len(users_list) / users) * 100), '%')
    except Exception:
        error('An error occurred during execution', traceback.format_exc())
        debug('Salvaging all remaining users')
    finally:
        debug('Runtime is', timer.report_total_time())
        logline("Did a total of", len(users_list), "users")
        logline('Done gathering data')
        logline('Closing file...')
        output_data(users_list)
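# extract_features sizes its timer with get_dict_inner_length, which is not included in this
# excerpt. Presumably it just sums the lengths of the per-user groups, along the lines of the
# sketch below; this is an assumption, not the project's actual implementation.
def get_dict_inner_length(rows: dict) -> int:
    """Hypothetical helper: total number of rows across all per-user groups."""
    return sum(len(group) for group in rows.values())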
def filter_users(f: pd.DataFrame) -> pd.DataFrame:
    logline('Generating anonymous users filter')
    anonymous_users_filter = ~(f['source_user'].str.contains('ANONYMOUS') &
                               f['source_user'].str.contains('LOGON'))
    if io.get('users_only'):
        debug('Skipping all computer users')
        logline('Generating computer users filter')
        # Use endswith instead of a regex contains('$'), which would match every row
        computer_users_filter = ~(f['source_user'].str.startswith('C') &
                                  f['source_user'].str.endswith('$'))
        logline('Filtering out', len(list(filter(lambda x: x, ~computer_users_filter))), 'computer users')
        full_filter = anonymous_users_filter & computer_users_filter
    else:
        full_filter = anonymous_users_filter

    logline('Filtering out', len(list(filter(lambda x: x, ~anonymous_users_filter))), 'anonymous users')
    logline('Filtering out a total of', len(list(filter(lambda x: x, ~full_filter))), 'rows')
    logline('Applying filters')
    return f[full_filter]
def group_pd_file(f: pd.DataFrame) -> pd.DataFrame:
    logline('Grouping users in file')
    grouped = group_df(f)
    logline('Done grouping users')
    return grouped
def main():
    if not io.run:
        return

    state_file = io.get('state_file')
    input_file = io.get('input_file')
    output_file = io.get('output_file')
    dataset_file = io.get('dataset_file')

    logline('Loading dataset file...')
    f = pd.read_hdf(dataset_file, get_dataset_name(), start=0, stop=calc_rows_amount())
    logline('Filtering users')
    f = filter_users(f)
    logline('Grouping users')
    f = group_df(f)

    if state_file is not None:
        initial_state = get_state(state_file)
        logline('Waiting for state to reach different value, currently at ' + str(initial_state) + '...')
        while get_state(state_file) == initial_state:
            time.sleep(60)
        logline('State file has switched to ' + str(get_state(state_file)) + ', continuing execution')

    logline('Loading anomalies')
    anomalies = read_anomalies(input_file)

    anomaly_rows_list = dict()

    users = len(f)
    max_users = users
    if DO_ROWS_PERCENTAGE:
        max_users = math.ceil(users * 0.01 * io.get('dataset_percentage'))

    timer = Timer(math.ceil(len(f) * 0.01 * io.get('dataset_percentage')))
    for name, group in f:
        user_name = group.iloc[0].get('source_user').split('@')[0]
        anomaly_collection = anomalies.get(user_name)
        if anomaly_collection is not None:
            # Collect the anomalous rows for this user
            user_anomalies = list()
            for anomaly in anomaly_collection:
                anomaly_dict = {
                    "start": anomaly["start"],
                    "end": anomaly["end"],
                    "lines": listify_df(group.iloc[anomaly["start"]:anomaly["end"]]),
                    "final_features": translate_feature_arr(anomaly["final_row_features"]),
                    "predicted": anomaly["predicted"],
                    "actual": anomaly["actual"],
                    "loss": anomaly["loss"]
                }
                user_anomalies.append(anomaly_dict)
            anomaly_rows_list[user_name] = user_anomalies

        timer.add_to_current(1)
        if timer.current % REPORT_SIZE == 0:
            logline('ETA is ' + timer.get_eta())
        if timer.current >= max_users:
            break

    debug('Runtime is', timer.report_total_time())
    logline('Generating concatenated results')
    if output_file == 'stdout':
        logline("Outputting results to stdout\n\n\n")
        logline('Final value is', anomaly_rows_list)
        logline(json.dumps(anomaly_rows_list))
    else:
        logline('Outputting results to', output_file)
        with open(output_file, 'w') as out_file:
            out_file.write(json.dumps(anomaly_rows_list))
        logline('Output results to', output_file)

    if REMOVE_INPUT_FILE:
        os.remove(input_file)
        logline('Removed encoded file')
    else:
        logline('Not Removing encoded file')

    logline('Done, closing files and stuff')
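# The state-file polling above relies on get_state, which is not part of this excerpt.
# Presumably it just reads whatever value the state file currently holds, roughly as sketched
# below; the exact return type and implementation are assumptions.
def get_state(state_file: str) -> str:
    """Hypothetical helper: return the current contents of the state file,
    so the caller can poll until the value changes."""
    with open(state_file, 'r') as f:
        return f.read().strip()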
if __name__ == '__main__':
    start_time = time.time()
    main()
    logline('Total runtime is', Timer.stringify_time(Timer.format_time(time.time() - start_time)))
def main():
    experiments_file_location = io.get('experiments_file')
    if experiments_file_location is None:
        logline('Experiment file is not specified, please do so')
        sys.exit(2)

    logline('Starting')
    try:
        logline('Starting jobs')
        while do_first_job(experiments_file_location):
            logline('Did job')
        logline('Completed successfully!')
    except KeyboardInterrupt:
        logline('Cancelling run due to user interrupt')
        sys.exit(130)

    logline('Done')
def ensure_trailing_slash(folder_name: str) -> str:
    """Makes sure a string ends with a slash"""
    if not folder_name.endswith('/'):
        return folder_name + '/'
    return folder_name


def ensure_folder(folder_name: str):
    """Checks if a folder exists and if it doesn't, makes it"""
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)


def gen_folders():
    """Generates all folders needed for the process"""
    root_folder = ensure_trailing_slash(io.get('folder'))
    ensure_folder(root_folder)
    ensure_folder(root_folder + 'logs/')
    ensure_folder(root_folder + 'plots/')


def main():
    gen_folders()


if __name__ == "__main__":
    logline('Starting folder generation')
    main()
    logline('Done generating folders')