async def check_for_alert_match(self):
    urls = [
        'https://twitter.com/CFTC',
        'https://twitter.com/sec_enforcement?lang=en',
        'https://twitter.com/ushouserep?lang=en'
    ]
    with open('ignore-lines.json', 'r') as f:
        strip_texts = json.load(f)
    Log.d('checking {} sources, ignoring {} lines..', len(urls), len(strip_texts))
    patterns = [
        r'.{,200}bitcoin.{,200}',
        r'.{,200}crypto.{,200}',
        r'.{,200}virtual currency.{,200}',
    ]
    for url in urls:
        async with aiohttp.ClientSession() as session:
            html_text = await self.__fetch(session, url)
            text = StringExpert.strip_tags(html_text)
            text = html.unescape(text)
            for strip_text in strip_texts:
                text = text.replace(strip_text, '')
            for pattern in patterns:
                match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
                if match is not None:
                    matched_line = match.group()
                    warning = 'Found pattern "{}" at url "{}" in line: {}'.format(pattern, url, matched_line)
                    Log.w(warning)
                    return True
    return False
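# The coroutine above calls a private self.__fetch(session, url) helper that is not
# shown in this excerpt. A minimal sketch of what such a method might look like with
# aiohttp (an assumption, not the original implementation):
async def __fetch(self, session, url):
    # issue a GET request and return the response body as text
    async with session.get(url) as response:
        return await response.text()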
def process_h5(self):
    with pd.HDFStore(self.h5_filepath, mode='r') as h5:
        for jobuid in h5:
            is_first_encounter = jobuid not in self.job_frames
            if is_first_encounter:
                # fetch first row to get the first index/epoch
                self.job_frames[jobuid] = pd.read_hdf(h5, jobuid, start=0, stop=1)
            job_df = self.job_frames[jobuid]
            # will ensure we don't 'miss' any rows in case the handle count jumps more than once
            latest_epoch = job_df.index.values[-1]
            where_clause = 'index > {} {}'.format(latest_epoch, self.contraints_clause)
            new_df = pd.read_hdf(h5, jobuid, where=where_clause)
            if new_df.empty:
                Log.w('dataset was empty for key {} and index > {}', jobuid, latest_epoch)
            else:
                assert new_df.index.values[0] > latest_epoch
                new_first_index = 0 if is_first_encounter else len(job_df)
                joined = pd.concat([job_df, new_df])
                self.job_frames[jobuid] = joined
                if len(joined) > 100000:
                    Log.w('holding a dataset of significant length ({} rows, {:.1f}mb): {}',
                          len(joined), joined.memory_usage().sum() / 1_000_000, jobuid)
                assert joined.shape[0] == len(job_df) + len(new_df)
                self.ensure_strictly_increasing_index(joined)  # TODO: remove once this is confirmed
                self.row_handler(jobuid, joined, new_first_index)
def __init__(self, h5_filepath, version):
    warnings.simplefilter('ignore', NaturalNameWarning)
    h5_inputfile = Path(h5_filepath)
    output_dirpath = AppConfig.setting('PREDICTOR_DATA_DIRPATH')
    self.h5_out_filepath = os.path.join(output_dirpath, h5_inputfile.name)
    h5_out_file = Path(self.h5_out_filepath)
    if h5_out_file.exists():
        Log.i('overwrite file?: {}', h5_out_file)
        if not OsExpert.prompt_confirm('File already exists, overwrite? {}'.format(h5_out_file)):
            Log.d('user aborted, exiting')
            exit()
        Log.w('removing file: {}', h5_out_file)
        os.remove(self.h5_out_filepath)
    self.predictors_map = {}
    base_filepath = output_dirpath
    with pd.HDFStore(h5_filepath, mode='r') as h5:
        keys = h5.keys()
        Log.i('h5 input keys: {}', keys)
        assert len(keys) == 1, 'hardcoded restriction on single key was violated'
        for key in keys:
            Log.i('row count for {}: {}', key, h5.get_storer(key).nrows)
            self.predictors_map[key] = [
                EnsemblePredictor(min_predict_generator_size=2000, max_train_size=5000)
            ]
    self.h5_watcher = H5FileWatcher(h5_filepath, self.handle_job_epoch, {'is_simulated': 0})
def filter_simulated_observations(self, df):
    filtered_df = df[df['is_simulated'] != 1]
    dropped = df[~df.index.isin(filtered_df.index)]
    if len(dropped) > 0:
        Log.w('filtered out {} simulated frames', len(dropped))
    else:
        Log.d('no simulated frames found to filter out')
    return filtered_df
def feed_jobs_forever(self, job_changed_handler):
    assert job_changed_handler is not None
    sleep_seconds = self.sleep_seconds
    transaction_min_timestamp = self.transaction_min_timestamp
    start_transaction_min_timestamp = transaction_min_timestamp
    data_dirpath = self.data_dirpath
    start_time = time.time()
    Log.i('processing transactions, sleep interval {}s, starting from epoch {} ({})',
          sleep_seconds, transaction_min_timestamp,
          StringExpert.format_timestamp(transaction_min_timestamp))
    to_fetch_count = self.db.transaction_count(transaction_min_timestamp)
    Log.d('transaction count since {} ({}): {}', transaction_min_timestamp,
          StringExpert.format_timestamp(transaction_min_timestamp), to_fetch_count)
    pd.set_option('io.hdf.default_format', 'table')
    hdf5_filename = '{}_{}_{}.h5'.format(
        self.version.major, self.version.minor,
        datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S'))
    hdf5_filepath = path.join(data_dirpath, hdf5_filename)
    Log.i('hdf5 output filepath is: \n{}', hdf5_filepath)
    set_size = 1000
    fetch_count = 0
    plot_time = time.time()
    is_realtime = False
    while True:
        try:
            next_transaction_min_timestamp = self.process_transaction_subset(
                transaction_min_timestamp, set_size, hdf5_filepath, job_changed_handler, is_realtime)
            if next_transaction_min_timestamp is None:
                Log.d('nothing to process, waiting..')
                is_realtime = True  # TODO: empty polling perhaps not the best indicator of switch to realtime
                time.sleep(sleep_seconds)
            else:
                assert next_transaction_min_timestamp > transaction_min_timestamp, \
                    'next minimum timestamp was not greater than the current timestamp'
                transaction_min_timestamp = next_transaction_min_timestamp
                fetch_count += set_size
                percentage = 100 * fetch_count / to_fetch_count
                current_time = time.time()
                Log.d('processed {}/{}, {}%, spent {} on the period {} ({}) to {} ({})',
                      fetch_count, to_fetch_count, int(percentage),
                      Timespan.from_seconds(int(current_time - start_time)).as_string(),
                      StringExpert.format_timestamp(start_transaction_min_timestamp),
                      start_transaction_min_timestamp,
                      StringExpert.format_timestamp(transaction_min_timestamp),
                      transaction_min_timestamp)
        except Exception as e:
            raise Exception('Failed to process nonparsed api responses') from e
    # unreachable: the while-loop above never breaks (and row_count is not defined at this point)
    Log.w('all {} rows read, but should loop forever', row_count)
def __predict(self, df):
    max_prediction_count = 100
    if self.predict_count >= max_prediction_count:
        Log.w('too many predictions ({}) reached, exiting', self.predict_count)
        exit()
    assert len(df) == 1
    X_all, y_all = self.frame_to_ml_inputs(df)
    predict_row = X_all.iloc[0]
    Log.d('predicting based on {} values:\n{}', len(predict_row.values),
          predict_row.squeeze().sort_index())
    prediction_response = self.predictor.predict(predict_row.values)
    prediction = self.sagemaker_response_highest_score_label(prediction_response)
    self.predict_count += 1
    return prediction
def handle_job_epoch(self, jobuid, df, start_index):
    Log.d('handling block starting at index {} for key: {}', start_index, jobuid)
    try:
        df = self.filter_simulated_observations(df)
        if len(df) == 0:
            Log.w('no rows to process')
            return
        handle_start_time = time.time()
        new_df = df.iloc[start_index:]  # .copy()
        if new_df.empty:
            Log.w('nothing to process (zero rows) starting from index {}', start_index)
            return
        indices = new_df.index.values  # list(range(start_index, len(df)))
        index_count = len(new_df)
        predictors = self.predictors_map[jobuid]
        Log.d('feeding indices, epochs [{}..{}] ({} rows) to {} predictors for key: {}',
              indices[0], indices[-1], len(indices), len(predictors), jobuid)
        processed_count = 0
        prediction_count = 0
        for epoch, row in new_df.iterrows():
            for predictor in predictors:
                try:
                    prediction = predictor.process(epoch, df)
                    if prediction is not None:
                        col_name = predictor.prefix
                        new_df.at[epoch, col_name] = prediction
                        prediction_count += 1
                        self.print_acc(new_df)
                        Log.d('prediction: {}', prediction)
                        Log.d('prediction {} received, sleeping..', prediction_count)
                        time.sleep(3)
                except Exception as predict_error:
                    raise Exception('Failed to feed epoch {} to predictor {}'.format(
                        epoch, type(predictor).__name__)) from predict_error
            processed_count += 1
            # guard against modulo-by-zero when fewer than 10 rows are processed
            if processed_count % max(1, index_count // 10) == 0:
                percentage_processed = 100 * processed_count / index_count
                Log.d('..processed {}/{} {:.1f}%', processed_count, index_count, percentage_processed)
        Log.d('fed predictors on {} new rows (now {} in total) in {:.1f}s',
              index_count, len(df), time.time() - handle_start_time)
        try:
            h5_start_time = time.time()
            with pd.HDFStore(self.h5_out_filepath, mode='a') as h5:
                h5.append(jobuid, new_df, format='table', data_columns=True)
                row_count = h5.get_storer(jobuid).nrows
                Log.d('h5 row count is now: {}', row_count)
            Log.d('appended {}/{} rows to hdf5 in {:.1f}s', index_count, len(df),
                  time.time() - h5_start_time)
        except Exception as h5e:
            raise Exception('Failed to write to hdf file') from h5e
    except Exception as e:
        raise Exception('Failed to handle epoch') from e
    Log.d('finished handling block')
def parse_and_persist_as_transaction_maybe(datafetch_api_response, parser, db):
    try:
        transaction = ParseUtil.__parse_and_persist_as_transaction(datafetch_api_response, parser, db)
    except DuplicateInsertException as e:
        Log.w('db rejected transaction as a duplicate: {}', datafetch_api_response)
        return False
    except Exception as e:
        Log.e('Failed to parse and store transaction from api_response: {}', datafetch_api_response)
        raise e
    return True
def __run(self):
    Log.d('Watching file: {}', self.h5_filepath)
    thread = FileWatcher(self.h5_filepath, modified=self.handle_change).run_async()
    try:
        while self.handle_event.wait():
            if self.last_handle_count is not None:
                jump_count = self.handle_count - self.last_handle_count
                if jump_count > 1:
                    Log.w('handle count has jumped {} times since the last processing', jump_count)
            self.last_handle_count = self.handle_count
            self.process_h5()
            self.handle_event.clear()
    finally:
        Log.w('run loop broken, unwatching file: {}', self.h5_filepath)
        thread.stop()
        thread.join()
def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
    """From SM examples. Like here:
    https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb"""
    file = Path(filepath)
    s3 = boto3.resource('s3')
    key = channel + '/' + file.name
    bucket_ref = s3.Bucket(bucket)
    objs = list(bucket_ref.objects.filter(Prefix=key))
    is_file_already_existing = len(objs) > 0 and objs[0].key == key
    if is_file_already_existing:
        if skip_if_name_and_size_matches:
            s3_client = boto3.client('s3')
            response = s3_client.head_object(Bucket=bucket, Key=key)
            local_size = file.stat().st_size
            remote_size = response['ContentLength']
            if remote_size == local_size:
                Log.w('skipping upload as s3 key of same size ({:.2f}kb) already exists: {}',
                      local_size / 1000, key)
                return
        Log.w('overwriting existing s3 key: {}', key)
    with open(filepath, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data)
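# Hypothetical usage of upload_to_s3 above, assuming a module-level `bucket` variable
# naming the target S3 bucket (as the helper itself does) and an illustrative local path:
#
#     upload_to_s3('train', '/tmp/sagemaker_train.csv', skip_if_name_and_size_matches=True)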
def __verify_datafetch_apis_write_frequency(self):
    Log.d('watcher check initiating')
    datafetch_apis_frame = self.db.datafetch_api_view_frame()
    if datafetch_apis_frame.empty:
        Log.d('no datafetch apis to watch')
    else:
        exceed_count = 0
        for i, row in datafetch_apis_frame.iterrows():
            datafetch_api_id = row['id']
            write_idle_seconds = row['write_idle_seconds']
            result_frequency_seconds = row['result_frequency_seconds']
            if write_idle_seconds > result_frequency_seconds:
                exceed_count += 1
                idle_time_str = Timespan.from_seconds(write_idle_seconds).as_string()
                warn_message = 'datafetch api id {} has exceeded its {} second limit (idle time {})'.format(
                    datafetch_api_id, result_frequency_seconds, idle_time_str)
                Log.w(warn_message)
        Log.d('watch check done, exceed count: {}', exceed_count)
def write_csv(self, df):
    if self.write_count > 0:
        Log.w('ignoring csv write because it has already been performed')
        return
    X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True)
    assert len(X_all) == len(y_all)
    if X_all.empty:
        Log.w('no rows to write!')
        return
    y_null_count = y_all.isnull().sum()
    assert y_null_count == 0, 'null count: {}'.format(y_null_count)
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=123)
    Log.d('X train shape: {}, X test shape: {}', X_train.shape, X_test.shape)
    train = pd.concat([X_train, y_train], axis=1)
    test = pd.concat([X_test, y_test], axis=1)
    is_first_write = (self.write_count == 0)
    for frame, filepath in ((train, self.train_filepath), (test, self.test_filepath)):
        Log.d('writing csv: {}', filepath)
        frame.to_csv(filepath, sep=',', na_rep='', index=False, header=is_first_write,
                     decimal='.', mode='a')
    with open(self.meta_filepath, 'w') as f:
        f.write(json.dumps(
            {
                'train_filename': Path(self.train_filepath).name,
                'test_filename': Path(self.test_filepath).name,
                'train_observation_count': len(X_train),
                'test_observation_count': len(X_test),
                'feature_count': X_all.shape[1]
            },
            indent=4  # , sort_keys=True
        ))
    self.write_count += 1
    Log.i('done writing csv file, write count is now: {}', self.write_count)
    if self.is_train_async:
        Log.d('propagating notification that csv has been written')
        self.csv_changed_event.set()
    else:
        self.create_predictor_from_csv()
def __init__(self, csv_filepath, min_row_count, is_train_async, min_predict_generator_size):
    super().__init__(predict_col='feature_rtrspc()_next_trend_pricefeature')
    csv_file = Path(csv_filepath)
    assert csv_file.is_dir(), csv_filepath  # csv_filepath is expected to be a directory
    self.csv_filepath = csv_filepath
    self.write_count = 0
    self.predict_count = 0
    self.train_filepath = os.path.join(csv_filepath, 'sagemaker_train.csv')
    self.test_filepath = os.path.join(csv_filepath, 'sagemaker_test.csv')
    self.meta_filepath = os.path.join(csv_filepath, 'sagemaker.json')
    if Path(self.train_filepath).exists():
        os.remove(self.train_filepath)
    if Path(self.test_filepath).exists():
        os.remove(self.test_filepath)
    if Path(self.meta_filepath).exists():
        os.remove(self.meta_filepath)
    self.predictor = None
    self.is_train_async = is_train_async
    self.csv_changed_event = Event() if is_train_async else None
    self.min_predict_generator_size = min_predict_generator_size
    if is_train_async is True:
        Log.w('is_train_async is not supported yet')
        exit()
        # unreachable while async training is disallowed above
        Thread(target=self.csv_write_event_handler).start()
def process_transaction_subset(self, transaction_min_timestamp, set_size, hdf5_filepath,
                               job_changed_handler, is_realtime):
    assert job_changed_handler is not None, 'no job_changed_handler provided'
    window_size = 10
    subset_process_start_time = time.time()
    frame = self.db.transaction_by_timestamp_frame(
        transaction_min_timestamp, set_size, self.from_currency_ids, self.to_currency_ids)
    frame.set_index('epoch_time', inplace=True)
    row_count = frame.shape[0]
    Log.d('...time spent fetching subset ({} rows) from db: {:.2f}s',
          row_count, time.time() - subset_process_start_time)
    if row_count == 0:
        return None
    row_process_count = 0
    last_epoch_time = None
    Log.d('...processing rows...')
    row_process_start_time = time.time()
    gap_resolver = self.run_config['gap_resolver']
    for epoch_time, row in frame.iterrows():
        is_row_processed = False
        try:
            transaction_id = row['id']
            datasource_id = row['datasource_id']
            exchange_id = row['exchange_id']
            from_currency_id = row['from_currency_id']
            to_currency_id = row['to_currency_id']
            price = np.float64(row['price'])
            volume = np.float64(row['volume'])
            transaction_min_timestamp = epoch_time  # transaction_id + 1
            seconds_since_previous = 0 if last_epoch_time is None else epoch_time - last_epoch_time
            Log.t('seconds since previous epoch time: {}', seconds_since_previous)
            if last_epoch_time is not None:
                assert epoch_time >= last_epoch_time, \
                    'epoch time ({}) was less than the previous epoch time ({})'.format(
                        epoch_time, last_epoch_time)
            assert seconds_since_previous >= 0, 'seconds_since_previous cannot be a negative value'
            last_epoch_time = epoch_time
            for job in self.jobs:
                if (job.datasource.id == datasource_id and job.exchange.id == exchange_id
                        and job.from_currency.id == from_currency_id
                        and job.to_currency.id == to_currency_id):
                    is_row_processed = True
                    try:
                        h5frame = job.frame
                        if h5frame is not None:
                            # perform integrity check on an existing, i.e. non-empty, dataframe
                            assert not h5frame.empty  # should not be possible if the frame has previously been created
                            last_epoch = h5frame.index.values[-1]
                            seconds_since_previous = epoch_time - last_epoch
                            assert seconds_since_previous >= 0
                            max_gap_seconds = 120  # TODO: make config setting
                            if seconds_since_previous > max_gap_seconds:
                                warn_message = ('excessive time (+{}s) passed since previous observation: '
                                                '{}s ({}) between {} ({}) and {} ({})').format(
                                    max_gap_seconds, seconds_since_previous,
                                    Timespan.from_seconds(int(seconds_since_previous)).as_string(),
                                    last_epoch, StringExpert.format_timestamp(last_epoch),
                                    epoch_time, StringExpert.format_timestamp(epoch_time))
                                if gap_resolver is None:
                                    raise Exception(warn_message)
                                Log.w(warn_message)
                                prev_observation = h5frame.iloc[-1]
                                df_intermediates = gap_resolver.intermediates_frame(
                                    max_gap_seconds,
                                    from_epoch=last_epoch, to_epoch=epoch_time,
                                    from_price=prev_observation['latest'], to_price=price,
                                    from_volume=prev_observation['volume'], to_volume=volume)
                                Log.d('simulating intermediate observations:\n{}', df_intermediates)
                                simulated_count = 0
                                for intermediate_epoch, intermediate in df_intermediates.iterrows():
                                    job_observation = job.job_observe(
                                        value=intermediate['price'],
                                        epoch_time=intermediate_epoch,
                                        volume=intermediate['volume'],
                                        is_simulated=True,
                                        is_realtime=False)
                                    assert job_observation is not None
                                    simulated_count += 1
                                    if simulated_count % 1000 == 0:
                                        Log.d('..simulated {}/{}..', simulated_count, len(df_intermediates))
                                Log.i('done simulating {} observations up until epoch {} ({})',
                                      len(df_intermediates), epoch_time,
                                      StringExpert.format_timestamp(epoch_time))
                        try:
                            job_observation = job.job_observe(
                                value=price, epoch_time=epoch_time, volume=volume,
                                is_simulated=False, is_realtime=is_realtime)
                            row = job_observation  # job_observation_to_frame_row(volume, job_observation)
                            assert row is not None
                            job_changed_handler(job)
                        except DoubleObservationError as doe:
                            Log.w('epoch already in frame, will be ignored ({})', epoch_time)
                    except Exception as job_e:
                        raise Exception('Failed to feed row to job') from job_e
        except Exception as e:
            raise Exception('Failed to process row index {}'.format(epoch_time)) from e
        if is_row_processed:
            row_process_count += 1
    Log.d('...time spent processing {}/{} rows in time: {:.2f}s',
          row_process_count, frame.shape[0], time.time() - row_process_start_time)
    with pd.HDFStore(hdf5_filepath, mode='a') as h5:
        h5_process_start_time = time.time()
        start_observation_epoch = frame.index.values[0]
        for job in self.jobs:
            df_to_append = job.frame[job.frame.index >= start_observation_epoch]
            try:
                h5.append(job.uid, df_to_append, format='table', data_columns=True)
                row_count = h5.get_storer(job.uid).nrows
                Log.d('...h5 key {}, row count is {}', job.uid, row_count)
            except Exception as append_error:
                raise append_error
        Log.d('...time spent adding to h5: {:.2f}s', time.time() - h5_process_start_time)
    row_processing_time = time.time() - subset_process_start_time
    # guard against division by zero when no rows matched a job
    Log.d('...total time spent on subset: {:.2f}s ({:.2f}s per row)',
          row_processing_time, row_processing_time / max(row_process_count, 1))
    return transaction_min_timestamp
def frame_to_ml_inputs(self, df, do_filter=False, one_hot=True, max_train_size=None):
    assert self.predict_col in df, 'prediction column "{}" does not exist in columns {}'.format(
        self.predict_col, df.columns.values)
    predict_col = next(c for c in df.columns.values if c == self.predict_col)
    active_columns = [c for c in df.columns.values if c.endswith('_active')]
    if do_filter is True:  # TODO: are empty feature rows filtered out??
        Log.d('filtering out rows with all feature values empty or predictor column empty, if needed')
        df_no_all_empty_feature_values = df.loc[(df[active_columns] == 1).all(axis=1)]
        dropped = df[~df.index.isin(df_no_all_empty_feature_values.index)]
        if len(dropped) > 0:
            Log.w('all-cols-empty drop row count: {}', len(dropped))
        # no point in using rows with an empty prediction column
        df_no_empty_predict = df_no_all_empty_feature_values.dropna(subset=[predict_col])
        dropped = df_no_all_empty_feature_values[
            ~df_no_all_empty_feature_values.index.isin(df_no_empty_predict.index)]
        if len(dropped) > 0:
            Log.w('empty-predict-col drop row count: {}', len(dropped))
        df_filtered = df_no_empty_predict
        keep_ratio = len(df_filtered) / len(df)
        Log.d('frame row ratio after filtering: {}/{} = {:.2f}', len(df_filtered), len(df), keep_ratio)
        assert keep_ratio < 1
    else:
        df_filtered = df
    if max_train_size is not None:
        count_before = len(df_filtered)
        df_filtered = df_filtered.head(max_train_size)
        drop_count = count_before - len(df_filtered)
        Log.d('row count after train max size trim: {} - {} = {}',
              count_before, drop_count, len(df_filtered))
    float_feature_cols = set(c for c in df.columns.values
                             if not c.endswith(predict_col) and c.endswith('_feature'))
    category_feature_cols = set()
    for feature, postfix_pattern in feature_postfix_map.items():
        assert issubclass(feature, IntEnum), 'feature {} is not an int enum'.format(feature)
        cols = [c for c in df.columns.values
                if not c.endswith(predict_col) and postfix_pattern.match(c)]
        enum_values = [int(k) for k in feature]
        if one_hot is not True:
            category_feature_cols.update(cols)
        else:
            kwargs = {}
            for col in cols:
                for enum_value in enum_values:
                    enum_name = feature(enum_value).name
                    enum_value_col = '{}_{}'.format(col, enum_name.lower())
                    kwargs[enum_value_col] = (df_filtered[col] == enum_value).astype(np.float64)
                    category_feature_cols.add(enum_value_col)
            df_filtered = df_filtered.assign(**kwargs)
    feature_cols = list(float_feature_cols | category_feature_cols)
    assert len(feature_cols) > 0
    y_all = df_filtered[predict_col]
    assert not y_all.isnull().values.any(), 'one or more values in the predict series were not specified'
    X_all = df_filtered[feature_cols]
    X_all = X_all.fillna(0)
    assert not X_all.isnull().values.any(), \
        'one or more values in the input frame were not specified, although they should have been na-filled with zeros'
    return X_all, y_all
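# Toy illustration (not from the original codebase) of the one-hot expansion that
# frame_to_ml_inputs performs for categorical feature columns; the Trend enum and
# the 'feature_x_trend' column name are assumptions made for this example only.
from enum import IntEnum
import numpy as np
import pandas as pd

class Trend(IntEnum):
    DOWN = 0
    FLAT = 1
    UP = 2

toy = pd.DataFrame({'feature_x_trend': [0, 2, 1]})
kwargs = {}
for enum_value in [int(k) for k in Trend]:
    # one column per enum member, named <col>_<member>, holding 0.0 or 1.0
    one_hot_col = '{}_{}'.format('feature_x_trend', Trend(enum_value).name.lower())
    kwargs[one_hot_col] = (toy['feature_x_trend'] == enum_value).astype(np.float64)
toy = toy.assign(**kwargs)
# toy now carries feature_x_trend_down / feature_x_trend_flat / feature_x_trend_up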
def __create_predictor(self, df):
    Log.i('creating predictor on {} rows', len(df))
    assert not df.empty
    kfold = StratifiedKFold(n_splits=10)
    random_state = 2
    classifiers = []
    classifiers.append(SVC(random_state=random_state))
    classifiers.append(DecisionTreeClassifier(random_state=random_state))
    classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                                          random_state=random_state, learning_rate=0.1))
    classifiers.append(RandomForestClassifier(random_state=random_state))
    classifiers.append(ExtraTreesClassifier(random_state=random_state))
    classifiers.append(GradientBoostingClassifier(random_state=random_state))
    classifiers.append(MLPClassifier(random_state=random_state))
    classifiers.append(KNeighborsClassifier())
    classifiers.append(LogisticRegression(random_state=random_state))
    classifiers.append(LinearDiscriminantAnalysis())
    X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True, max_train_size=self.max_train_size)
    if X_all.empty:
        Log.w('could not create predictor as the preprocessing resulted in an empty dataframe')
        return
    X_train, X_test, Y_train, Y_test = train_test_split(X_all, y_all, test_size=0.2,
                                                        random_state=random_state)
    Log.d('train shape: X: {}, y: {}', X_train.shape, Y_train.shape)
    cv_results = []
    for classifier in classifiers:
        Log.d('performing cross val score for predictor {}', classifier)
        start_time = datetime.now()
        cv_results.append(cross_val_score(classifier, X_train, y=Y_train, scoring='accuracy',
                                          cv=kfold, n_jobs=core_count))
        Log.d('..done, time spent: {}', datetime.now() - start_time)
    cv_means = []
    cv_std = []
    for cv_result in cv_results:
        cv_means.append(cv_result.mean())
        cv_std.append(cv_result.std())
    cv_res = pd.DataFrame({
        'CrossValMeans': cv_means,
        'CrossValerrors': cv_std,
        'Algorithm': [
            'SVC', 'DecisionTree', 'AdaBoost', 'RandomForest', 'ExtraTrees', 'GradientBoosting',
            'MultipleLayerPerceptron', 'KNeighbors', 'LogisticRegression',
            'LinearDiscriminantAnalysis'
        ]})
    Log.d('cross val results:\n{}', cv_res)
    g = sns.barplot(x='CrossValMeans', y='Algorithm', data=cv_res, palette='Set3', orient='h',
                    **{'xerr': cv_std})
    g.set_xlabel('Mean Accuracy')
    g = g.set_title('Cross validation scores')
    Log.i('saving plot..')
    plt.savefig('!eb1_cross_val_score.png', edgecolor='none', format="png")
    DTC = DecisionTreeClassifier()
    adaDTC = AdaBoostClassifier(DTC, random_state=7)
    ada_param_grid = {'base_estimator__criterion': ['gini', 'entropy'],
                      'base_estimator__splitter': ['best', 'random'],
                      'algorithm': ['SAMME', 'SAMME.R'],
                      'n_estimators': [1, 2],
                      'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}
    gsadaDTC = GridSearchCV(adaDTC, param_grid=ada_param_grid, cv=kfold, scoring='accuracy',
                            n_jobs=core_count, verbose=1)
    gsadaDTC.fit(X_train, Y_train)
    ada_best = gsadaDTC.best_estimator_
    Log.d('gsadaDTC.best_score_: {}', gsadaDTC.best_score_)
    ExtC = ExtraTreesClassifier()
    ex_param_grid = {'max_depth': [None],
                     'max_features': [1, 3, 10],
                     'min_samples_split': [2, 3, 10],
                     'min_samples_leaf': [1, 3, 10],
                     'bootstrap': [False],
                     'n_estimators': [100, 300],
                     'criterion': ['gini']}
    gsExtC = GridSearchCV(ExtC, param_grid=ex_param_grid, cv=kfold, scoring='accuracy',
                          n_jobs=core_count, verbose=1)
    gsExtC.fit(X_train, Y_train)
    ExtC_best = gsExtC.best_estimator_
    Log.d('gsExtC.best_score_: {}', gsExtC.best_score_)
    RFC = RandomForestClassifier()
    rf_param_grid = {'max_depth': [None],
                     'max_features': [1, 3, 10],
                     'min_samples_split': [2, 3, 10],
                     'min_samples_leaf': [1, 3, 10],
                     'bootstrap': [False],
                     'n_estimators': [100, 300],
                     'criterion': ['gini']}
    gsRFC = GridSearchCV(RFC, param_grid=rf_param_grid, cv=kfold, scoring='accuracy',
                         n_jobs=core_count, verbose=1)
    gsRFC.fit(X_train, Y_train)
    RFC_best = gsRFC.best_estimator_
    Log.d('gsRFC.best_score_: {}', gsRFC.best_score_)
    GBC = GradientBoostingClassifier()
    gb_param_grid = {'loss': ['deviance'],
                     'n_estimators': [100, 200, 300],
                     'learning_rate': [0.1, 0.05, 0.01],
                     'max_depth': [4, 8],
                     'min_samples_leaf': [100, 150],
                     'max_features': [0.3, 0.1]}
    gsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold, scoring='accuracy',
                         n_jobs=core_count, verbose=1)
    gsGBC.fit(X_train, Y_train)
    GBC_best = gsGBC.best_estimator_
    Log.d('gsGBC.best_score_: {}', gsGBC.best_score_)
    SVMC = SVC(probability=True)
    svc_param_grid = {'kernel': ['rbf'],
                      'gamma': [0.001, 0.01, 0.1, 1],
                      'C': [1, 10, 50, 100, 200, 300, 1000]}
    gsSVMC = GridSearchCV(SVMC, param_grid=svc_param_grid, cv=kfold, scoring='accuracy',
                          n_jobs=core_count, verbose=1)
    gsSVMC.fit(X_train, Y_train)
    SVMC_best = gsSVMC.best_estimator_
    Log.d('gsSVMC.best_score_: {}', gsSVMC.best_score_)
    Log.w('quitting')
    exit()
def h5_to_plot(h5, from_epoch, to_epoch, filterInNth, agents, format_as_image):
    Log.d('============')
    Log.d(agents)
    agent_keys = [a for a in agents.split(',') if a]
    if len(agent_keys) == 0:
        return 'No agent selected'
    filterInNth = int(filterInNth)
    df_info = ''
    pd.options.display.float_format = '{:.2f}'.format
    df_info += '{}\n\n'.format(h5.info())
    for key in h5:
        where = 'index >= {} and index <= {}'.format(from_epoch, to_epoch)
        Log.d('where: {}', where)
        frame = pd.read_hdf(h5, key, where=where)
        if frame.empty:
            return 'Empty frame'
        df_info += '{}\n\n'.format(frame.describe())
        background_color = '#272822'
        minute_intervals = [
            12 * 60,  # 12 hours
        ]
        x = range(100)
        y = [a * 2 + random.randint(-20, 20) for a in x]
        fig, ax = plt.subplots(figsize=(23, 12))  # figsize=(28,21))
        fig.patch.set_facecolor(background_color)
        Log.t('building plot')
        is_image_format = int(format_as_image) == 1

        def label_connect(path_collection, labels, color=None):
            tooltip = mpld3.plugins.PointHTMLTooltip(path_collection, [
                '<span class="point-tooltip" style="color: {}">{} <span class="point-tooltip-key">{}<span><span>'
                .format(color, l, key) for l in labels
            ], voffset=100, hoffset=0)
            mpld3.plugins.connect(fig, tooltip)

        for agent_key in agent_keys:
            try:
                agent_name = agent_key.split('(')[0]
                Log.d('plotting agent: {} -> {}', agent_key, agent_name)
                agent = agent_map[agent_name]
                plot_title = ''
                col_prefix = 'feature_{}_'.format(agent_key)
                agent_plot = agent.plot(plot_title, None, frame, ax, is_image_format,
                                        label_connect=label_connect, filter_in_nth=filterInNth,
                                        cp=col_prefix)
                pe.style_plot(ax, plot_title)
            except KeyError as ke:
                Log.w('Valid keys are: {}', frame.keys())
                raise ke
        plot_dirpath = AppConfig.setting('PLOT_DIRPATH')
        plot_filepath = os.path.join(plot_dirpath, '{}.png'.format('some plot'))
        fig.patch.set_facecolor(style.backgroundColor)
        fig.tight_layout()
        if is_image_format:
            sio = BytesIO()
            fig.savefig(sio, facecolor=fig.get_facecolor(), edgecolor='none', format="png")
            html = '<img src="data:image/png;base64,{}"/>'.format(
                base64.encodebytes(sio.getvalue()).decode())
            return html
        mpld3.plugins.connect(fig, ZoomSizePlugin())
        return mpld3.fig_to_html(fig)
    # only reachable if the store contains no keys at all
    raise Exception('hmmm')
def job_observe(self, value, epoch_time, volume, is_simulated, is_realtime):
    if self.interval_stat is None:
        self.interval_stat = IntervalStat(self.interval_second, epoch_time)
    if self.frame is None:  # TODO: move to constructor
        self.frame = self.emptyFrame(self.interval_stat)
    istat = self.interval_stat
    if epoch_time in self.frame.index:
        raise DoubleObservationError(epoch_time)
    is_first_value_in_interval = epoch_time >= istat.interval_end_epoch
    if is_first_value_in_interval:
        self.frame.iloc[-1, self.frame.columns.get_loc('is_closing')] = 1
        istat.reset(epoch_time)
    istat.interval_observe(value=value, epoch_time=epoch_time)
    interval_value = istat.as_dict()
    assert volume is not None
    assert isinstance(volume, np.float64)
    self.frame.at[epoch_time] = {
        **interval_value,
        'volume': volume,
        'age': 0 if self.frame.empty else epoch_time - self.frame.index.values[0],
        'is_simulated': 1 if is_simulated is True else 0,
        'is_realtime': 1 if is_realtime is True else 0
    }
    total_time_spent = 0
    for f in self.features:
        try:
            start_time = time.time()
            feature_observation = f.feature_observe(
                epoch_time=epoch_time,
                interval_value=interval_value,
                history_frame=self.frame,
                is_first_value_in_interval=is_first_value_in_interval,
                is_reset_observation=False)
            time_spent = time.time() - start_time
            self.frame.at[epoch_time, f.col_prefix + 'time'] = time_spent
            if feature_observation is not None:
                assert isinstance(feature_observation, pd.Series) or isinstance(feature_observation, dict)
                kv_pairs = (feature_observation.iteritems()
                            if isinstance(feature_observation, pd.Series)
                            else feature_observation.items())
                for key, value in kv_pairs:
                    if value is None:
                        Log.w('feature agent returned the None value for job uid {}, feature {} and pair key {}',
                              self.uid, f.NAME, key)
                    elif value is not nan:
                        colname = f.col_prefix + key
                        assert colname in self.frame
                        self.frame.at[epoch_time, colname] = value
            total_time_spent += time_spent
        except Exception as e:
            raise Exception('Failed to feed value to feature "{}"'.format(f.col_prefix)) from e
    self.frame.at[epoch_time, 'time'] = total_time_spent
    if len(self.frame) % 500 == 0:
        self.print_stats()
    return self.frame.loc[epoch_time]