def detect_and_parse_new_disk_files_async(self):
    """Start watching ``self.dir_path`` for newly created files.

    A ``DirWatcher`` bound to ``self.handle_file_created`` is scheduled
    (non-recursively) on a fresh ``Observer``. The observer is started,
    stored on ``self.observer`` and returned to the caller.
    """
    Log.i('asynchronously detecting and parsing new disk files')
    on_created = DirWatcher(self.handle_file_created)
    observer = Observer()
    self.observer = observer
    observer.schedule(on_created, self.dir_path, recursive=False)
    observer.start()
    return observer
def __init__(self, h5_filepath, version):
    """Prepare the output file path, one predictor list per h5 key, and the file watcher.

    The output path is the configured ``PREDICTOR_DATA_DIRPATH`` joined with
    the input file's name. If that file already exists the user is asked to
    confirm; declining exits the process, confirming deletes the file.

    Args:
        h5_filepath: Path of the input HDF5 file; it must hold exactly one key.
        version: Not used within this constructor.
    """
    warnings.simplefilter('ignore', NaturalNameWarning)
    input_name = Path(h5_filepath).name
    output_dirpath = AppConfig.setting('PREDICTOR_DATA_DIRPATH')
    self.h5_out_filepath = os.path.join(output_dirpath, input_name)
    h5_out_file = Path(self.h5_out_filepath)
    if h5_out_file.exists():
        Log.i('overwrite file?: {}', h5_out_file)
        question = 'File already exists, overwrite? {}'.format(h5_out_file)
        if not OsExpert.prompt_confirm(question):
            Log.d('user aborted, exiting')
            exit()
        Log.w('removing file: {}', h5_out_file)
        os.remove(self.h5_out_filepath)
    self.predictors_map = {}
    with pd.HDFStore(h5_filepath, mode='r') as h5:
        keys = h5.keys()
        Log.i('h5 input keys: {}', keys)
        assert len(keys) == 1, 'harcoded restriction on single key was violated'
        for key in keys:
            Log.i('row count for {}: {}', key, h5.get_storer(key).nrows)
            predictor = EnsemblePredictor(
                min_predict_generator_size=2000,
                max_train_size=5000,
            )
            self.predictors_map[key] = [predictor]
    # Only non-simulated rows are requested from the watcher.
    self.h5_watcher = H5FileWatcher(
        h5_filepath,
        self.handle_job_epoch,
        {'is_simulated': 0},
    )
def email(sender, receiver, title, text, smtp_host=None, smtp_user=None, smtp_password=None, smtp_port=587):
    """Send a plain-text email through an SMTP server using STARTTLS.

    Args:
        sender: Value for the ``From`` header.
        receiver: Value for the ``To`` header.
        title: Value for the ``Subject`` header.
        text: Message body.
        smtp_host: SMTP host; read from ``AppConfig`` (``SMTP_HOST``) when None.
        smtp_user: Login user; read from ``AppConfig`` (``SMTP_USER``) when None.
        smtp_password: Login password; read from ``AppConfig``
            (``SMTP_PASSWORD``) when None.
        smtp_port: SMTP port.

    Raises:
        Exception: With message 'Failed to send email', chained to the
            underlying error, if anything in the process fails.
    """
    try:
        host = AppConfig.setting('SMTP_HOST') if smtp_host is None else smtp_host
        user = AppConfig.setting('SMTP_USER') if smtp_user is None else smtp_user
        password = (
            AppConfig.setting('SMTP_PASSWORD')
            if smtp_password is None
            else smtp_password
        )
        message = EmailMessage()
        message.set_content(text)
        message['Subject'] = title
        message['From'] = sender
        message['To'] = receiver
        Log.t('sending email')
        with smtplib.SMTP(host=host, port=smtp_port) as connection:
            connection.starttls(context=SSLContext(PROTOCOL_TLSv1_2))
            connection.login(user=user, password=password)
            connection.send_message(message)
            connection.quit()
        Log.t('sent email')
    except Exception as cause:
        raise Exception('Failed to send email') from cause
async def check_for_alert_match(self):
    """Fetch a fixed list of pages and look for cryptocurrency-related text.

    Each page is stripped of tags, HTML-unescaped and cleared of every
    string listed in ``ignore-lines.json`` before the patterns are applied.

    Returns:
        True as soon as any pattern matches on any page (the match is logged
        as a warning), otherwise False.
    """
    urls = [
        'https://twitter.com/CFTC',
        'https://twitter.com/sec_enforcement?lang=en',
        'https://twitter.com/ushouserep?lang=en',
    ]
    with open('ignore-lines.json', 'r') as ignore_file:
        strip_texts = json.load(ignore_file)
    Log.d('checking {} sources, ignoring {} lines..', len(urls), len(strip_texts))
    # Each pattern captures up to 200 characters of context on either side.
    patterns = [
        r'.{,200}bitcoin.{,200}',
        r'.{,200}crypto.{,200}',
        r'.{,200}virtual currency.{,200}',
    ]
    search_flags = re.IGNORECASE | re.MULTILINE
    for url in urls:
        async with aiohttp.ClientSession() as session:
            html_text = await self.__fetch(session, url)
            text = html.unescape(StringExpert.strip_tags(html_text))
            for strip_text in strip_texts:
                text = text.replace(strip_text, '')
            for pattern in patterns:
                match = re.search(pattern, text, search_flags)
                if match is None:
                    continue
                warning = 'Found pattern "{}" at url "{}" in line: {}'.format(
                    pattern, url, match.group())
                Log.w(warning)
                return True
    return False
def __init__(self):
    """Set up the response directory path, the store, the subscribers and the parse helper."""
    super().__init__(__file__)
    Log.d('construct')
    self.dir_path = AppConfig.setting('DATA_RESPONSE_DIRPATH')
    store = Store()
    self.store = store
    subscribers = subscribe.all()
    self.subscribers = subscribers
    # The parse helper shares the same subscribers and store as this instance.
    self.parse_util = ParseUtil(subscribers, store)
def retrieve(db, url, datasource_id, exchange_id, currency_id):
    """Download a gzipped CSV of trades and persist every row as a transaction.

    The file is downloaded into the configured ``TEMP_DIRPATH`` under the
    last path segment of ``url``. Each CSV row is read as
    ``epoch_time, price, amount`` and handed to ``db.create_transaction``.
    Rows rejected with ``DuplicateInsertException`` are counted, not raised.

    The temporary file is removed whether or not processing succeeds, so a
    failed download or a malformed row does not leave it behind.

    Args:
        db: Object exposing ``create_transaction(dict)``.
        url: Location of the gzipped CSV file.
        datasource_id: Stored on every transaction.
        exchange_id: Stored on every transaction.
        currency_id: Stored on every transaction.
    """
    temp_dirpath = AppConfig.setting('TEMP_DIRPATH')
    filepath = os.path.join(temp_dirpath, url.split('/')[-1])
    duplicate_count = 0
    insert_count = 0
    try:
        downloadFile(url, filepath)
        with gzip.open(filepath, 'rt') as f:
            Log.d('Processing csv file..')
            for row in csv.reader(f, delimiter=',', quotechar='|'):
                epoch_time = int(row[0])
                price = float(row[1])
                amount = float(row[2])
                transaction = {
                    'datasource_id': datasource_id,
                    'exchange_id': exchange_id,
                    'amount': amount,
                    'price': price,
                    'currency_id': currency_id,
                    'epoch_time': epoch_time,
                }
                try:
                    db.create_transaction(transaction)
                except DuplicateInsertException:
                    duplicate_count += 1
                else:
                    insert_count += 1
    finally:
        # The download may have failed before the file was created, so only
        # remove it when it is actually there.
        if os.path.exists(filepath):
            os.remove(filepath)
    Log.i('Done processing, insert count: {}, duplicate count: {}', insert_count, duplicate_count)
def process_h5(self):
    """Read rows newer than the cached ones for every key in the h5 file.

    For each key the cached frame in ``self.job_frames`` is extended with
    the rows whose index exceeds the last cached index (further restricted
    by ``self.contraints_clause``), and ``self.row_handler`` is invoked
    with the key, the joined frame and the position of the first new row.
    Keys without new rows only produce a warning.
    """
    with pd.HDFStore(self.h5_filepath, mode='r') as h5:
        for jobuid in h5:
            is_first_encounter = jobuid not in self.job_frames
            if is_first_encounter:
                # fetch first row to get the first index/epoch
                self.job_frames[jobuid] = pd.read_hdf(h5, jobuid, start=0, stop=1)
            cached_df = self.job_frames[jobuid]
            # Querying relative to the last cached index ensures no rows are
            # 'missed' in case the handle count jumps more than once.
            latest_epoch = cached_df.index.values[-1]
            where_clause = 'index > {} {}'.format(latest_epoch, self.contraints_clause)
            fresh_df = pd.read_hdf(h5, jobuid, where=where_clause)
            if fresh_df.empty:
                Log.w('dataset was empty for key {} and index > {}', jobuid, latest_epoch)
                continue
            assert fresh_df.index.values[0] > latest_epoch
            # On first encounter the seed row is handed to the handler as well.
            new_first_index = 0 if is_first_encounter else len(cached_df)
            joined = pd.concat([cached_df, fresh_df])
            self.job_frames[jobuid] = joined
            if len(joined) > 100000:
                Log.w(
                    'holding a dataset of significant length ({} rows, {:.1f}mb): {}',
                    len(joined),
                    joined.memory_usage().sum() / 1_000_000,
                    jobuid)
            assert joined.shape[0] == len(cached_df) + len(fresh_df)
            # TODO: remove once this is confirmed
            self.ensure_strictly_increasing_index(joined)
            self.row_handler(jobuid, joined, new_first_index)
def __init__(self, min_predict_generator_size, max_train_size):
    """Store the size limits and start without a trained predictor.

    Args:
        min_predict_generator_size: Must be strictly smaller than
            ``max_train_size`` (asserted).
        max_train_size: Stored on the instance.
    """
    super().__init__(predict_col='feature_rtrspc()_next_trend_pricefeature')
    assert max_train_size > min_predict_generator_size
    self.min_predict_generator_size = min_predict_generator_size
    self.max_train_size = max_train_size
    # No predictor exists until one is assigned elsewhere.
    self.predictor = None
    Log.d('core count: {}', core_count)
async def subscribe(self):
    """Yield every text received from the socket subscription.

    Raises:
        Exception: Naming ``self.handler_filepath`` and chained to the
            original error, if the subscription fails.
    """
    try:
        async for received in self.__socket_subscribe():
            Log.t('received text: {}', received)
            yield received
    except Exception as cause:
        raise Exception(
            'Failed to subscribe for handler filepath {}'.format(self.handler_filepath)
        ) from cause
def filter_simulated_observations(self, df):
    """Return the rows of ``df`` whose ``is_simulated`` value is not 1.

    The number of removed rows is logged. It is computed from the row
    counts, so it stays correct even when index labels repeat; a lookup by
    index label would miss a removed row that shares its label with a
    retained one.

    Args:
        df: Frame with an ``is_simulated`` column.

    Returns:
        The filtered frame.
    """
    filtered_df = df[df['is_simulated'] != 1]
    dropped_count = len(df) - len(filtered_df)
    if dropped_count > 0:
        Log.w('filtered out {} simulated frames', dropped_count)
    else:
        Log.d('no simulated frames found to filter out')
    return filtered_df
def __parse_and_persist_as_transaction(row, parser, db):
    """Parse ``row`` and store the result as a transaction.

    Args:
        row: Raw input handed to ``parser.parse``.
        parser: Object exposing ``parse(row)``.
        db: Object exposing ``create_transaction(parsed)``.

    Returns:
        The parsed value, or None when the parser produced nothing (in
        which case nothing is stored).
    """
    parsed = parser.parse(row)
    if parsed is None:
        return None
    transaction_id = db.create_transaction(parsed)
    Log.t('persisted transaction id {}', transaction_id)
    return parsed
def __init__(self):
    """Load the email throttling window and reset the notification state."""
    super().__init__(__file__, isToNotifyStartup=False)
    recurrence_setting = AppConfig.setting('LOGWATCH_EMAIL_MAX_RECCURENCE_MINUTES')
    self.maxEmailReccurenceMinutes = float(recurrence_setting)
    self.triggerLines = ['ERROR', 'WARNING']
    # Logged before the counters below are set, so they are not in the output.
    Log.d('construct: {}', self.__dict__)
    self.matchCountSinceLastEmail = 0
    self.lastEmailDatetime = None
def tryAppNotifyByEmail(serviceName, messsage):
    """Email an alert to the configured address, if notifications are enabled.

    The configured ``ALERT_EMAIL`` is used as both sender and receiver, and
    the subject combines the local host name with ``serviceName``.

    Args:
        serviceName: Name included in the subject line.
        messsage: Body of the email.

    Returns:
        False when ``IS_EMAIL_NOTIFICATION_ENABLED`` is not '1'; otherwise
        whatever ``NetworkExpert.emailMaybe`` returns.
    """
    is_enabled = AppConfig.setting('IS_EMAIL_NOTIFICATION_ENABLED') == '1'
    if not is_enabled:
        Log.d('ignoring email request per configuration')
        return False
    address = AppConfig.setting('ALERT_EMAIL')
    subject = '*** {}: {} ***'.format(socket.gethostname(), serviceName)
    return NetworkExpert.emailMaybe(address, address, subject, messsage)
async def alert_continuously(self, alert_interval_seconds):
    """Run the alert check forever, pausing between rounds.

    A failing check is logged with its stack trace and does not stop the
    loop.

    Args:
        alert_interval_seconds: Pause between two consecutive checks.
    """
    while True:
        try:
            await self.check_for_alert_match()
        except Exception:
            Log.e('Failed to run alert check, stacktace:\n{}', OsExpert.stacktrace())
        await asyncio.sleep(alert_interval_seconds)
def activateSubscribers(self):
    """Process all subscribers concurrently on the event loop, then close the loop."""
    Log.i('activating {} subscriber(s)', len(self.subscribers))
    loop = asyncio.get_event_loop()
    pending = []
    for index, subscriber in enumerate(self.subscribers):
        pending.append(self.__process_subscriber(index, subscriber))
    # Blocks until every subscriber has finished.
    loop.run_until_complete(asyncio.gather(*pending))
    loop.close()
    Log.i('done processing subscribers')
def __init__(self):
    """Load the retry delay and response directory, create the directory, collect subscribers."""
    super().__init__(__file__)
    Log.d('construct')
    self.retry_delay_seconds = int(
        AppConfig.setting('DATAFETCH_API_RETRY_DELAY_SECONDS'))
    dirpath = AppConfig.setting('DATA_RESPONSE_DIRPATH')
    Log.d('data response dirpath is: {}', dirpath)
    self.data_response_dirpath = dirpath
    OsExpert.ensure_abs_dirpath_exists(dirpath)
    self.subscribers = subscribe.all()
def __init__(self, h5_filepath, row_handler, contraints_dict=None):
    """Initialise the watcher state for one h5 file.

    Args:
        h5_filepath: Path of the file to read.
        row_handler: Callback stored on the instance; must be truthy.
        contraints_dict: Optional mapping turned into ``and key=value``
            fragments (space-joined) and stored as ``self.contraints_clause``.
    """
    self.handle_event = Event()
    self.h5_filepath = h5_filepath
    self.handle_count = 0
    self.job_frames = {}
    self.last_handle_count = None
    self.row_handler = row_handler
    if contraints_dict is None:
        self.contraints_clause = ''
    else:
        fragments = [
            'and {}={}'.format(column, value)
            for column, value in contraints_dict.items()
        ]
        self.contraints_clause = ' '.join(fragments)
    Log.d('cc: {}', self.contraints_clause)
    assert row_handler
def downloadFile(url, filepath):
    """Stream the resource at ``url`` into the file at ``filepath``.

    Args:
        url: Location to download from.
        filepath: Destination path; opened for binary writing.

    Raises:
        ValueError: If ``url`` or ``filepath`` is None.
    """
    if url is None:
        raise ValueError('parameter "url" not specified')
    if filepath is None:
        raise ValueError('parameter "filepath" not specified')
    Log.d('Downloading to path {}: {}'.format(filepath, url))
    # stream=True defers fetching the body so it can be written chunk by
    # chunk; the context manager closes the response even if writing fails.
    with requests.get(url, stream=True) as response, open(filepath, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
def __predict(self, df):
    """Predict a label for a single-row frame using ``self.predictor``.

    The process exits once ``self.predict_count`` has reached the hard
    limit of 100 predictions.

    Args:
        df: Frame holding exactly one row (asserted).

    Returns:
        The label chosen by ``sagemaker_response_highest_score_label`` from
        the predictor's response.
    """
    prediction_limit = 100
    if self.predict_count >= prediction_limit:
        Log.w('too many predictions {} reached, exiting', self.predict_count)
        exit()
    assert len(df) == 1
    features, _ = self.frame_to_ml_inputs(df)
    predict_row = features.iloc[0]
    Log.d(
        'predicting based on {} values:\n{}',
        len(predict_row.values),
        predict_row.squeeze().sort_index())
    response = self.predictor.predict(predict_row.values)
    label = self.sagemaker_response_highest_score_label(response)
    self.predict_count += 1
    return label
def print_acc(self, df):
    """Log the accuracy of the ensemble prediction column against the true trend column.

    Rows where either of the two columns is missing are excluded from the
    score; the number of excluded rows is logged.

    Args:
        df: Frame containing both the prediction and the true-value column.
    """
    Log.d('begin acc calc ======')
    predicted_col = 'prediction_ensmbl_next_trend_feature'
    actual_col = 'feature_rtrspc()_next_trend_pricefeature'
    pairs = df[[predicted_col, actual_col]]
    complete = pairs.dropna(how='any')
    Log.d('acc source frame:\n{}', complete)
    Log.d(
        'dropped {}/{} rows where either the predictor or the true value was unspecified',
        len(pairs) - len(complete),
        len(pairs))
    score = accuracy_score(complete[actual_col], complete[predicted_col], normalize=True)
    Log.d('accuracy: {}', score)
    Log.d('===== end acc calc ')
def unparsed_datafetch_api_responses_frame(self, min_id=0, limit=100):
    """Query api responses that have no transaction with a matching md5 hash.

    Args:
        min_id: Lowest response id to include.
        limit: Maximum number of rows to return.

    Returns:
        The result of ``self.__query_frame`` for the generated statement,
        ordered by response id.
    """
    template = """
        SELECT {0}.*
        FROM {0}
        LEFT OUTER JOIN {1} ON {1}.source_md5hash = {0}.response_md5hash
        WHERE {1}.source_md5hash IS NULL
        AND {0}.id >= {2}
        ORDER BY {0}.id
        LIMIT {3}
        """
    sql = template.format('datafetch_api_response', 'transaction', min_id, limit)
    Log.d('executing:\n{}', sql)
    sys.stdout.flush()
    return self.__query_frame(sql)
def email_maybe(self, header, message):
    """Send a notification email unless one was sent too recently.

    When the previous email is younger than
    ``self.maxEmailReccurenceMinutes`` the request is dropped. Otherwise the
    timestamp and match counter are reset and the email is attempted.

    Args:
        header: Passed to ``NetworkExpert.tryAppNotifyByEmail`` as the service name.
        message: Passed on as the email body.
    """
    now = datetime.now()
    previous = self.lastEmailDatetime
    if previous is not None:
        elapsed_minutes = (now - previous).total_seconds() / 60.0
        if elapsed_minutes < self.maxEmailReccurenceMinutes:
            remaining = int(self.maxEmailReccurenceMinutes - elapsed_minutes)
            Log.d('Aborting email notification ({}+ minutes left in window)', remaining)
            return
    self.lastEmailDatetime = now
    self.matchCountSinceLastEmail = 0
    NetworkExpert.tryAppNotifyByEmail(header, message)
def parse_and_persist_as_transaction_maybe(datafetch_api_response, parser, db):
    """Parse and store one api response, tolerating duplicate inserts.

    Args:
        datafetch_api_response: The response to parse.
        parser: Parser handed on to the parse-and-persist helper.
        db: Database handed on to the parse-and-persist helper.

    Returns:
        True when the helper completed, False when the database rejected
        the transaction as a duplicate.

    Raises:
        Exception: Any other failure is logged and re-raised.
    """
    try:
        ParseUtil.__parse_and_persist_as_transaction(
            datafetch_api_response, parser, db)
    except DuplicateInsertException:
        Log.w('db rejected transaction as a duplicate: {}', datafetch_api_response)
        return False
    except Exception as err:
        Log.e(
            'Failed to parse and store transaction from api_response: {}',
            datafetch_api_response)
        raise err
    else:
        return True
def emailMaybe(sender, receiver, title, text, smtp_host=None, smtp_user=None, smtp_password=None, smtp_port=587):
    """Try to send an email and report success instead of raising.

    All arguments are forwarded unchanged to ``NetworkExpert.email``.

    Returns:
        True when the email was sent, False when sending failed (the
        failure is logged).
    """
    try:
        NetworkExpert.email(
            sender, receiver, title, text,
            smtp_host, smtp_user, smtp_password, smtp_port)
    except Exception:
        # NetworkExpert.email reports every failure as a plain Exception, so
        # that is the type that has to be caught for this wrapper to be
        # best-effort.
        Log.e('Failed to send email')
        return False
    return True
def frame(mode, filename, from_epoch, to_epoch, filterInNth, agents, format_as_image):
    """Render a plot, or a full info page, for the first key of an h5 file.

    Args:
        mode: ``'plot_only'`` returns just the plot html; anything else
            renders the ``frame.html`` template.
        filename: File name inside the configured ``GENERATOR_DATA_DIRPATH``.
        from_epoch: Start of the plotted range; defaults to one week before
            ``to_epoch`` when None.
        to_epoch: End of the plotted range.
        filterInNth: Forwarded to ``h5_to_plot``.
        agents: Forwarded to ``h5_to_plot``.
        format_as_image: Forwarded to ``h5_to_plot``.

    Returns:
        The plot html, or the rendered template.
    """
    filepath = os.path.join(AppConfig.setting('GENERATOR_DATA_DIRPATH'), filename)
    if from_epoch is None:
        one_week_seconds = 60 * 60 * 24 * 7
        from_epoch = to_epoch - one_week_seconds
    with pd.HDFStore(filepath, mode='r') as h5:
        key = h5.keys()[0]  # TODO: always select first?
        storer = h5.get_storer(key)
        row_count = storer.nrows
        Log.d(row_count)
        # Only the index is needed, hence the empty column selection.
        first_row = pd.read_hdf(h5, key, start=0, stop=1, columns=[])
        first_epoch = first_row.index.values[0]
        last_row = pd.read_hdf(
            h5, key, start=row_count - 1, stop=row_count, columns=[])
        last_epoch = last_row.index.values[0]
        column_names = list(storer.attrs.data_columns)
        plot_html = h5_to_plot(
            h5, from_epoch, to_epoch, filterInNth, agents, format_as_image)
    if mode == 'plot_only':
        return plot_html
    # Second underscore-separated segment of every 'feature_' column.
    feature_columns = {
        name.split('_')[1]
        for name in column_names
        if name.startswith('feature_')
    }
    # Group those segments by the part before the first '('.
    agent_map = {}
    for column in feature_columns:
        feature_name = column.split('(')[0]
        agent_map[feature_name] = [
            c for c in feature_columns if c.startswith(feature_name)
        ]
    frame_info = {'row count': row_count, 'columns': column_names}
    return render_template(
        'frame.html',
        style=style,
        plothtml=plot_html,
        filename=filename,
        from_epoch=from_epoch,
        to_epoch=to_epoch,
        first_epoch=first_epoch,
        last_epoch=last_epoch,
        min_epoch=1514764800,  # min epoch is 2018
        max_epoch=int(time.time()),
        agent_map=sorted(agent_map.items()),
        job_uid=key,
        frame_info_html=json2html.convert(json=frame_info))
def process_nonparsed_api_responses_full(self, sleep_seconds=0):
    """Parse api responses subset by subset until the minimum id stops advancing.

    Args:
        sleep_seconds: Pause after each processed subset.

    Raises:
        Exception: With message 'Failed to process nonparsed api responses',
            chained to the underlying error.
    """
    Log.i(
        'initiating continuous parsing of api responses with subset sleep interval: {} seconds',
        sleep_seconds)
    try:
        previous_min_id = -1
        next_min_id = 0
        # Stop once a subset run no longer returns a larger id.
        while next_min_id > previous_min_id:
            previous_min_id = next_min_id
            next_min_id = self.process_nonparsed_api_responses_subset(
                next_min_id=previous_min_id)
            time.sleep(sleep_seconds)
    except Exception as cause:
        raise Exception('Failed to process nonparsed api responses') from cause
    Log.d(
        'no more api responses to parse, transaction count is now {}',
        self.store.transaction_count())
def handle_job_epoch(self, jobuid, df, start_index):
    """Apply the predicted BUY/SELL actions of the new rows to the simulated portfolio.

    Rows of ``df`` from position ``start_index`` onward are walked in order.
    For each row with a non-NaN ``PREDICT_ACTION`` value, ``self.capital``
    and ``self.coins`` are adjusted at the row's ``close`` price and the fee
    is booked through ``self.pay_fee``. Progress is written with ``print``.

    Args:
        jobuid: Key of the job; only
            '/bitcoinaverage_multiple_global_ETH_USD_900' is accepted.
        df: Frame holding all rows of the job, including earlier ones.
        start_index: Position of the first row not handled before.

    Raises:
        Exception: With message 'Failed to execute on new job epoch',
            chained to any error (including failed assertions) raised while
            processing the rows.
    """
    trade_fee = float64(.25 / 100)  # 0.25 percent expressed as a fraction
    # Part of the capital that is left out of the BUY amount computation.
    min_capital = self.initial_capital * trade_fee * 10
    print(start_index)
    print(len(df))
    try:
        assert jobuid == '/bitcoinaverage_multiple_global_ETH_USD_900', 'unexpected job id'
        new_df = df[start_index:]
        for epoch, row in new_df.iterrows():
            action = row[PREDICT_ACTION]
            coin_price = row['close']
            # Remember the portfolio value at the first price ever seen.
            if self.start_value is None:
                self.start_value = self.current_value(coin_price)
            if not isnan(action):
                print('coin price ', coin_price, ', capital ', self.capital)
                if action == FeatureValue.BUY:
                    # Spend the capital above min_capital, scaled down by the fee rate.
                    coin_transaction_count = (1 - trade_fee) * (self.capital - min_capital) / coin_price
                    if coin_transaction_count > 0:
                        print('BUYING coins: ', coin_transaction_count)
                        cost = coin_transaction_count * coin_price
                        fee = cost * trade_fee
                        assert self.capital >= cost + fee, '{} >= {} + {} = {}'.format(self.capital, cost, fee, cost + fee)
                        self.capital -= cost
                        self.coins += coin_transaction_count
                        self.pay_fee(cost)
                elif action == FeatureValue.SELL:
                    # The fee is capped at the available capital, and the number
                    # of coins to sell is derived back from that capped fee.
                    fee = min(self.coins * coin_price * trade_fee, self.capital)
                    coin_transaction_count = fee / (coin_price * trade_fee)
                    if coin_transaction_count > 0 and self.coins >= coin_transaction_count:
                        print('SELLING coins: {}'.format(coin_transaction_count))
                        gain = coin_transaction_count * coin_price
                        self.capital += gain
                        self.coins -= coin_transaction_count
                        self.pay_fee(gain)
                    else:
                        Log.d('NOT ENOUGH COINS TO SELL! {} at {}', coin_transaction_count, fee)
            # NOTE(review): net_worth is computed but never used in this method.
            net_worth = self.current_value(coin_price)
    except Exception as e:
        raise Exception('Failed to execute on new job epoch') from e
    print(len(df))
    print(df[PREDICT_ACTION].value_counts())
    print('done')
    sys.stdout.flush()
def __init__(self, version):
    """Load the generator configuration and build the list of generator jobs.

    Args:
        version: Stored on the instance as ``self.version``.
    """
    super().__init__(__file__)
    fifteen_minutes_seconds = 15 * 60
    self.window_size = 15
    self.interval_seconds = [fifteen_minutes_seconds]
    self.contruct_time = time.time()
    self.version = version
    # must be low enough to produce empty result set eventually > reaktime
    self.sleep_seconds = 1
    min_timestamp_setting = AppConfig.setting('GENERATOR_TRANSACTION_MIN_TIMESTAMP')
    self.transaction_min_timestamp = int(min_timestamp_setting)
    self.data_dirpath = AppConfig.setting('GENERATOR_DATA_DIRPATH')
    # Logged before the attributes below are set, so they are not in the output.
    Log.d('construct: {}', self.__dict__)
    self.db = DatabaseGateway()
    max_history_minutes = 10 * 24 * 60  # ten days
    self.from_currency_ids = []
    self.to_currency_ids = []
    run_config = self.read_run_config()
    self.run_config = run_config
    self.jobs = list(self.__jobs_iterate(max_history_minutes, run_config))
    Log.i('count of generator jobs: {}', len(self.jobs))
def reset(self, epoch_time):
    """Clear the observation state and align the interval that contains ``epoch_time``.

    The interval start is ``epoch_time`` rounded down to a multiple of
    ``self.interval_second``; the end is one interval later.

    Args:
        epoch_time: Epoch timestamp falling inside the interval to start.
    """
    self.low = self.high = self.open = self.close = self.latest = None
    self.is_opening = self.is_closing = False
    self.observation_count = 0
    step = self.interval_second
    # find the 'time passed' within the interval
    offset_in_interval = epoch_time % step
    self.interval_start_epoch = epoch_time - offset_in_interval
    self.interval_end_epoch = self.interval_start_epoch + step
    interval = self.interval_start_epoch / step
    assert interval % 1 == 0, 'interval index {} is not an integer'.format(interval)
    self.interval_index = int(interval)
    Log.t(
        self.interval_index,
        datetime.utcfromtimestamp(self.interval_start_epoch),
        datetime.utcfromtimestamp(self.interval_end_epoch))
def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
    """Upload a local file to the S3 bucket under ``<channel>/<file name>``.

    From SM examples. Like here:
    https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb

    Args:
        channel: Key prefix the file is stored under.
        filepath: Path of the local file to upload.
        skip_if_name_and_size_matches: When exactly True and the key already
            exists with the same byte size as the local file, nothing is
            uploaded.
    """
    file = Path(filepath)
    s3 = boto3.resource('s3')
    key = channel + '/' + file.name
    bucket_ref = s3.Bucket(bucket)
    # The prefix filter can also return longer keys, so the first hit must
    # equal the key exactly.
    candidates = list(bucket_ref.objects.filter(Prefix=key))
    key_exists = len(candidates) > 0 and candidates[0].key == key
    if key_exists:
        if skip_if_name_and_size_matches is True:
            head = boto3.client('s3').head_object(Bucket=bucket, Key=key)
            local_size = file.stat().st_size
            if head['ContentLength'] == local_size:
                Log.w(
                    'skipping upload as s3 key of same size ({:.2f}kb) already exists: {}',
                    local_size/1000,
                    key)
                return
        Log.w('overwriting existing s3 key: {}', key)
    with open(filepath, "rb") as data:
        s3.Bucket(bucket).put_object(Key=key, Body=data)