Example #1
 def detect_and_parse_new_disk_files_async(self):
     Log.i('asynchronously detecting and parsing new disk files')
     event_handler = DirWatcher(self.handle_file_created)
     self.observer = Observer()
     self.observer.schedule(event_handler, self.dir_path, recursive=False)
     self.observer.start()
     return self.observer
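The DirWatcher class is not part of this snippet; a minimal sketch of what it might look like, assuming it wraps watchdog's FileSystemEventHandler (the same library that provides Observer) and forwards the paths of newly created files to the supplied callback:

from watchdog.events import FileSystemEventHandler

class DirWatcher(FileSystemEventHandler):
    """Forwards the path of each newly created file to a callback."""

    def __init__(self, on_file_created):
        self.on_file_created = on_file_created

    def on_created(self, event):
        # watchdog also reports new directories; only forward regular files
        if not event.is_directory:
            self.on_file_created(event.src_path)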
Example #2
def retrieve(db, url, datasource_id, exchange_id, currency_id):
	temp_dirpath=AppConfig.setting('TEMP_DIRPATH')
	filepath = os.path.join(temp_dirpath, url.split('/')[-1])
	downloadFile(url, filepath)
	duplicateCount = 0
	insertCount = 0
	with gzip.open(filepath, 'rt') as f:
		Log.d('Processing csv file..')
		spamreader = csv.reader(f, delimiter=',', quotechar='|')
		for row in spamreader:
			timeStr = row[0]
			epochTime = int(timeStr)
			priceStr = row[1]
			price = float(priceStr)
			amountStr = row[2]
			amount = float(amountStr)
			transaction = {
				'datasource_id': datasource_id,
				'exchange_id': exchange_id,
				'amount': amount,
				'price': price,
				'currency_id': currency_id,
				'epoch_time': epochTime,
			}
			try:
				db.create_transaction(transaction)
				insertCount += 1
			except DuplicateInsertException as e:
				duplicateCount += 1
	os.remove(filepath)
	Log.i('Done processing, insert count: {}, duplicate count: {}', insertCount, duplicateCount)
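downloadFile is an assumed helper in this example; a minimal sketch of one possible implementation, using requests to stream the archive to disk (function name and chunk size are illustrative):

import requests

def downloadFile(url, filepath, chunk_size=1 << 20):
    # Stream the response so large gzip dumps never have to fit in memory.
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)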
Example #3
	async def __process_subscriber(self, index, subscriber):
		fail_count = 0
		response_file_prefix = subscriber.handler_filename
		while True:
			try:
				Log.i('invoking subscriber {}', subscriber.handler_filename)				
				async for response_text in subscriber.subscribe():
					response_text_md5hash = StringExpert.md5hash(response_text)					
					try:
						epoch = int(time.time())
						filepath = os.path.join(
							self.data_response_dirpath,
							'{}.{}.{}'.format(response_file_prefix, epoch, FetchApp.RESPONSE_EXTENSION)
							)
						with open(filepath, 'w') as file:
							file.write(response_text)
					except Exception as e:
						Log.e('Failed to save response to file, message: {}', e)
					Log.d('stored api response for subscriber {} (hash {})', subscriber.handler_filename, response_text_md5hash)
			except Exception as e:
				fail_count += 1
				Log.e('failed to invoke subscriber {} ({} failures so far)', subscriber.handler_filename, fail_count)
				stacktrace = traceback.format_exc()
				Log.d('exception stack:\n{}', stacktrace)
				Log.i('retrying in {} seconds..', self.retry_delay_seconds)
				await asyncio.sleep(self.retry_delay_seconds)
Example #4
	def activateSubscribers(self):
		subscriber_count = len(self.subscribers)
		Log.i('activating {} subscriber(s)', subscriber_count)
		loop = asyncio.get_event_loop()
		futures = [self.__process_subscriber(i, s) for i,s in enumerate(self.subscribers)]
		tasks = asyncio.gather(*futures)
		loop.run_until_complete(tasks)
		loop.close()
		Log.i('done processing subscribers')
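The gather/run_until_complete/close combination ties this method to an event loop it manages itself; on Python 3.7+ the same fan-out could be expressed with asyncio.run, a sketch under that assumption:

	def activateSubscribers(self):
		Log.i('activating {} subscriber(s)', len(self.subscribers))

		async def run_all():
			await asyncio.gather(*(
				self.__process_subscriber(i, s)
				for i, s in enumerate(self.subscribers)))

		asyncio.run(run_all())  # creates, runs and closes the loop
		Log.i('done processing subscribers')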
Example #5
 def feed_jobs_forever(self, job_changed_handler):
     assert job_changed_handler is not None
     sleep_seconds = self.sleep_seconds
     transaction_min_timestamp = self.transaction_min_timestamp
     start_transaction_min_timestamp = transaction_min_timestamp
     data_dirpath = self.data_dirpath
     start_time = time.time()
     Log.i(
         'processing transactions, sleep interval {}s, starting from epoch {} ({})',
         sleep_seconds, transaction_min_timestamp,
         StringExpert.format_timestamp(transaction_min_timestamp))
     to_fetch_count = self.db.transaction_count(transaction_min_timestamp)
     Log.d('transaction count since {} ({}): {}', transaction_min_timestamp,
           StringExpert.format_timestamp(transaction_min_timestamp),
           to_fetch_count)
     pd.set_option('io.hdf.default_format', 'table')
     hdf5_filename = '{}_{}_{}.h5'.format(
         self.version.major, self.version.minor,
         datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S'))
     hdf5_filepath = path.join(data_dirpath, hdf5_filename)
     Log.i('hdf5 output filepath is: \n{}', hdf5_filepath)
     set_size = 1000
     fetch_count = 0
     plot_time = time.time()
     is_realtime = False
     while True:
         try:
             next_transaction_min_timestamp = self.process_transaction_subset(
                 transaction_min_timestamp, set_size, hdf5_filepath,
                 job_changed_handler, is_realtime)
             if next_transaction_min_timestamp is None:
                 Log.d('nothing to process, waiting..')
                 is_realtime = True  # TODO: empty polling perhaps not the best indicator of switch to realtime
                 time.sleep(sleep_seconds)
             else:
                 assert next_transaction_min_timestamp > transaction_min_timestamp, 'next minimum timestamp was not greater than the current timestamp'
                 transaction_min_timestamp = next_transaction_min_timestamp
                 fetch_count += set_size
                 percentage = 100 * fetch_count / to_fetch_count
                 current_time = time.time()
                 Log.d(
                     'processed {}/{}, {}%, spent {} on the period {} ({}) to {} ({})',
                     fetch_count, to_fetch_count, int(percentage),
                     Timespan.from_seconds(int(current_time -
                                               start_time)).as_string(),
                     StringExpert.format_timestamp(
                         start_transaction_min_timestamp),
                     start_transaction_min_timestamp,
                     StringExpert.format_timestamp(
                         transaction_min_timestamp),
                     transaction_min_timestamp)
         except Exception as e:
             raise Exception(
                 'Failed to process transaction subset') from e
     Log.w('all {} rows read, but should loop forever', fetch_count)
Example #6
 def watch_continuously(self, watch_interval_seconds):
     Log.i('continuous watching activated with interval of {} seconds',
           watch_interval_seconds)
     consecutive_error_count = 0
     while True:
         try:
             self.__verify_datafetch_apis_write_frequency()
             consecutive_error_count = 0
         except Exception as e:
             consecutive_error_count += 1
             Log.e('fail during watcher check ({} consecutive errors)',
                   consecutive_error_count)
             stacktrace = OsExpert.stacktrace()
             Log.d('stacktrace:\n{}', stacktrace)
         time.sleep(watch_interval_seconds)
Example #7
	def create_predictor_from_csv(self):
			Log.i('initiating sagemaker model creation')
			role = AppConfig.setting('AWS_PREDICTOR_ROLE')
			bucket='cryptrade-sagemaker'
			custom_code_upload_location = 's3://{}/customcode/tensorflow_iris'.format(bucket)
			model_artifacts_location = 's3://{}/artifacts'.format(bucket)
			Log.d('training data will be uploaded to: {}', custom_code_upload_location)
			Log.d('training artifacts will be uploaded to: {}', model_artifacts_location)
			sess = sagemaker.Session()
			def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
				"""From SM examples. Like here: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb"""
				file = Path(filepath)
				s3 = boto3.resource('s3')
				key = channel + '/' + file.name
				bucket_ref = s3.Bucket(bucket)
				objs = list(bucket_ref.objects.filter(Prefix=key))
				is_file_already_existing = len(objs) > 0 and objs[0].key == key
				if is_file_already_existing:
					if skip_if_name_and_size_matches:
						s3_client = boto3.client('s3')
						response = s3_client.head_object(Bucket=bucket, Key=key)
						local_size = file.stat().st_size
						remote_size = response['ContentLength']
						if remote_size == local_size:
							Log.w('skipping upload as s3 key of same size ({:.2f}kb) already exists: {}', local_size/1000, key)
							return
					Log.w('overwriting existing s3 key: {}', key)
				with open(filepath, "rb") as data:
					s3.Bucket(bucket).put_object(Key=key, Body=data)
			s3_data_folder = 'data'
			upload_to_s3(s3_data_folder, self.train_filepath, True)
			upload_to_s3(s3_data_folder, self.test_filepath, True)
			upload_to_s3(s3_data_folder, self.meta_filepath)
			estimator = TensorFlow(
				entry_point='aws_dnn_predictor_entry.py',
				role=role,
				output_path=model_artifacts_location,
				code_location=custom_code_upload_location,
				train_instance_count=1,
				train_instance_type='ml.c5.xlarge',
				training_steps=1000,
				evaluation_steps=100
				)
			train_data_location = 's3://{}/{}'.format(bucket, s3_data_folder)
			Log.i('fitting train data: {}', train_data_location)
			estimator.fit(train_data_location)
			Log.i('deploying model')
			deploy_start = datetime.now()
			predictor = estimator.deploy(
				initial_instance_count=1,
				instance_type='ml.t2.medium')
			deploy_end = datetime.now()
			Log.i('deployed predictor in {}, endpoint is:\n{}', deploy_end - deploy_start, predictor.endpoint)
			self.predictor = predictor
Example #8
 def process_nonparsed_api_responses_full(self, sleep_seconds=0):
     Log.i(
         'initiating continuous parsing of api responses with subset sleep interval: {} seconds',
         sleep_seconds)
     try:
         min_id = -1
         next_min_id = 0
         while next_min_id > min_id:
             min_id = next_min_id
             parse_count = 0
             next_min_id = self.process_nonparsed_api_responses_subset(
                 next_min_id=min_id)
             time.sleep(sleep_seconds)
     except Exception as e:
         raise Exception('Failed to process nonparsed api responses') from e
     transaction_count = self.store.transaction_count()
     Log.d('no more api responses to parse, transaction count is now {}',
           transaction_count)
Example #9
 def __init__(self, version):
     super().__init__(__file__)
     self.window_size = 15
     self.interval_seconds = [15 * 60]  # 15 minutes
     self.contruct_time = time.time()
     self.version = version
     self.sleep_seconds = 1  # must be low enough to eventually produce an empty result set (switch to realtime)
     self.transaction_min_timestamp = int(
         AppConfig.setting('GENERATOR_TRANSACTION_MIN_TIMESTAMP'))
     self.data_dirpath = AppConfig.setting('GENERATOR_DATA_DIRPATH')
     Log.d('construct: {}', self.__dict__)
     self.db = DatabaseGateway()
     max_history_minutes = 10 * 24 * 60  #max(self.minute_intervals)
     self.from_currency_ids = []
     self.to_currency_ids = []
     self.run_config = self.read_run_config()
     self.jobs = list(
         self.__jobs_iterate(max_history_minutes, self.run_config))
     Log.i('count of generator jobs: {}', len(self.jobs))
Example #10
 def run(self):
     emailHeader = 'LogWatchPipeApp input trigger match'
     while True:
         sys.stdout.flush()
         try:
             line = sys.stdin.readline()
         except KeyboardInterrupt:
             break
         if not line:
             break
         sys.stdout.write(line)
         for triggerLine in self.triggerLines:
             if triggerLine in line:
                 self.matchCountSinceLastEmail += 1
                 Log.i('Log watch triggered, will send email')
                 msg = 'The following line matched a trigger:\n\n{}\n\nMatches since last email attempt: {}\n\nNo more matches will be reported for {} minutes'.format(
                     line, self.matchCountSinceLastEmail,
                     self.maxEmailReccurenceMinutes)
                 Thread(target=self.email_maybe,
                        args=(emailHeader, msg)).start()
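email_maybe is not shown; presumably it rate-limits mail by maxEmailReccurenceMinutes. A purely hypothetical sketch of such a throttle (it assumes datetime is imported as in the other examples, and the lastEmailTime field and emailClient helper are assumptions, not from the original):

 def email_maybe(self, header, message):
     # Hypothetical throttle: drop the email if one was sent too recently.
     now = datetime.now()
     if self.lastEmailTime is not None:
         minutes_since_last = (now - self.lastEmailTime).total_seconds() / 60
         if minutes_since_last < self.maxEmailReccurenceMinutes:
             Log.i('suppressing email, only {:.1f} minutes since the last one', minutes_since_last)
             return
     self.emailClient.send(header, message)  # assumed mail-sending helper
     self.lastEmailTime = now
     self.matchCountSinceLastEmail = 0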
Example #11
	def process(self, epoch, df):
		if df.empty:
			Log.d('skipping processing of empty dataset')
			return
		r_index = df.index.get_loc(epoch)
		if self.predictor is not None:	
			row_frame = df[r_index:r_index + 1]
			return self.__predict(row_frame)
		not_enough_predictor_data = r_index + 1 < self.min_predict_generator_size
		if not_enough_predictor_data:
			return
		Log.d('initiating predictor construction at index {}, frame length {}', r_index, len(df))
		predictor = self.predictor_from_config_maybe()
		if predictor is not None:
			self.predictor = predictor
			Log.i('existing predictor endpoint loaded: {}', predictor.endpoint)
			return
		train_df = df[:r_index + 1]
		Log.i('at index {}, detected data of adequate length {}, writing csv: {}', r_index, len(train_df), self.csv_filepath)
		self.write_csv(train_df)
		return None
Example #12
	def __init__(self, h5_filepath, version):
		warnings.simplefilter('ignore', NaturalNameWarning)
		h5_inputfile = Path(h5_filepath)
		output_dirpath = AppConfig.setting('PREDICTOR_DATA_DIRPATH')
		self.h5_out_filepath = os.path.join(output_dirpath, h5_inputfile.name)
		h5_out_file =  Path(self.h5_out_filepath)
		if h5_out_file.exists():
			Log.i('overwrite file?: {}', h5_out_file)
			if not OsExpert.prompt_confirm('File already exists, overwrite? {}'.format(h5_out_file)):
				Log.d('user aborted, exiting')
				exit()
			Log.w('removing file: {}', h5_out_file)
			os.remove(self.h5_out_filepath)
		self.predictors_map = {}
		base_filepath = output_dirpath
		with pd.HDFStore(h5_filepath, mode='r') as h5: 	
			keys = h5.keys()
			Log.i('h5 input keys: {}', keys)
			assert len(keys) == 1, 'hardcoded restriction on single key was violated'
			for key in keys:
				Log.i('row count for {}: {}', key, h5.get_storer(key).nrows)
				self.predictors_map[key] = [
					EnsemblePredictor(min_predict_generator_size=2000, max_train_size=5000)
				]
		self.h5_watcher = H5FileWatcher(h5_filepath, self.handle_job_epoch, {'is_simulated': 0})
Example #13
 def process_nonparsed_api_responses_subset(self, next_min_id=0):
     limit = 1000
     Log.i(
         'processing nonparsed api responses, starting from id {} with limit {}',
         next_min_id, limit)
     total_row_count = 0
     parse_count = 0
     is_to_keep_fetching = True
     while is_to_keep_fetching:
         datasources_frame = self.store.datasources_frame()
         frame = self.store.unparsed_datafetch_api_responses_frame(
             min_id=next_min_id, limit=limit)
         row_count = frame.shape[0]
         if row_count == 0:
             is_to_keep_fetching = False
         else:
             total_row_count += row_count
             for i, row in frame.iterrows():
                 try:
                     row_id = row['id']
                     datasource_id = row['datasource_id']
                     parser = self.find_parser(datasource_id,
                                               datasources_frame)
                     if ParseUtil.parse_and_persist_as_transaction_maybe(
                             row, parser, self.store):
                         parse_count += 1
                 except Exception as e:
                     raise Exception(
                         'Failed to parse row index {} with id {}'.format(
                             i, row_id)) from e
             ids = frame['id']
             max_id = ids.max()
             Log.t('sweep of ids {}..{} returned {} entries', next_min_id,
                   max_id, row_count)
             next_min_id = max_id + 1  # start from the next row
     Log.i('search for nonparsed responses done, parse count: {}/{}',
           parse_count, total_row_count)
     return next_min_id
Example #14
	def write_csv(self, df):
		if self.write_count > 0:
			Log.w('ignoring csv write because it has already been performed')
			return
		X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True)
		assert len(X_all) == len(y_all)
		if X_all.empty:
			Log.w('no rows to write!')
			return
		y_null_count = y_all.isnull().sum()
		assert y_null_count == 0, 'null count: {}'.format(y_null_count)
		X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=123)
		Log.d('X train shape: {}, X test shape: {}', X_train.shape, X_test.shape)
		train = pd.concat([X_train, y_train], axis=1)
		test = pd.concat([X_test, y_test], axis=1)
		is_first_write = (self.write_count == 0)
		for frame, filepath in ((train, self.train_filepath), (test, self.test_filepath)):
			Log.d('writing csv: {}', filepath)
			frame.to_csv(filepath, sep=',', na_rep='', index=False, header=is_first_write, decimal='.', mode='a')
		with open(self.meta_filepath, 'w') as f:
			f.write(json.dumps(
				{
					'train_filename': Path(self.train_filepath).name,
					'test_filename': Path(self.test_filepath).name,
					'train_observation_count': len(X_train),
					'test_observation_count': len(X_test),
					'feature_count': X_all.shape[1]
				},
				indent=4#, sort_keys=True
				))
		self.write_count += 1
		Log.i('done writing csv file, write count is now: {}', self.write_count)
		if self.is_train_async is True:
			Log.d('propagating notification that csv has been written')
			self.csv_changed_event.set()
		else:
			self.create_predictor_from_csv()
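In the asynchronous branch, csv_changed_event presumably wakes a separate training worker. A hypothetical sketch of such a consumer, assuming csv_changed_event is a threading.Event waited on in a dedicated thread:

	def train_worker(self):
		# Hypothetical consumer: wait for write_csv to signal, then build the predictor.
		while True:
			self.csv_changed_event.wait()
			self.csv_changed_event.clear()
			Log.i('csv written, initiating predictor creation')
			self.create_predictor_from_csv()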
Example #15
db = DatabaseGateway()
currencies = db.currencies_frame()
datasources = db.datasources_frame()
for i, job in enumerate(jobs):
	url = job['url']
	Log.i('Processing job {}/{}', i + 1, len(jobs))
	start_time = datetime.datetime.now()
	datasource_id = db.datasource_id_by_name(job['datasource_name'])
	exchange_id = db.exchange_id_by_name(job['provider_name'])
	currency_id = db.currency_id_by_code(job['currency_code'])
	retrieve(
		db,
		url,
		datasource_id,
		exchange_id,
		currency_id
		)
	time_spent = datetime.datetime.now() - start_time
	Log.i('Done with job, time spent: {}', time_spent)
Example #16
	def run(self, alert_interval_seconds):
		Log.i('Check interval is: {} seconds', alert_interval_seconds)
		loop = asyncio.get_event_loop()
		loop.run_until_complete(
			self.alert_continuously(alert_interval_seconds)
			)
Example #17
 def initialize(filepath):
     AppConfig.__ensure_config_filepath_valid(filepath)
     AppConfig.Filepath = filepath
     startup_message = 'Configuration filepath: {}'.format(
         AppConfig.Filepath)
     Log.i(startup_message)
Example #18
 def handle_file_created(self, filepath):
     filename = os.path.basename(filepath)
     subscriber = self.parse_util.subscriber_by_filename(filename)
     is_parsed = self.parse_util.process_api_response_file(
         filepath, subscriber)
     Log.i('file {} was parsed: {}', filepath, is_parsed)
Example #19
	def __create_predictor(self, df):
		Log.i('creating predictor on {} rows', len(df))
		assert not df.empty
		kfold = StratifiedKFold(n_splits=10)
		random_state = 2
		classifiers = []
		classifiers.append(SVC(random_state=random_state))
		classifiers.append(DecisionTreeClassifier(random_state=random_state))
		classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1))
		classifiers.append(RandomForestClassifier(random_state=random_state))
		classifiers.append(ExtraTreesClassifier(random_state=random_state))
		classifiers.append(GradientBoostingClassifier(random_state=random_state))
		classifiers.append(MLPClassifier(random_state=random_state))
		classifiers.append(KNeighborsClassifier())
		classifiers.append(LogisticRegression(random_state = random_state))
		classifiers.append(LinearDiscriminantAnalysis())
		X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True, max_train_size=self.max_train_size)
		if X_all.empty:
			Log.w('could not create predictor as the preprocessing resulted in an empty dataframe')
			return
		X_train, X_test, Y_train, Y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=random_state)
		Log.d('train shape: X: {}, y: {}', X_train.shape, Y_train.shape)
		cv_results = []
		for classifier in classifiers:
			Log.d('performing cross val score for predictor {}', classifier)
			start_time = datetime.now()
			cv_results.append(
				cross_val_score(classifier, X_train, y = Y_train, scoring = 'accuracy', cv = kfold, n_jobs=core_count)
			)
			Log.d('..done, time spent: {}', datetime.now() - start_time)
		cv_means = []
		cv_std = []
		for cv_result in cv_results:
			cv_means.append(cv_result.mean())
			cv_std.append(cv_result.std())
		cv_res = pd.DataFrame({
			'CrossValMeans': cv_means,
			'CrossValerrors': cv_std,
			'Algorithm': [
				'SVC',
				'DecisionTree',
				'AdaBoost',
				'RandomForest',
				'ExtraTrees',
				'GradientBoosting',
				'MultipleLayerPerceptron',
				'KNeighbors',
				'LogisticRegression',
				'LinearDiscriminantAnalysis'
				]})
		Log.d('cross val results:\n{}', cv_res)
		g = sns.barplot('CrossValMeans','Algorithm',data = cv_res, palette='Set3',orient = 'h',**{'xerr':cv_std})
		g.set_xlabel('Mean Accuracy')
		g = g.set_title('Cross validation scores')
		Log.i('saving plot..')
		plt.savefig('!eb1_cross_val_score.png', edgecolor='none', format="png") 
		DTC = DecisionTreeClassifier()
		adaDTC = AdaBoostClassifier(DTC, random_state=7)
		ada_param_grid = {'base_estimator__criterion' : ['gini', 'entropy'],
					  'base_estimator__splitter' :   ['best', 'random'],
					  'algorithm' : ['SAMME','SAMME.R'],
					  'n_estimators' :[1,2],
					  'learning_rate':  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}
		gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsadaDTC.fit(X_train,Y_train)
		ada_best = gsadaDTC.best_estimator_
		Log.d('gsadaDTC.best_score_: {}', gsadaDTC.best_score_)
		ExtC = ExtraTreesClassifier()
		ex_param_grid = {'max_depth': [None],
					  'max_features': [1, 3, 10],
					  'min_samples_split': [2, 3, 10],
					  'min_samples_leaf': [1, 3, 10],
					  'bootstrap': [False],
					  'n_estimators' :[100,300],
					  'criterion': ['gini']}
		gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsExtC.fit(X_train,Y_train)
		ExtC_best = gsExtC.best_estimator_
		Log.d('gsExtC.best_score_: {}', gsExtC.best_score_)
		RFC = RandomForestClassifier()
		rf_param_grid = {'max_depth': [None],
					  'max_features': [1, 3, 10],
					  'min_samples_split': [2, 3, 10],
					  'min_samples_leaf': [1, 3, 10],
					  'bootstrap': [False],
					  'n_estimators' :[100,300],
					  'criterion': ['gini']}
		gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsRFC.fit(X_train,Y_train)
		RFC_best = gsRFC.best_estimator_
		Log.d('gsRFC.best_score_: {}', gsRFC.best_score_)
		GBC = GradientBoostingClassifier()
		gb_param_grid = {'loss' : ['deviance'],
					  'n_estimators' : [100,200,300],
					  'learning_rate': [0.1, 0.05, 0.01],
					  'max_depth': [4, 8],
					  'min_samples_leaf': [100,150],
					  'max_features': [0.3, 0.1] 
					  }
		gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsGBC.fit(X_train,Y_train)
		GBC_best = gsGBC.best_estimator_
		Log.d('gsGBC.best_score_: {}', gsGBC.best_score_)
		SVMC = SVC(probability=True)
		svc_param_grid = {'kernel': ['rbf'], 
						  'gamma': [ 0.001, 0.01, 0.1, 1],
						  'C': [1, 10, 50, 100,200,300, 1000]}
		gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsSVMC.fit(X_train,Y_train)
		SVMC_best = gsSVMC.best_estimator_
		Log.d('gsSVMC.best_score_: {}', gsSVMC.best_score_)
		Log.w('quitting')
		exit()
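The method quits right after the grid searches, before the tuned estimators are combined. Since the surrounding class is an ensemble predictor, a hedged sketch of how the best estimators could be combined with scikit-learn's VotingClassifier (not part of the original code; estimator names are illustrative):

from sklearn.ensemble import VotingClassifier

# Sketch only: soft-voting ensemble over the grid-searched best estimators.
voting = VotingClassifier(
    estimators=[
        ('adaboost', ada_best),
        ('extra_trees', ExtC_best),
        ('random_forest', RFC_best),
        ('gradient_boosting', GBC_best),
        ('svc', SVMC_best),
    ],
    voting='soft',
    n_jobs=core_count)
voting.fit(X_train, Y_train)
Log.d('ensemble test accuracy: {}', voting.score(X_test, Y_test))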
Example #20
    def process_transaction_subset(self, transaction_min_timestamp, set_size,
                                   hdf5_filepath, job_changed_handler,
                                   is_realtime):
        assert job_changed_handler is not None, 'no job_changed_handler provided'
        window_size = 10
        subset_process_start_time = time.time()
        frame = self.db.transaction_by_timestamp_frame(
            transaction_min_timestamp, set_size, self.from_currency_ids,
            self.to_currency_ids)
        frame.set_index('epoch_time', inplace=True)
        row_count = frame.shape[0]
        Log.d('...time spent fetching subset ({} rows) from db: {:.2f}s',
              row_count,
              time.time() - subset_process_start_time)
        if row_count == 0:
            return None
        row_process_count = 0
        last_epoch_time = None
        Log.d('...processing rows...')
        row_process_start_time = time.time()
        gap_resolver = self.run_config['gap_resolver']
        for epoch_time, row in frame.iterrows():
            is_row_processed = False
            try:
                transaction_id = row['id']
                datasource_id = row['datasource_id']
                exchange_id = row['exchange_id']
                from_currency_id = row['from_currency_id']
                to_currency_id = row['to_currency_id']
                price = np.float64(row['price'])
                volume = np.float64(row['volume'])
                transaction_min_timestamp = epoch_time  #transaction_id + 1
                seconds_since_previous = 0 if last_epoch_time is None else epoch_time - last_epoch_time
                Log.t('seconds since previous epoch time: {}',
                      seconds_since_previous)
                if last_epoch_time is not None:
                    assert epoch_time >= last_epoch_time, 'epoch time ({}) was less than the previous epoch time ({})'.format(
                        epoch_time, last_epoch_time)

                assert seconds_since_previous >= 0, 'seconds_since_previous cannot be a negative value'
                last_epoch_time = epoch_time
                for job in self.jobs:
                    if (job.datasource.id == datasource_id
                            and job.exchange.id == exchange_id
                            and job.from_currency.id == from_currency_id
                            and job.to_currency.id == to_currency_id):
                        is_row_processed = True
                        try:
                            h5frame = job.frame
                            if h5frame is not None:  # perform integrity check on the existing, non-empty dataframe
                                assert not h5frame.empty  # should not be possible if the frame has previously been created
                                last_epoch = h5frame.index.values[-1]
                                seconds_since_previous = epoch_time - last_epoch
                                assert seconds_since_previous >= 0
                                max_gap_seconds = 120  # TODO: make this a config setting
                                if seconds_since_previous > max_gap_seconds:
                                    warn_message = 'excessive time (+{}s) passed since previous observation: {}s ({}) between {} ({}) and {} ({})'.format(
                                        max_gap_seconds,
                                        seconds_since_previous,
                                        Timespan.from_seconds(
                                            int(seconds_since_previous)
                                        ).as_string(), last_epoch,
                                        StringExpert.format_timestamp(
                                            last_epoch), epoch_time,
                                        StringExpert.format_timestamp(
                                            epoch_time))
                                    if gap_resolver is None:
                                        raise Exception(warn_message)
                                    Log.w(warn_message)
                                    prev_observation = h5frame.iloc[-1]
                                    df_intermediates = gap_resolver.intermediates_frame(
                                        max_gap_seconds,
                                        from_epoch=last_epoch,
                                        to_epoch=epoch_time,
                                        from_price=prev_observation['latest'],
                                        to_price=price,
                                        from_volume=prev_observation['volume'],
                                        to_volume=volume)
                                    Log.d(
                                        'simulating intermediate observations:\n{}',
                                        df_intermediates)
                                    simulated_count = 0
                                    for intermediate_epoch, intermediate in df_intermediates.iterrows():
                                        job_observation = job.job_observe(
                                            value=intermediate['price'],
                                            epoch_time=intermediate_epoch,
                                            volume=intermediate['volume'],
                                            is_simulated=True,
                                            is_realtime=False)
                                        assert job_observation is not None
                                        simulated_count += 1
                                        if simulated_count % 1000 == 0:
                                            Log.d('..simulated {}/{}..',
                                                  simulated_count,
                                                  len(df_intermediates))
                                    Log.i(
                                        'done simulating {} observations up until epoch {} ({})',
                                        len(df_intermediates), epoch_time,
                                        StringExpert.format_timestamp(
                                            epoch_time))
                            try:
                                job_observation = job.job_observe(
                                    value=price,
                                    epoch_time=epoch_time,
                                    volume=volume,
                                    is_simulated=False,
                                    is_realtime=is_realtime)
                                row = job_observation  # job_observation_to_frame_row(volume, job_observation)
                                assert row is not None
                                job_changed_handler(job)
                            except DoubleObservationError as doe:
                                Log.w(
                                    'epoch already in frame, will be ignored ({})',
                                    epoch_time)
                        except Exception as job_e:
                            raise Exception(
                                'Failed to feed row to job') from job_e
            except Exception as e:
                raise Exception(
                    'Failed to process row index {}'.format(epoch_time)) from e
            if is_row_processed:
                row_process_count += 1
        Log.d('...time spent processing {}/{} rows in time: {:.2f}s',
              row_process_count, frame.shape[0],
              time.time() - row_process_start_time)
        with pd.HDFStore(hdf5_filepath, mode='a') as h5:
            h5_process_start_time = time.time()
            start_observation_epoch = frame.index.values[0]
            for job in self.jobs:
                df_to_append = job.frame[
                    job.frame.index >= start_observation_epoch]
                try:
                    h5.append(job.uid,
                              df_to_append,
                              format='table',
                              data_columns=True)
                    row_count = h5.get_storer(job.uid).nrows
                    Log.d('...h5 key {}, row count is {}', job.uid, row_count)
                except Exception as append_error:
                    raise append_error
        Log.d('...time spent adding to h5: {:.2f}s',
              time.time() - h5_process_start_time)
        row_processing_time = time.time() - subset_process_start_time
        Log.d('...total time spent on subset: {:.2f}s ({:.2f}s per row)',
              row_processing_time, row_processing_time / row_process_count)
        return transaction_min_timestamp
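Because each job frame is appended in table format with data_columns=True, the store can be queried selectively afterwards; a small sketch of reading a slice back out (the key and epoch value are illustrative):

# Sketch: read one job's observations at or after a given epoch back out of the store.
with pd.HDFStore(hdf5_filepath, mode='r') as h5:
    recent = h5.select(job.uid, where='index >= {}'.format(start_observation_epoch))
    Log.d('read back {} rows for key {}', len(recent), job.uid)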