Example #1
	async def check_for_alert_match(self):
		urls = [
			'https://twitter.com/CFTC', 
			'https://twitter.com/sec_enforcement?lang=en',
			'https://twitter.com/ushouserep?lang=en'
			]
		with open('ignore-lines.json', 'r') as f:
			strip_texts = json.load(f)  # boilerplate lines to strip from the page text before matching
		Log.d('checking {} sources, ignoring {} lines..', len(urls), len(strip_texts))
		patterns = [
			r'.{,200}bitcoin.{,200}', 
			r'.{,200}crypto.{,200}', 
			r'.{,200}virtual currency.{,200}',
			]
		for url in urls:
			async with aiohttp.ClientSession() as session:
				html_text = await self.__fetch(session, url)
				text = StringExpert.strip_tags(html_text)
				text = html.unescape(text)
				for strip_text in strip_texts:
					text = text.replace(strip_text, '')
				for pattern in patterns:
					match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
					if match is not None:
						matched_line = match.group()
						warning = 'Found pattern "{}" at url "{}" in line: {}'.format(pattern, url, matched_line) 
						Log.w(warning)						
						return True
		return False
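A minimal standalone sketch (standard library only, with a hypothetical sample sentence) of the bounded-context matching used above: '.{,200}' is shorthand for '{0,200}', so each pattern captures up to 200 characters of context on either side of the keyword and the match itself doubles as the reportable line.

import re

sample = 'The agency today announced an enforcement action against a bitcoin futures platform.'  # hypothetical text
patterns = [r'.{,200}bitcoin.{,200}', r'.{,200}crypto.{,200}', r'.{,200}virtual currency.{,200}']
for pattern in patterns:
    match = re.search(pattern, sample, re.IGNORECASE | re.MULTILINE)
    if match is not None:
        # the match includes up to 200 characters of surrounding context
        print('pattern {!r} matched: {!r}'.format(pattern, match.group()))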
Example #2
 def process_h5(self):
     with pd.HDFStore(self.h5_filepath, mode='r') as h5:
         for jobuid in h5:
             is_first_encounter = jobuid not in self.job_frames
             if is_first_encounter:
                 self.job_frames[jobuid] = pd.read_hdf(
                     h5, jobuid, start=0,
                     stop=1)  # fetch first row to get the first index/epoch
             job_df = self.job_frames[jobuid]
             latest_epoch = job_df.index.values[-1]  # will ensure we don't 'miss' any rows in case the handle count jumps more than once
             where_clause = 'index > {} {}'.format(latest_epoch,
                                                   self.contraints_clause)
             new_df = pd.read_hdf(h5, jobuid, where=where_clause)
             if new_df.empty:
                 Log.w('dataset was empty for key {} and index > {}',
                       jobuid, latest_epoch)
             else:
                 assert new_df.index.values[0] > latest_epoch
                 new_first_index = 0 if is_first_encounter else len(job_df)
                 joined = pd.concat([job_df, new_df])
                 self.job_frames[jobuid] = joined
                 if len(joined) > 100000:
                     Log.w(
                         'holding a dataset of significant length ({} rows, {:.1f}mb): {}',
                         len(joined),
                         joined.memory_usage().sum() / 1_000_000, jobuid)
                 assert joined.shape[0] == len(job_df) + len(new_df)
                 self.ensure_strictly_increasing_index(
                     joined)  # TODO: remove once this is confirmed
                 self.row_handler(jobuid, joined, new_first_index)
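A self-contained sketch of the incremental-read pattern above (hypothetical file name and key; requires the PyTables package): append to a table-format store, remember the last index already seen, and fetch only newer rows with a where clause.

import pandas as pd

store_path = 'example_store.h5'  # hypothetical path
with pd.HDFStore(store_path, mode='w') as h5:
    df = pd.DataFrame({'price': [1.0, 2.0, 3.0]}, index=[100, 200, 300])
    h5.append('/job1', df, format='table', data_columns=True)

latest_epoch = 100  # last index already processed
with pd.HDFStore(store_path, mode='r') as h5:
    new_df = pd.read_hdf(h5, '/job1', where='index > {}'.format(latest_epoch))
    print(new_df)  # only the rows indexed 200 and 300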
Example #3
	def __init__(self, h5_filepath, version):
		warnings.simplefilter('ignore', NaturalNameWarning)
		h5_inputfile = Path(h5_filepath)
		output_dirpath = AppConfig.setting('PREDICTOR_DATA_DIRPATH')
		self.h5_out_filepath = os.path.join(output_dirpath, h5_inputfile.name)
		h5_out_file = Path(self.h5_out_filepath)
		if h5_out_file.exists():
			Log.i('overwrite file?: {}', h5_out_file)
			if not OsExpert.prompt_confirm('File already exists, overwrite? {}'.format(h5_out_file)):
				Log.d('user aborted, exiting')
				exit()
			Log.w('removing file: {}', h5_out_file)
			os.remove(self.h5_out_filepath)
		self.predictors_map = {}
		base_filepath = output_dirpath
		with pd.HDFStore(h5_filepath, mode='r') as h5: 	
			keys = h5.keys()
			Log.i('h5 input keys: {}', keys)
			assert len(keys) == 1, 'hardcoded restriction on single key was violated'
			for key in keys:
				Log.i('row count for {}: {}', key, h5.get_storer(key).nrows)
				self.predictors_map[key] = [
					EnsemblePredictor(min_predict_generator_size=2000, max_train_size=5000)
				]
		self.h5_watcher = H5FileWatcher(h5_filepath, self.handle_job_epoch, {'is_simulated': 0})
Example #4
	def filter_simulated_observations(self, df):
		filtered_df = df[df['is_simulated'] != 1]
		dropped = df[~df.index.isin(filtered_df.index)]
		if len(dropped) > 0:
			Log.w('filtered out {} simulated frames', len(dropped))
		else:
			Log.d('no simulated frames found to filter out')
		return filtered_df
Example #5
 def feed_jobs_forever(self, job_changed_handler):
     assert job_changed_handler is not None
     sleep_seconds = self.sleep_seconds
     transaction_min_timestamp = self.transaction_min_timestamp
     start_transaction_min_timestamp = transaction_min_timestamp
     data_dirpath = self.data_dirpath
     start_time = time.time()
     Log.i(
         'processing transactions, sleep interval {}s, starting from epoch {} ({})',
         sleep_seconds, transaction_min_timestamp,
         StringExpert.format_timestamp(transaction_min_timestamp))
     to_fetch_count = self.db.transaction_count(transaction_min_timestamp)
     Log.d('transaction count since {} ({}): {}', transaction_min_timestamp,
           StringExpert.format_timestamp(transaction_min_timestamp),
           to_fetch_count)
     pd.set_option('io.hdf.default_format', 'table')
     hdf5_filename = '{}_{}_{}.h5'.format(
         self.version.major, self.version.minor,
         datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S'))
     hdf5_filepath = path.join(data_dirpath, hdf5_filename)
     Log.i('hdf5 output filepath is: \n{}', hdf5_filepath)
     set_size = 1000
     fetch_count = 0
     plot_time = time.time()
     is_realtime = False
     while True:
         try:
             next_transaction_min_timestamp = self.process_transaction_subset(
                 transaction_min_timestamp, set_size, hdf5_filepath,
                 job_changed_handler, is_realtime)
             if next_transaction_min_timestamp is None:
                 Log.d('nothing to process, waiting..')
                 is_realtime = True  # TODO: empty polling perhaps not the best indicator of switch to realtime
                 time.sleep(sleep_seconds)
             else:
                 assert next_transaction_min_timestamp > transaction_min_timestamp, 'next minimum timestamp was not greater than the current timestamp'
                 transaction_min_timestamp = next_transaction_min_timestamp
                 fetch_count += set_size
                 percentage = 100 * fetch_count / to_fetch_count
                 current_time = time.time()
                 Log.d(
                     'processed {}/{}, {}%, spent {} on the period {} ({}) to {} ({})',
                     fetch_count, to_fetch_count, int(percentage),
                     Timespan.from_seconds(int(current_time -
                                               start_time)).as_string(),
                     StringExpert.format_timestamp(
                         start_transaction_min_timestamp),
                     start_transaction_min_timestamp,
                     StringExpert.format_timestamp(
                         transaction_min_timestamp),
                     transaction_min_timestamp)
         except Exception as e:
             raise Exception(
                 'Failed to process nonparsed api responses') from e
     Log.w('all rows read, but should loop forever')  # unreachable: the while-loop above never breaks
Example #6
	def __predict(self, df):
		max_prediction_count = 100
		if self.predict_count >= max_prediction_count:
			Log.w('too many predictions {} reached, exiting', self.predict_count)
			exit()
		assert len(df) == 1
		X_all, y_all = self.frame_to_ml_inputs(df)
		predict_row = X_all.iloc[0]
		Log.d('predicting based on {} values:\n{}', len(predict_row.values), predict_row.squeeze().sort_index())
		prediction_response = self.predictor.predict(predict_row.values)
		prediction = self.sagemaker_response_highest_score_label(prediction_response)
		self.predict_count += 1
		return prediction 
Example #7
	def handle_job_epoch(self, jobuid, df, start_index):
		Log.d('handling block starting at index {} for key: {}', start_index, jobuid)
		try:
			df = self.filter_simulated_observations(df)
			if len(df) == 0:	
				Log.w('no rows to process')
				return
			handle_start_time = time.time()
			new_df = df.iloc[start_index:]#.copy()
			if new_df.empty:
				Log.w('nothing to process (zero rows) starting from index {}', start_index)
				return
			indices = new_df.index.values #list(range(start_index, len(df)))
			index_count = len(new_df)
			predictors = self.predictors_map[jobuid]
			Log.d('feeding indices, epochs [{}..{}] ({} rows) to {} predictors for key: {}', indices[0], indices[-1], len(indices), len(predictors), jobuid)
			processed_count = 0
			prediction_count = 0
			for epoch, row in new_df.iterrows():
				for predictor in predictors:
					try:
						prediction = predictor.process(epoch, df)
						if prediction is not None:
							col_name = predictor.prefix
							new_df.at[epoch, col_name] = prediction
							prediction_count += 1
							self.print_acc(new_df)
							Log.d('prediction: {}', prediction)
							Log.d('prediction {} received, sleeping..', prediction_count)
							time.sleep(3)
					except Exception as predict_error:
						raise Exception('Failed to feed epoch {} to predictor {}'.format(epoch, type(predictor).__name__)) from predict_error
				processed_count += 1
				if processed_count % max(1, index_count // 10) == 0:
					percentage_processed = 100 * processed_count / index_count
					Log.d('..processed {}/{} {:.1f}%', processed_count, index_count, percentage_processed)
			Log.d('fed predictors on {} new rows (now {} in total) in {:.1f}s', index_count, len(df), time.time() - handle_start_time) 
			try:	
				h5_start_time = time.time()
				with pd.HDFStore(self.h5_out_filepath, mode='a') as h5: 		
					h5.append(jobuid, new_df, format='table', data_columns=True)
					row_count = h5.get_storer(jobuid).nrows
					Log.d('h5 row count is now: {}', row_count)
				Log.d('appended {}/{} rows to hdf5 in {:.1f}s', index_count, len(df), time.time() - h5_start_time) 
			except Exception as h5e:
				raise Exception('Failed to write to hdf file') from h5e
		except Exception as e:
			raise Exception('Failed to handle epoch') from e
		Log.d('finished handling block')
Example #8
 def parse_and_persist_as_transaction_maybe(datafetch_api_response, parser,
                                            db):
     try:
         transaction = ParseUtil.__parse_and_persist_as_transaction(
             datafetch_api_response, parser, db)
     except DuplicateInsertException as e:
         Log.w('db rejected transaction as a duplicate: {}',
               datafetch_api_response)
         return False
     except Exception as e:
         Log.e(
             'Failed to parse and store transaction from api_response: {}',
             datafetch_api_response)
         raise e
     return True
Example #9
 def __run(self):
     Log.d('Watching file: {}', self.h5_filepath)
     thread = FileWatcher(self.h5_filepath,
                          modified=self.handle_change).run_async()
     try:
         while self.handle_event.wait():
             if self.last_handle_count is not None:
                 jump_count = self.handle_count - self.last_handle_count
                 if jump_count > 1:
                     Log.w(
                         'handle count has jumped {} times (more than once) since the last processing',
                         jump_count)
             self.last_handle_count = self.handle_count
             self.process_h5()
             self.handle_event.clear()
     finally:
         Log.w('run loop broken, unwatching file: {}', self.h5_filepath)
         thread.stop()
         thread.join()
Example #10
			def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
				"""From SM examples. Like here: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb"""
				file = Path(filepath)
				s3 = boto3.resource('s3')
				key = channel + '/' + file.name
				bucket_ref = s3.Bucket(bucket)
				objs = list(bucket_ref.objects.filter(Prefix=key))
				is_file_already_existing = len(objs) > 0 and objs[0].key == key
				if is_file_already_existing:
					if skip_if_name_and_size_matches:
						s3_client = boto3.client('s3')
						response = s3_client.head_object(Bucket=bucket, Key=key)
						local_size = file.stat().st_size
						remote_size = response['ContentLength']
						if remote_size == local_size:
							Log.w('skipping upload as s3 key of same size ({:.2f}kb) already exists: {}', local_size/1000, key)
							return
					Log.w('overwriting existing s3 key: {}', key)
				with open(filepath, "rb") as data:
					bucket_ref.put_object(Key=key, Body=data)
Example #11
 def __verify_datafetch_apis_write_frequency(self):
     Log.d('watcher check initiating')
     datafetch_apis_frame = self.db.datafetch_api_view_frame()
     if datafetch_apis_frame.empty:
         Log.d('no datafetch apis to watch')
     else:
         exceed_count = 0
         for i, row in datafetch_apis_frame.iterrows():
             datafetch_api_id = row['id']
             write_idle_seconds = row['write_idle_seconds']
             result_frequency_seconds = row['result_frequency_seconds']
             if write_idle_seconds > result_frequency_seconds:
                 exceed_count += 1
                 idle_time_str = Timespan.from_seconds(
                     write_idle_seconds).as_string()
                 warn_message = 'datafetch api id {} has exceeded its {} second limit (idle time {})'.format(
                     datafetch_api_id, result_frequency_seconds,
                     idle_time_str)
                 Log.w(warn_message)
         Log.d('watch check done, exceed count: {}', exceed_count)
Example #12
	def write_csv(self, df):
		if self.write_count > 0:
			Log.w('ignoring csv write because it has already been performed')
			return
		X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True)
		assert len(X_all) == len(y_all)
		if X_all.empty:
			Log.w('no rows to write!')
			return
		y_null_count = y_all.isnull().sum()
		assert y_null_count == 0, 'null count: {}'.format(y_null_count)
		X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=123)
		Log.d('X train shape: {}, X test shape: {}', X_train.shape, X_test.shape)
		train = pd.concat([X_train, y_train], axis=1)
		test = pd.concat([X_test, y_test], axis=1)
		is_first_write = (self.write_count == 0)
		for frame, filepath in ((train, self.train_filepath), (test, self.test_filepath)):
			Log.d('writing csv: {}', filepath)
			frame.to_csv(filepath, sep=',', na_rep='', index=False, header=is_first_write, decimal='.', mode='a')
		with open(self.meta_filepath, 'w') as f:
			f.write(json.dumps(
				{
					'train_filename': Path(self.train_filepath).name,
					'test_filename': Path(self.test_filepath).name,
					'train_observation_count': len(X_train),
					'test_observation_count': len(X_test),
					'feature_count': X_all.shape[1]
				},
				indent=4#, sort_keys=True
				))
		self.write_count += 1
		Log.i('done writing csv file, write count is now: {}', self.write_count)
		if self.is_train_async is True:
			Log.d('propagating notification that csv has been written')
			self.csv_changed_event.set()
		else:
			self.create_predictor_from_csv()
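A minimal standalone sketch (hypothetical feature and label frames) of the split-and-write pattern above: hold out 20% for test, concatenate features with the label column, and append to CSV with the header emitted only on the first write.

import pandas as pd
from sklearn.model_selection import train_test_split

X_all = pd.DataFrame({'f1': range(10), 'f2': range(10, 20)})
y_all = pd.Series([i % 2 for i in range(10)], name='label')
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=123)
is_first_write = True
for frame, filepath in ((pd.concat([X_train, y_train], axis=1), 'train.csv'),
                        (pd.concat([X_test, y_test], axis=1), 'test.csv')):
    frame.to_csv(filepath, sep=',', na_rep='', index=False, header=is_first_write, mode='a')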
Example #13
	def __init__(self, csv_filepath, min_row_count, is_train_async, min_predict_generator_size):
		super().__init__(predict_col='feature_rtrspc()_next_trend_pricefeature')
		csv_file = Path(csv_filepath)
		assert csv_file.is_dir(), csv_filepath
		self.csv_filepath = csv_filepath
		self.write_count = 0
		self.predict_count = 0
		self.train_filepath = os.path.join(csv_filepath, 'sagemaker_train.csv')
		self.test_filepath  = os.path.join(csv_filepath, 'sagemaker_test.csv')
		self.meta_filepath  = os.path.join(csv_filepath, 'sagemaker.json')
		if Path(self.train_filepath).exists():
			os.remove(self.train_filepath)
		if Path(self.test_filepath).exists():
			os.remove(self.test_filepath)
		if Path(self.meta_filepath).exists():
			os.remove(self.meta_filepath)
		self.predictor = None
		self.is_train_async = is_train_async
		self.csv_changed_event = Event() if is_train_async else None
		self.min_predict_generator_size = min_predict_generator_size 
		if is_train_async is True:
			Log.w('async training (is_train_async=True) is not supported yet')
			exit()
			Thread(target=self.csv_write_event_handler).start()  # unreachable until async training is supported
Example #14
    def process_transaction_subset(self, transaction_min_timestamp, set_size,
                                   hdf5_filepath, job_changed_handler,
                                   is_realtime):
        assert job_changed_handler is not None, 'no job_changed_handler provided'
        window_size = 10
        subset_process_start_time = time.time()
        frame = self.db.transaction_by_timestamp_frame(
            transaction_min_timestamp, set_size, self.from_currency_ids,
            self.to_currency_ids)
        frame.set_index('epoch_time', inplace=True)
        row_count = frame.shape[0]
        Log.d('...time spent fetching subset ({} rows) from db: {:.2f}s',
              row_count,
              time.time() - subset_process_start_time)
        if row_count == 0:
            return None
        row_process_count = 0
        last_epoch_time = None
        Log.d('...processing rows...')
        row_process_start_time = time.time()
        gap_resolver = self.run_config['gap_resolver']
        for epoch_time, row in frame.iterrows():
            is_row_processed = False
            try:
                transaction_id = row['id']
                datasource_id = row['datasource_id']
                exchange_id = row['exchange_id']
                from_currency_id = row['from_currency_id']
                to_currency_id = row['to_currency_id']
                price = np.float64(row['price'])
                volume = np.float64(row['volume'])
                transaction_min_timestamp = epoch_time
                seconds_since_previous = 0 if last_epoch_time is None else epoch_time - last_epoch_time
                Log.t('seconds since previous epoch time: {}',
                      seconds_since_previous)
                if last_epoch_time is not None:
                    assert epoch_time >= last_epoch_time, 'epoch time ({}) was less than the previous epoch time ({})'.format(
                        epoch_time, last_epoch_time)
                assert seconds_since_previous >= 0, 'seconds_since_previous cannot be a negative value'
                last_epoch_time = epoch_time
                for job in self.jobs:
                    if (job.datasource.id == datasource_id
                            and job.exchange.id == exchange_id
                            and job.from_currency.id == from_currency_id
                            and job.to_currency.id == to_currency_id):
                        is_row_processed = True
                        try:
                            h5frame = job.frame
                            if h5frame is not None:  # perform integrity check on existing, non-empty dataframe
                                assert not h5frame.empty  # should not be possible if the frame has previously been created
                                last_epoch = h5frame.index.values[-1]
                                seconds_since_previous = epoch_time - last_epoch
                                assert seconds_since_previous >= 0
                                max_gap_seconds = 120  # TODO: make a config setting
                                if seconds_since_previous > max_gap_seconds:
                                    warn_message = 'excessive time (+{}s) passed since previous observation: {}s ({}) between {} ({}) and {} ({})'.format(
                                        max_gap_seconds,
                                        seconds_since_previous,
                                        Timespan.from_seconds(
                                            int(seconds_since_previous)
                                        ).as_string(), last_epoch,
                                        StringExpert.format_timestamp(
                                            last_epoch), epoch_time,
                                        StringExpert.format_timestamp(
                                            epoch_time))
                                    if gap_resolver is None:
                                        raise Exception(warn_message)
                                    Log.w(warn_message)
                                    prev_observation = h5frame.iloc[-1]
                                    df_intermediates = gap_resolver.intermediates_frame(
                                        max_gap_seconds,
                                        from_epoch=last_epoch,
                                        to_epoch=epoch_time,
                                        from_price=prev_observation['latest'],
                                        to_price=price,
                                        from_volume=prev_observation['volume'],
                                        to_volume=volume)
                                    Log.d(
                                        'simulating intermediate observations:\n{}',
                                        df_intermediates)
                                    simulated_count = 0
                                    for intermediate_epoch, intermediate in df_intermediates.iterrows(
                                    ):
                                        job_observation = job.job_observe(
                                            value=intermediate['price'],
                                            epoch_time=intermediate_epoch,
                                            volume=intermediate['volume'],
                                            is_simulated=True,
                                            is_realtime=False)
                                        assert job_observation is not None
                                        simulated_count += 1
                                        if simulated_count % 1000 == 0:
                                            Log.d('..simulated {}/{}..',
                                                  simulated_count,
                                                  len(df_intermediates))
                                    Log.i(
                                        'done simulating {} observations up until epoch {} ({})',
                                        len(df_intermediates), epoch_time,
                                        StringExpert.format_timestamp(
                                            epoch_time))
                            try:
                                job_observation = job.job_observe(
                                    value=price,
                                    epoch_time=epoch_time,
                                    volume=volume,
                                    is_simulated=False,
                                    is_realtime=is_realtime)
                                row = job_observation  # job_observation_to_frame_row(volume, job_observation)
                                assert row is not None
                                job_changed_handler(job)
                            except DoubleObservationError as doe:
                                Log.w(
                                    'epoch already in frame, will be ignored ({})',
                                    epoch_time)
                        except Exception as job_e:
                            raise Exception(
                                'Failed to feed row to job') from job_e
            except Exception as e:
                raise Exception(
                    'Failed to process row index {}'.format(epoch_time)) from e
            if is_row_processed:
                row_process_count += 1
        Log.d('...time spent processing {}/{} rows in time: {:.2f}s',
              row_process_count, frame.shape[0],
              time.time() - row_process_start_time)
        with pd.HDFStore(hdf5_filepath, mode='a') as h5:
            h5_process_start_time = time.time()
            start_observation_epoch = frame.index.values[0]
            for job in self.jobs:
                df_to_append = job.frame[
                    job.frame.index >= start_observation_epoch]
                try:
                    h5.append(job.uid,
                              df_to_append,
                              format='table',
                              data_columns=True)
                    row_count = h5.get_storer(job.uid).nrows
                    Log.d('...h5 key {}, row count is {}', job.uid, row_count)
                except Exception as append_error:
                    raise Exception('Failed to append rows for job {} to h5'.format(
                        job.uid)) from append_error
        Log.d('...time spent adding to h5: {:.2f}s',
              time.time() - h5_process_start_time)
        row_processing_time = time.time() - subset_process_start_time
        Log.d('...total time spent on subset: {:.2f}s ({:.2f}s per row)',
              row_processing_time, row_processing_time / max(1, row_process_count))
        return transaction_min_timestamp
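The gap_resolver.intermediates_frame call above is project-specific. A hypothetical sketch of what such a resolver could return, linearly interpolating price and volume between the two observed epochs at a fixed step (function name, signature, and step interpretation are assumptions):

import numpy as np
import pandas as pd

def intermediates_frame(step_seconds, from_epoch, to_epoch, from_price, to_price, from_volume, to_volume):
    # one simulated observation every step_seconds, strictly between the two real observations
    epochs = np.arange(from_epoch + step_seconds, to_epoch, step_seconds)
    fraction = (epochs - from_epoch) / (to_epoch - from_epoch)
    return pd.DataFrame({
        'price': from_price + fraction * (to_price - from_price),
        'volume': from_volume + fraction * (to_volume - from_volume),
    }, index=epochs)

print(intermediates_frame(120, from_epoch=0, to_epoch=600, from_price=10.0, to_price=20.0, from_volume=1.0, to_volume=2.0))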
Example #15
 def frame_to_ml_inputs(self,
                        df,
                        do_filter=False,
                        one_hot=True,
                        max_train_size=None):
     assert self.predict_col in df, 'prediction column "{}"  does not exist in columns {}'.format(
         self.predict_col, df.columns.values)
     predict_col = next(c for c in df.columns.values
                        if c == self.predict_col)
     active_columns = [
         c for c in df.columns.values if c.endswith('_active')
     ]
     if do_filter is True:  #TODO: are empty feature rows filtered out??
         Log.d(
             'filtering out rows with all feature values empty or predictor column empty, if needed'
         )
         df_no_all_empty_feature_values = df.loc[(
             df[active_columns] == 1).all(axis=1)]
         dropped = df[~df.index.isin(df_no_all_empty_feature_values.index)]
         if len(dropped) > 0:
             Log.w('all-cols-empty drop row count: {}', len(dropped))
         df_no_empty_predict = df_no_all_empty_feature_values.dropna(
             subset=[predict_col])  # no point in using empty rows
         dropped = df_no_all_empty_feature_values[
             ~df_no_all_empty_feature_values.index.isin(df_no_empty_predict.
                                                        index)]
         if len(dropped) > 0:
             Log.w('empty-predict-col drop row count: {}', len(dropped))
         df_filtered = df_no_empty_predict
         keep_ratio = len(df_filtered) / len(df)
         Log.d('frame row ratio after filtering: {}/{} = {:.2f}',
               len(df_filtered), len(df), keep_ratio)
         assert keep_ratio < 1
     else:
         df_filtered = df
     if max_train_size is not None:
         count_before = len(df_filtered)
         df_filtered = df_filtered.head(max_train_size)
         drop_count = count_before - len(df_filtered)
         Log.d('row count after train max size trim: {} - {} = {}',
               count_before, drop_count, len(df_filtered))
     float_feature_cols = set(
         c for c in df.columns.values
         if not c.endswith(predict_col) and c.endswith('_feature'))
     category_feature_cols = set()
     for feature, postfix_pattern in feature_postfix_map.items():
         assert issubclass(
             feature,
             IntEnum), 'feature {} is not an int enum'.format(feature)
         cols = [
             c for c in df.columns.values
             if not c.endswith(predict_col) and postfix_pattern.match(c)
         ]
         enum_values = [int(k) for k in feature]
         if one_hot is not True:
             category_feature_cols.update(cols)
         else:
             kwargs = {}
             for col in cols:
                 for enum_value in enum_values:
                     enum_name = feature(enum_value).name
                     enum_value_col = '{}_{}'.format(col, enum_name.lower())
                     kwargs[enum_value_col] = (
                         df_filtered[col] == enum_value).astype(np.float64)
                     category_feature_cols.add(enum_value_col)
             df_filtered = df_filtered.assign(**kwargs)
     feature_cols = list(float_feature_cols | category_feature_cols)
     assert len(feature_cols) > 0
     y_all = df_filtered[predict_col]
     assert not y_all.isnull().values.any(
     ), 'one or more values in the predict series were not specified'
     X_all = df_filtered[feature_cols]
     X_all = X_all.fillna(0)
     assert not X_all.isnull().values.any(), 'one or more values in the input frame were not specified, although they should\'ve been na-filled with zeros'
     return X_all, y_all
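A minimal standalone sketch (hypothetical Trend enum and column name) of the manual one-hot expansion used above: every enum value gets its own 0/1 float column derived from the original categorical column.

from enum import IntEnum

import numpy as np
import pandas as pd

class Trend(IntEnum):  # hypothetical categorical feature
    DOWN = 0
    FLAT = 1
    UP = 2

df = pd.DataFrame({'trend_category': [Trend.UP, Trend.DOWN, Trend.FLAT]})
kwargs = {}
for enum_value in [int(k) for k in Trend]:
    enum_name = Trend(enum_value).name
    one_hot_col = '{}_{}'.format('trend_category', enum_name.lower())
    kwargs[one_hot_col] = (df['trend_category'] == enum_value).astype(np.float64)
df = df.assign(**kwargs)
print(df)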
Example #16
	def __create_predictor(self, df):
		Log.i('creating predictor on {} rows', len(df))
		assert not df.empty
		kfold = StratifiedKFold(n_splits=10)
		random_state = 2
		classifiers = []
		classifiers.append(SVC(random_state=random_state))
		classifiers.append(DecisionTreeClassifier(random_state=random_state))
		classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1))
		classifiers.append(RandomForestClassifier(random_state=random_state))
		classifiers.append(ExtraTreesClassifier(random_state=random_state))
		classifiers.append(GradientBoostingClassifier(random_state=random_state))
		classifiers.append(MLPClassifier(random_state=random_state))
		classifiers.append(KNeighborsClassifier())
		classifiers.append(LogisticRegression(random_state = random_state))
		classifiers.append(LinearDiscriminantAnalysis())
		X_all, y_all = self.frame_to_ml_inputs(df, do_filter=True, max_train_size=self.max_train_size)
		if X_all.empty:
			Log.w('could not create predictor as the preprocessing resulted in an empty dataframe')
			return
		X_train, X_test, Y_train, Y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=random_state)
		Log.d('train shape: X: {}, y: {}', X_train.shape, Y_train.shape)
		cv_results = []
		for classifier in classifiers :
			Log.d('performing cross val score for predictor {}', classifier)
			start_time = datetime.now()
			cv_results.append(
				cross_val_score(classifier, X_train, y = Y_train, scoring = 'accuracy', cv = kfold, n_jobs=core_count)
			)
			Log.d('..done, time spent: {}', datetime.now() - start_time)
		cv_means = []
		cv_std = []
		for cv_result in cv_results:
			cv_means.append(cv_result.mean())
			cv_std.append(cv_result.std())
		cv_res = pd.DataFrame({
			'CrossValMeans': cv_means,
			'CrossValerrors': cv_std,
			'Algorithm': [
				'SVC',
				'DecisionTree',
				'AdaBoost',
				'RandomForest',
				'ExtraTrees',
				'GradientBoosting',
				'MultipleLayerPerceptron',
				'KNeighbors',
				'LogisticRegression',
				'LinearDiscriminantAnalysis'
				]})
		Log.d('cross val results:\n{}', cv_res)
		g = sns.barplot(x='CrossValMeans', y='Algorithm', data=cv_res, palette='Set3', orient='h', xerr=cv_std)
		g.set_xlabel('Mean Accuracy')
		g = g.set_title('Cross validation scores')
		Log.i('saving plot..')
		plt.savefig('!eb1_cross_val_score.png', edgecolor='none', format="png") 
		DTC = DecisionTreeClassifier()
		adaDTC = AdaBoostClassifier(DTC, random_state=7)
		ada_param_grid = {'base_estimator__criterion' : ['gini', 'entropy'],
					  'base_estimator__splitter' :   ['best', 'random'],
					  'algorithm' : ['SAMME','SAMME.R'],
					  'n_estimators' :[1,2],
					  'learning_rate':  [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1.5]}
		gsadaDTC = GridSearchCV(adaDTC,param_grid = ada_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsadaDTC.fit(X_train,Y_train)
		ada_best = gsadaDTC.best_estimator_
		Log.d('gsadaDTC.best_score_: {}', gsadaDTC.best_score_)
		ExtC = ExtraTreesClassifier()
		ex_param_grid = {'max_depth': [None],
					  'max_features': [1, 3, 10],
					  'min_samples_split': [2, 3, 10],
					  'min_samples_leaf': [1, 3, 10],
					  'bootstrap': [False],
					  'n_estimators' :[100,300],
					  'criterion': ['gini']}
		gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsExtC.fit(X_train,Y_train)
		ExtC_best = gsExtC.best_estimator_
		Log.d('gsExtC.best_score_: {}', gsExtC.best_score_)
		RFC = RandomForestClassifier()
		rf_param_grid = {'max_depth': [None],
					  'max_features': [1, 3, 10],
					  'min_samples_split': [2, 3, 10],
					  'min_samples_leaf': [1, 3, 10],
					  'bootstrap': [False],
					  'n_estimators' :[100,300],
					  'criterion': ['gini']}
		gsRFC = GridSearchCV(RFC,param_grid = rf_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsRFC.fit(X_train,Y_train)
		RFC_best = gsRFC.best_estimator_
		Log.d('gsRFC.best_score_: {}', gsRFC.best_score_)
		GBC = GradientBoostingClassifier()
		gb_param_grid = {'loss' : ['deviance'],
					  'n_estimators' : [100,200,300],
					  'learning_rate': [0.1, 0.05, 0.01],
					  'max_depth': [4, 8],
					  'min_samples_leaf': [100,150],
					  'max_features': [0.3, 0.1] 
					  }
		gsGBC = GridSearchCV(GBC,param_grid = gb_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsGBC.fit(X_train,Y_train)
		GBC_best = gsGBC.best_estimator_
		Log.d('gsGBC.best_score_: {}', gsGBC.best_score_)
		SVMC = SVC(probability=True)
		svc_param_grid = {'kernel': ['rbf'], 
						  'gamma': [ 0.001, 0.01, 0.1, 1],
						  'C': [1, 10, 50, 100,200,300, 1000]}
		gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=kfold, scoring='accuracy', n_jobs=core_count, verbose = 1)
		gsSVMC.fit(X_train,Y_train)
		SVMC_best = gsSVMC.best_estimator_
		Log.d('gsSVMC.best_score_: {}', gsSVMC.best_score_)
		Log.w('quitting')
		exit()
Example #17
def h5_to_plot(h5, from_epoch, to_epoch, filterInNth, agents, format_as_image):
    Log.d('============')
    Log.d(agents)
    agent_keys = [a for a in agents.split(',') if a]
    if len(agent_keys) == 0:
        return 'No agent selected'
    filterInNth = int(filterInNth)
    df_info = ''
    pd.options.display.float_format = '{:.2f}'.format
    df_info += 'h5 store info:\n\n{}\n\n'.format(h5.info())
    for key in h5:
        where = 'index >= {} and index <= {}'.format(from_epoch, to_epoch)
        Log.d('where: {}', where)
        frame = pd.read_hdf(h5, key, where=where)
        if frame.empty:
            return 'Empty frame'
        df_info += '{}\n\n'.format(frame.describe())
        background_color = '#272822'
        minute_intervals = [
            12 * 60,  # 12 hours
        ]
        x = range(100)
        y = [a * 2 + random.randint(-20, 20) for a in x]
        fig, ax = plt.subplots(figsize=(23, 12))  #figsize=(28,21))
        fig.patch.set_facecolor(background_color)
        Log.t('building plot')
        is_image_format = int(format_as_image) == 1

        def label_connect(path_collection, labels, color=None):
            tooltip = mpld3.plugins.PointHTMLTooltip(path_collection, [
                '<span class="point-tooltip" style="color: {}">{} <span class="point-tooltip-key">{}</span></span>'
                .format(color, l, key) for l in labels
            ],
                                                     voffset=100,
                                                     hoffset=0)
            mpld3.plugins.connect(fig, tooltip)

        for agent_key in agent_keys:
            try:
                agent_name = agent_key.split('(')[0]
                Log.d('plotting agent: {} -> {}', agent_key, agent_name)
                agent = agent_map[agent_name]
                plot_title = ''
                col_prefix = 'feature_{}_'.format(agent_key)
                agent_plot = agent.plot(plot_title,
                                        None,
                                        frame,
                                        ax,
                                        is_image_format,
                                        label_connect=label_connect,
                                        filter_in_nth=filterInNth,
                                        cp=col_prefix)
                pe.style_plot(ax, plot_title)
            except KeyError as ke:
                Log.w('Valid keys are: {}', frame.keys())
                raise ke
        plot_dirpath = AppConfig.setting('PLOT_DIRPATH')
        plot_filepath = os.path.join(plot_dirpath,
                                     '{}.png'.format('some plot'))

        fig.patch.set_facecolor(style.backgroundColor)
        fig.tight_layout()
        if is_image_format:
            sio = BytesIO()
            fig.savefig(sio,
                        facecolor=fig.get_facecolor(),
                        edgecolor='none',
                        format="png")
            html = '<img src="data:image/png;base64,{}"/>'.format(
                base64.encodebytes(sio.getvalue()).decode())
            return html
        mpld3.plugins.connect(fig, ZoomSizePlugin())
        return mpld3.fig_to_html(fig)
    raise Exception('no frames found in h5 store')
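A minimal standalone sketch of the image branch above: render a matplotlib figure into an in-memory PNG and embed it as a base64 data URI, which is what the function returns when format_as_image is set.

import base64
from io import BytesIO

import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(range(100), [a * 2 for a in range(100)])
sio = BytesIO()
fig.savefig(sio, facecolor=fig.get_facecolor(), edgecolor='none', format='png')
html = '<img src="data:image/png;base64,{}"/>'.format(base64.encodebytes(sio.getvalue()).decode())
print(html[:80] + '...')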
Example #18
 def job_observe(self, value, epoch_time, volume, is_simulated,
                 is_realtime):
     if self.interval_stat is None:
         self.interval_stat = IntervalStat(self.interval_second, epoch_time)
     if self.frame is None:  # TODO: move to constructor
         self.frame = self.emptyFrame(self.interval_stat)
     istat = self.interval_stat
     if epoch_time in self.frame.index:
         raise DoubleObservationError(epoch_time)
     is_first_value_in_interval = epoch_time >= istat.interval_end_epoch
     if is_first_value_in_interval:
         self.frame.iloc[-1, self.frame.columns.get_loc('is_closing')] = 1
         istat.reset(epoch_time)
     istat.interval_observe(value=value, epoch_time=epoch_time)
     interval_value = istat.as_dict()
     assert volume is not None
     assert isinstance(volume, np.float64)
     self.frame.at[epoch_time] = {
         **interval_value,
         'volume': volume,
         'age': 0 if self.frame.empty else epoch_time - self.frame.index.values[0],
         'is_simulated': 1 if is_simulated is True else 0,
         'is_realtime': 1 if is_realtime is True else 0
     }
     total_time_spent = 0
     for f in self.features:
         try:
             start_time = time.time()
             feature_observation = f.feature_observe(
                 epoch_time=epoch_time,
                 interval_value=interval_value,
                 history_frame=self.frame,
                 is_first_value_in_interval=is_first_value_in_interval,
                 is_reset_observation=False)
             time_spent = time.time() - start_time
             self.frame.at[epoch_time, f.col_prefix + 'time'] = time_spent
             if feature_observation is not None:
                 assert isinstance(feature_observation, (pd.Series, dict))
                 # .items() works for both pd.Series and dict (Series.iteritems() is deprecated)
                 kv_pairs = feature_observation.items()
                 for key, value in kv_pairs:
                     if value is None:
                         Log.w(
                             'feature agent returned the None value for job uid {}, feature {} and pair key {}',
                             self.uid, f.NAME, key)
                     elif value is not nan:
                         colname = f.col_prefix + key
                         assert colname in self.frame
                         self.frame.at[epoch_time, colname] = value
             total_time_spent += time_spent
         except Exception as e:
             raise Exception('Failed to feed value to feature "{}"'.format(
                 f.col_prefix)) from e
         self.frame.at[epoch_time, 'time'] = total_time_spent
     if len(self.frame) % 500 == 0:
         self.print_stats()
     return self.frame.loc[epoch_time]