def __init__(self):
    # logging.info('Beginning initialization of distributed, streaming anomaly detection server')
    begin_time = time.time()

    # rrcf classifier parameters
    # TODO: tune forest parameters
    self.TREE_SIZE = 50
    self.NUM_TREES = 100
    training_data_dir = 'training_data'

    # Simon model parameters
    self.maxlen = 200
    self.max_cells = 100
    checkpoint_dir = 'deployed_checkpoints/'

    # instantiate Simon feature model
    config = Simon({}).load_config(MODEL_OBJECT, checkpoint_dir)
    self.encoder = config['encoder']
    Classifier = Simon(encoder=self.encoder)
    self.model = Classifier.generate_feature_model(
        self.maxlen, self.max_cells, len(self.encoder.categories),
        checkpoint_dir, config)
    self.model._make_predict_function()

    # dictionary to store separate models by account
    self.classifiers = {}
def traverse_files_simon(datapath):
    logging.debug('Parsing historical emails as text from raw json files...\n')
    accounts_to_emails = {}
    accounts_to_times = {}
    maxlen = 200
    max_cells = 100

    # load the Simon feature model once, rather than reloading it for every file
    checkpoint_dir = "../../NK-email-classifier/deployed_checkpoints/"
    config = Simon({}).load_config('text-class.10-0.42.pkl', checkpoint_dir)
    encoder = config['encoder']
    Classifier = Simon(encoder=encoder)
    model = Classifier.generate_feature_model(
        maxlen, max_cells, len(encoder.categories), checkpoint_dir, config)

    for path, _, files in os.walk(datapath):
        for file in files:
            if re.match(r".*\.jsonl$", file):
                fullpath = os.path.join(path, file)
                df, accounts_to_times = parse_emails_simon(
                    accounts_to_times, datapath=fullpath)
                # truncate to max_cells rows (.ix is deprecated; use .iloc)
                raw_data = np.asarray(df.iloc[:max_cells, :])
                raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

                # produce simon feature vector; encode the raw text before
                # prediction (the original allocated X as a -1 placeholder
                # array and never filled it with encoded data)
                print(f'producing Simon feature vectors for {fullpath}')
                X = encoder.x_encode(raw_data, maxlen)
                accounts_to_emails[file] = model.predict(X)
    return accounts_to_emails, accounts_to_times
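# Usage sketch (not part of the original module): the directory path and
# logging setup below are illustrative assumptions.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    accounts_to_emails, accounts_to_times = traverse_files_simon('historical_email_data/')
    for account, feature_vectors in accounts_to_emails.items():
        print(f'{account}: featurized {feature_vectors.shape[0]} emails')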
def main(datapath, email_index, execution_config, DEBUG):
    # set important parameters
    maxlen = 20
    max_cells = 500
    checkpoint_dir = "pretrained_models/"
    with open(checkpoint_dir + 'Categories_base.txt', 'r') as f:
        Categories = f.read().splitlines()
    category_count = len(Categories)

    # load specified execution configuration
    if execution_config is None:
        raise TypeError("execution_config is required to load the pretrained model")
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    intermediate_model = Classifier.generate_feature_model(
        maxlen, max_cells, category_count, checkpoint_dir, config, DEBUG=DEBUG)

    # load sample email
    with open(datapath) as data_file:
        emails = data_file.readlines()
    sample_email = json.loads(emails[int(email_index)])['body']
    if DEBUG:
        print('DEBUG::sample email:')
        print(sample_email)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sample_email_sentence = tokenizer.tokenize(sample_email)
    # truncate each sentence to the last maxlen characters
    sample_email_sentence = [elem[-maxlen:] for elem in sample_email_sentence]
    all_email_df = pd.DataFrame(sample_email_sentence, columns=['Email 0'])
    if DEBUG:
        print('DEBUG::the final shape is:')
        print(all_email_df.shape)
    all_email_df = all_email_df.astype(str)
    # truncate to max_cells rows (.ix is deprecated; use .iloc)
    raw_data = np.asarray(all_email_df.iloc[:max_cells, :])
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # encode data
    X = encoder.x_encode(raw_data, maxlen)

    # generate features for email
    y = intermediate_model.predict(X)

    # discard empty column edge case
    y[np.all(all_email_df.isnull(), axis=0)] = 0

    # print and return result
    print('\n128-d Simon Feature Vector:\n')
    print(y[0])
    return y[0]
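# Invocation sketch, assuming this script is run standalone; the data path,
# email index, and config name ('Base.pkl', as used elsewhere in this repo)
# are illustrative placeholders, not values from the original script.
if __name__ == '__main__':
    feature_vector = main(datapath='emails.jsonl', email_index=0,
                          execution_config='Base.pkl', DEBUG=True)
    print(feature_vector.shape)  # expect a 128-d Simon feature vector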
def main(checkpoint, data_count, data_cols, should_train, nb_epoch, null_pct,
         try_reuse_data, batch_size, execution_config):
    maxlen = 20
    max_cells = 500
    p_threshold = 0.5

    checkpoint_dir = "pretrained_models/"
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    with open('Categories.txt', 'r') as f:
        Categories = f.read().splitlines()

    # orient the user a bit
    print("fixed categories are: ")
    Categories = sorted(Categories)
    print(Categories)

    raw_data, header = DataGenerator.gen_test_data(
        (data_count, data_cols), try_reuse_data)
    print(raw_data)

    # transpose the data
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # do other processing and encode the data
    if null_pct > 0:
        DataGenerator.add_nulls_uniform(raw_data, null_pct)

    config = {}
    if not should_train:
        if execution_config is None:
            raise TypeError("execution_config is required when should_train is False")
        config = Simon({}).load_config(execution_config, checkpoint_dir)
        encoder = config['encoder']
        if checkpoint is None:
            checkpoint = config['checkpoint']
    else:
        encoder = Encoder(categories=Categories)
        encoder.process(raw_data, max_cells)

    # encode the data
    X, y = encoder.encode_data(raw_data, header, maxlen)
    max_cells = encoder.cur_max_cells

    Classifier = Simon(encoder=encoder)
    data = None
    if should_train:
        data = Classifier.setup_test_sets(X, y)
    else:
        data = type('data_type', (object,), {'X_test': X, 'y_test': y})
    print('Sample chars in X:{}'.format(X[2, 0:10]))
    print('y:{}'.format(y[2]))

    # need to know number of fixed categories to create model
    category_count = y.shape[1]
    print('Number of fixed categories is :')
    print(category_count)

    model = Classifier.generate_model(maxlen, max_cells, category_count)
    Classifier.load_weights(checkpoint, config, model, checkpoint_dir)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam', metrics=['binary_accuracy'])
    if should_train:
        start = time.time()
        history = Classifier.train_model(
            batch_size, checkpoint_dir, model, nb_epoch, data)
        end = time.time()
        print("Time for training is %f sec" % (end - start))
        config = {
            'encoder': encoder,
            'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)
        }
        Classifier.save_config(config, checkpoint_dir)
        Classifier.plot_loss(history)  # comment out on docker images...

    pred_headers = Classifier.evaluate_model(
        max_cells, model, data, encoder, p_threshold)
    print("DEBUG::The predicted headers are:")
    print(pred_headers)
    print("DEBUG::The actual headers are:")
    print(header)
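# Example driver, a sketch only: the argument values are assumptions chosen to
# exercise the training path on a small synthetic dataset.
if __name__ == '__main__':
    main(checkpoint=None, data_count=100, data_cols=10, should_train=True,
         nb_epoch=5, null_pct=0.1, try_reuse_data=False, batch_size=16,
         execution_config=None)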
SpamEmails = DataLengthStandardizerRaw(SpamEmails, max_cells)
# print("Ten Spam emails after Processing (in DataFrame form) are:")
# print((SpamEmails[:10]))
print("Spam email dataframe after Processing shape:")
print(SpamEmails.shape)

# orient the user a bit
with open('pretrained_models/Categories.txt', 'r') as f:
    Categories = f.read().splitlines()
print("former categories are: ")
Categories = sorted(Categories)
print(Categories)
category_count_prior = len(Categories)

# Load pretrained model via specified execution configuration
Classifier = Simon(encoder={})  # dummy text classifier
config = Classifier.load_config(execution_config, checkpoint_dir)
encoder = config['encoder']
checkpoint = config['checkpoint']

# Encode labels and data
Categories = ['spam', 'notspam']
category_count = len(Categories)
encoder.categories = Categories
header = [['spam']] * Nsamp
header.extend([['notspam']] * Nsamp)
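# Sanity check on the label layout (illustrative, assuming Nsamp = 2): each
# email column gets a one-element label list, all spam labels first.
example_header = [['spam']] * 2
example_header.extend([['notspam']] * 2)
assert example_header == [['spam'], ['spam'], ['notspam'], ['notspam']]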
def __init__(self):
    logging.info('Beginning initialization of distributed, streaming anomaly detection server')
    begin_time = time.time()

    # rrcf classifier parameters
    # TODO: tune forest parameters
    self.TREE_SIZE = 50
    self.NUM_TREES = 100
    training_data_dir = 'training_data'

    # Simon model parameters
    self.maxlen = 200
    self.max_cells = 100
    checkpoint_dir = 'deployed_checkpoints/'

    # instantiate Simon feature model
    config = Simon({}).load_config(MODEL_OBJECT, checkpoint_dir)
    self.encoder = config['encoder']
    Classifier = Simon(encoder=self.encoder)
    self.model = Classifier.generate_feature_model(
        self.maxlen, self.max_cells, len(self.encoder.categories),
        checkpoint_dir, config)

    # check if training data exists; the training data folder contains a
    # pickled dictionary linking account id to training data
    if len(os.listdir(training_data_dir)) == 0:
        return
    else:
        # initialize separate rrcf classifier object for each sequence in configuration file
        # self.classifiers = traverse_training_data(
        #     training_data_dir, self.model, self.encoder,
        #     maxlen=self.maxlen, max_cells=self.max_cells, checkpoint_dir=checkpoint_dir)
        # pickle parsed emails for testing
        # pickle.dump(self.classifiers, open("classifiers.pkl", "wb"))

        # load parsed emails
        self.classifiers = pickle.load(open("classifiers.pkl", "rb"))
        for account, train in self.classifiers.items():
            # generate higher-d time features for training sets; only include
            # weekly time information if the span of the training set is long enough
            weekly_bool = check_timestamp_range(train[0])
            time_feature_list = np.array(
                [parse_time_features(t, weekly_bool) for t in train[0]])
            repeated_time_feature_list = np.repeat(
                time_feature_list,
                int(len(train[1][0]) / time_feature_list.shape[1]), axis=1)

            # concatenate with text features
            features = np.concatenate((repeated_time_feature_list, train[1]), axis=1)

            # train separate rrcf classifier given training data in each sequence
            start_time = time.time()
            tree_size = self.TREE_SIZE if features.shape[0] >= self.TREE_SIZE else features.shape[0]
            self.classifiers[account] = [robust_rcf(self.NUM_TREES, tree_size), weekly_bool]
            self.classifiers[account][0].fit_batch(features)
            logging.info(f"Time to train account {account} classifier: {time.time() - start_time}")

            # record max anomaly score from training set -> generate threshold for prediction
            # TODO: tune anomaly threshold
            threshold = ANOMALY_THRESHOLD * \
                self.classifiers[account][0].batch_anomaly_scores().values.max()
            self.classifiers[account].append(threshold)

    self.model._make_predict_function()
    logging.info(f'Completed initialization of distributed, streaming anomaly detection server. '
                 f'Total time = {(time.time() - begin_time) / 60} mins')
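# Toy demonstration of the time-feature tiling above (shapes are assumptions:
# 3 emails, 4 time features each, 128-d Simon text features each). np.repeat
# widens the time block so it is not swamped by the much wider text block
# when the two are concatenated.
import numpy as np

time_features = np.zeros((3, 4))
text_features = np.ones((3, 128))
repeated = np.repeat(time_features,
                     text_features.shape[1] // time_features.shape[1], axis=1)
features = np.concatenate((repeated, text_features), axis=1)
print(features.shape)  # (3, 256)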
def _produce_annotations(self, inputs: Inputs) -> Outputs:
    """ generates dataframe with semantic type classifications and
    classification probabilities for each column of original dataframe

    Arguments:
        inputs {Inputs} -- D3M dataframe

    Returns:
        Outputs -- dataframe with two columns: "semantic type classifications"
            and "probabilities". Each row represents a column in the original
            dataframe. The column "semantic type classifications" contains a
            list of all semantic type labels and the column "probabilities"
            contains a list of the model's confidence in assigning each
            respective semantic type label
    """
    # load model checkpoint
    checkpoint_dir = (self._volumes["simon_models_1"] +
                      "/simon_models_1/pretrained_models/")
    if self.hyperparams["statistical_classification"]:
        execution_config = "Base.pkl"
        category_list = "/Categories.txt"
    else:
        execution_config = "Base_stat_geo.pkl"
        category_list = "/Categories_base_stat_geo.txt"
    with open(self._volumes["simon_models_1"] + "/simon_models_1" +
              category_list, "r") as f:
        Categories = f.read().splitlines()

    # create model object
    Classifier = Simon(encoder={})
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config["encoder"]
    checkpoint = config["checkpoint"]
    model = Classifier.generate_model(
        20, self.hyperparams["max_rows"], len(Categories))
    Classifier.load_weights(checkpoint, None, model, checkpoint_dir)
    model.compile(loss="binary_crossentropy",
                  optimizer="adam", metrics=["binary_accuracy"])

    # prepare data and make predictions
    frame = inputs.copy()
    prepped_data = encoder.encodeDataFrame(frame)
    preds = model.predict_on_batch(tf.constant(prepped_data))
    logger.debug('------------Reverse label encoding------------')
    decoded_preds = encoder.reverse_label_encode(
        preds, self.hyperparams["p_threshold"])

    # apply statistical / ordinal classification if desired
    if self.hyperparams["statistical_classification"]:
        logger.debug("Beginning Guessing categorical/ordinal classifications...")
        raw_data = frame.values
        guesses = [
            guess(raw_data[:, i], for_types="category")
            for i in np.arange(raw_data.shape[1])
        ]
        # probability of rule-based statistical / ordinal classifications =
        # min probability of existing classifications
        for i, g in enumerate(guesses):
            if g[0] == "category":
                if len(decoded_preds[1][i]) == 0:
                    guess_prob = self.hyperparams['p_threshold']
                else:
                    guess_prob = min(decoded_preds[1][i])
                decoded_preds[0][i] += ("categorical",)
                decoded_preds[1][i].append(guess_prob)
                # check the labels (decoded_preds[0]), not the probability
                # list, when deciding whether the column is also ordinal
                if (("int" in decoded_preds[0][i])
                        or ("float" in decoded_preds[0][i])
                        or ("datetime" in decoded_preds[0][i])):
                    decoded_preds[0][i] += ("ordinal",)
                    decoded_preds[1][i].append(guess_prob)
        logger.debug("Done with statistical variable guessing")

    # clear tf session, remove unnecessary files
    Classifier.clear_session()
    os.remove('unencoded_chars.json')

    out_df = pd.DataFrame.from_records(list(decoded_preds)).T
    out_df.columns = ["semantic types", "probabilities"]
    return out_df
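# Toy illustration (fabricated values, not real model output) of how the
# reverse-label-encoded predictions become the two-column annotation frame
# returned above:
import pandas as pd

decoded_preds = (
    [('text',), ('int', 'ordinal')],   # semantic type labels per column
    [[0.92], [0.88, 0.88]],            # matching confidences
)
out_df = pd.DataFrame.from_records(list(decoded_preds)).T
out_df.columns = ["semantic types", "probabilities"]
print(out_df)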
# print(header)
raw_data = np.column_stack((SpamEmails, EnronEmails)).T
print("DEBUG::raw_data:")
print(raw_data)

encoder.process(raw_data, max_cells)

X, y = encoder.encode_data(raw_data, header, maxlen)
print("DEBUG::X")
print(X)
print("DEBUG::y")
print(y)

Classifier = Simon(encoder=encoder)
data = Classifier.setup_test_sets(X, y)

category_count = y.shape[1]
model = Classifier.generate_model(maxlen, max_cells, category_count)
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['binary_accuracy'])

start = time.time()
history = Classifier.train_model(batch_size, checkpoint_dir, model, nb_epoch, data)
end = time.time()
print("Time for training is %f sec" % (end - start))

config = {
    'encoder': encoder,
    'checkpoint': Classifier.get_best_checkpoint(checkpoint_dir)
}
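# Likely continuation, mirroring the training main() earlier in this section
# (whether the original script persists the config at exactly this point is an
# assumption): save the updated encoder/checkpoint pair so the transfer-learned
# spam model can be reloaded later.
Classifier.save_config(config, checkpoint_dir)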