Example #1
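The snippets on this page are methods and functions lifted from a larger module, so several names (MODEL_OBJECT, ANOMALY_THRESHOLD, and helpers such as parse_emails_simon, robust_rcf, check_timestamp_range, and parse_time_features) are defined elsewhere in the project. A rough sketch of the module-level preamble they assume (the concrete values below are assumptions, not from the original):

import json
import logging
import os
import pickle
import re
import time

import nltk  # nltk.download('punkt') is needed once for the sentence tokenizer
import numpy as np
import pandas as pd

from Simon import Simon

MODEL_OBJECT = 'text-class.10-0.42.pkl'  # assumed: the checkpoint name used in Example #1
ANOMALY_THRESHOLD = 1.0  # assumed placeholder; the project treats this as tunable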
    def __init__(self):

        #logging.info('Beginning initialization of distributed, streaming anomaly detection server')
        begin_time = time.time()

        # rrcf classifier parameters
        # TODO: tune forest parameters
        self.TREE_SIZE = 50
        self.NUM_TREES = 100
        training_data_dir = 'training_data'

        # Simon model parameters
        self.maxlen = 200
        self.max_cells = 100
        checkpoint_dir = 'deployed_checkpoints/'

        # instantiate Simon feature model
        config = Simon({}).load_config(MODEL_OBJECT, checkpoint_dir)
        self.encoder = config['encoder']
        Classifier = Simon(encoder=self.encoder)
        self.model = Classifier.generate_feature_model(self.maxlen, self.max_cells, len(self.encoder.categories), checkpoint_dir, config)
        # freeze the predict function so the model can be called safely from multiple threads
        self.model._make_predict_function()

        # dictionary to store separate models by account
        self.classifiers = {}
def traverse_files_simon(datapath):
    logging.debug('Parsing historical emails as text from raw json files...')
    accounts_to_emails = {}
    accounts_to_times = {}
    maxlen = 200
    max_cells = 100
    for path, _, files in os.walk(datapath):
        for file in files:
            if re.match(r".*\.jsonl$", file):
                fullpath = os.path.join(path, file)
                df, accounts_to_times = parse_emails_simon(accounts_to_times,
                                                           datapath=fullpath)
                raw_data = np.asarray(df.iloc[:max_cells, :])
                raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

                # produce Simon feature vector
                print(f'producing Simon feature vectors for {fullpath}')
                checkpoint_dir = "../../NK-email-classifier/deployed_checkpoints/"
                config = Simon({}).load_config('text-class.10-0.42.pkl',
                                               checkpoint_dir)
                encoder = config['encoder']
                # encode the raw text into the fixed-size integer tensor the model expects
                X = encoder.x_encode(raw_data, maxlen)
                Classifier = Simon(encoder=encoder)
                model = Classifier.generate_feature_model(
                    maxlen, max_cells, len(encoder.categories), checkpoint_dir,
                    config)
                accounts_to_emails[file] = model.predict(X)
    return accounts_to_emails, accounts_to_times
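A minimal, hypothetical invocation of this helper (the directory name and output path are assumptions):

accounts_to_emails, accounts_to_times = traverse_files_simon('training_data')

# cache the per-account feature vectors so the Simon model need not be re-run
with open('accounts_to_emails.pkl', 'wb') as f:
    pickle.dump(accounts_to_emails, f)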
Example #3
def main(datapath, email_index, execution_config, DEBUG):

    # set important parameters
    maxlen = 20
    max_cells = 500
    checkpoint_dir = "pretrained_models/"
    with open(checkpoint_dir + 'Categories_base.txt', 'r') as f:
        Categories = f.read().splitlines()
    category_count = len(Categories)

    # load specified execution configuration
    if execution_config is None:
        raise TypeError('execution_config must be specified')
    Classifier = Simon(encoder={})  # dummy text classifier
    config = Classifier.load_config(execution_config, checkpoint_dir)
    encoder = config['encoder']
    intermediate_model = Classifier.generate_feature_model(maxlen,
                                                           max_cells,
                                                           category_count,
                                                           checkpoint_dir,
                                                           config,
                                                           DEBUG=DEBUG)

    # load sample email
    with open(datapath) as data_file:
        emails = data_file.readlines()
    sample_email = json.loads(emails[int(email_index)])['body']
    if DEBUG:
        print('DEBUG::sample email:')
        print(sample_email)
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sample_email_sentence = tokenizer.tokenize(sample_email)
    # truncate each sentence to its last maxlen characters
    sample_email_sentence = [elem[-maxlen:] for elem in sample_email_sentence]
    all_email_df = pd.DataFrame(sample_email_sentence, columns=['Email 0'])
    if DEBUG:
        print('DEBUG::the final shape is:')
        print(all_email_df.shape)
    all_email_df = all_email_df.astype(str)
    raw_data = np.asarray(all_email_df.iloc[:max_cells, :])  # truncate to max_cells
    raw_data = np.char.lower(np.transpose(raw_data).astype('U'))

    # encode data
    X = encoder.x_encode(raw_data, maxlen)

    # generate features for email
    y = intermediate_model.predict(X)
    # discard empty column edge case
    y[np.all(all_email_df.isnull(), axis=0)] = 0

    # print and return result
    print('\n128-d Simon Feature Vector:\n')
    print(y[0])
    return y[0]
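A hypothetical command-line wrapper for main() above (the flag names and default config are assumptions, not part of the original script):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Print the 128-d Simon feature vector for one email')
    parser.add_argument('datapath', help='path to a .jsonl file of emails')
    parser.add_argument('email_index', help='index of the email to featurize')
    parser.add_argument('--config', dest='execution_config',
                        default='text-class.10-0.42.pkl',
                        help='pickled Simon execution configuration')
    parser.add_argument('--debug', dest='DEBUG', action='store_true')
    args = parser.parse_args()
    main(args.datapath, args.email_index, args.execution_config, args.DEBUG)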
    def __init__(self):

        logging.info('Beginning initialization of distributed, streaming anomaly detection server')
        begin_time = time.time()

        # rrcf classifier parameters
        # TODO: tune forest parameters
        self.TREE_SIZE = 50
        self.NUM_TREES = 100
        training_data_dir = 'training_data'

        # Simon model parameters
        self.maxlen = 200
        self.max_cells = 100
        checkpoint_dir = 'deployed_checkpoints/'

        # instantiate Simon feature model
        config = Simon({}).load_config(MODEL_OBJECT, checkpoint_dir)
        self.encoder = config['encoder']
        Classifier = Simon(encoder=self.encoder)
        self.model = Classifier.generate_feature_model(self.maxlen, self.max_cells, len(self.encoder.categories), checkpoint_dir, config)
        # freeze the predict function so the model can be called safely from multiple threads
        self.model._make_predict_function()

        # dictionary to store separate models by account
        self.classifiers = {}

        # check if training data exists
        if len(os.listdir(training_data_dir)) == 0:
            return
        
        # training data folder contains pickled dictionary linking account id to training data
        else:
            # initialize separate rrcf classifier object for each sequence in configuration file
            #self.classifiers = traverse_training_data(training_data_dir, self.model, self.encoder,
            #                maxlen=self.maxlen, max_cells=self.max_cells, checkpoint_dir=checkpoint_dir)

            # pickle parsed emails for testing
            #pickle.dump( self.classifiers, open( "classifiers.pkl", "wb" ) )

            # load parsed emails
            with open("classifiers.pkl", "rb") as f:
                self.classifiers = pickle.load(f)

            for account, train in self.classifiers.items():

                # generate higher d time features for training sets
                # only include weekly time information if span of training set is longer
                weekly_bool = check_timestamp_range(train[0])
                time_feature_list = np.array([parse_time_features(t, weekly_bool) for t in train[0]])
                repeated_time_feature_list = np.repeat(time_feature_list, int(len(train[1][0]) / time_feature_list.shape[1]), axis=1)

                # concatenate with text features
                features = np.concatenate((repeated_time_feature_list, train[1]), axis=1)

                # train separate rrcf classifier given training data in each sequence 
                start_time = time.time()
                tree_size = min(self.TREE_SIZE, features.shape[0])
                self.classifiers[account] = [robust_rcf(self.NUM_TREES, tree_size), weekly_bool]
                self.classifiers[account][0].fit_batch(features)
                logging.info(f"Time to train account {account} classifier: {time.time()-start_time}")

                # record max anomaly score from training set -> generate threshold for prediction
                ## TODO: tune anomaly threshold
                threshold = ANOMALY_THRESHOLD * self.classifiers[account][0].batch_anomaly_scores().values.max()
                self.classifiers[account].append(threshold)
        
        logging.info(f'Completed initialization of distributed, streaming anomaly detection server. Total time = {(time.time() - begin_time) / 60:.2f} mins')
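At prediction time, an incoming email would presumably be featurized the same way and compared against the stored per-account threshold. A rough sketch of such a method, assuming a hypothetical single-point score_point call on robust_rcf (the project's actual streaming-scoring API is not shown here):

    def is_anomalous(self, account, timestamp, text_features):
        """Hypothetical helper mirroring the training-time feature construction."""
        clf, weekly_bool, threshold = self.classifiers[account]

        # build time features with the same weekly flag chosen at training time
        time_features = np.array(parse_time_features(timestamp, weekly_bool))

        # widen the time block to match the text-feature width, as in training
        repeated = np.repeat(time_features,
                             int(len(text_features) / time_features.shape[0]))
        point = np.concatenate((repeated, text_features))

        # score_point is an assumed method; any score above the training-set
        # threshold is flagged as anomalous
        return clf.score_point(point) > threshold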