def load_log_files(dirs):
    """Load log files from the given directories and build an MLP dataset.

    :param dirs: list of directories from where to load log files
    :return: tuple ``(data_set, labels, names, paths)`` — the min-max scaled
        feature matrix, per-file class labels (1-based directory index),
        the merged list of all log file names, and the source directory of
        each file
    """
    data_set = list()
    labels = list()
    names = list()
    paths = list()
    for dir_index, directory in enumerate(dirs):
        files = os.listdir(directory)
        names.append(files)
        # remember the source directory once per file it contains
        paths.extend([directory] * len(files))
        for file in files:
            # 'with' guarantees the handle is closed (the original leaked it)
            with open(os.path.join(directory, file), 'r') as log_file:
                feature_vector = list()  # feature vector for an entire log file
                for line in log_file:
                    line_vector = hashing_trick(text=line, n=100000,
                                                hash_function=None,
                                                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                                lower=True, split=' ')
                    feature_vector.extend(float(element) for element in line_vector)
            data_set.append(feature_vector)
            labels.append(dir_index + 1)  # label = 1-based directory index
    # Make shape of dataset uniform for NN: truncate every vector to the
    # shortest one so np.array yields a rectangular matrix.
    min_len = min((len(vector) for vector in data_set), default=BIG_NUMBER)
    data_set = [vector[:min_len] for vector in data_set]
    # use min_max scaling
    scaler = MinMaxScaler()
    data_set = scaler.fit_transform(np.array(data_set))
    names = merge_splits(names)
    return data_set, np.array(labels), names, paths
def prep_1(text):
    """Demonstrate three Keras text-encoding schemes on *text*.

    The original body overwrote ``text`` with a hard-coded sentence,
    silently ignoring the caller's input; the parameter is now used.

    :param text: document to encode
    :return: the one-hot (hash-based) integer encoding of the document
    """
    list_unique_words = list(set(text_to_word_sequence(text)))
    print(f"docs: {list_unique_words[:100]}")
    vocab_size = len(list_unique_words)
    print(f"vocab_size: {vocab_size}")
    # ~30% headroom over the vocabulary size reduces hash collisions
    oh_encoding = one_hot(text, n=round(vocab_size * 1.3))
    print(f"oh_encoding: {oh_encoding}")
    hashed_doc = hashing_trick(text, n=round(vocab_size * 1.3), hash_function='md5')
    print(f"hashed_doc: {hashed_doc}")
    return oh_encoding
def test_hashing_trick_md5():
    # Six words hashed with md5 into a space of size 5 -> indices in [1, 4].
    encoded = text.hashing_trick('The cat sat on the mat.', 5, hash_function='md5')
    assert len(encoded) == 6
    assert np.min(encoded) >= 1
    assert np.max(encoded) <= 4
def test_hashing_trick_hash():
    # Same check with the default hash function: six tokens, indices in [1, 4].
    encoded = text.hashing_trick('The cat sat on the mat.', 5)
    assert len(encoded) == 6
    assert np.min(encoded) >= 1
    assert np.max(encoded) <= 4
def _transform(X: pd.Series) -> List[List[int]]:
    """Hash-encode each document of *X* into integer token indices.

    NOTE(review): the body reads ``self._hash_slots`` but the signature has
    no ``self`` — presumably the enclosing method/closure context is outside
    this view; confirm against the surrounding class.

    :param X: series of raw text documents
    :return: one list of hashed token indices per (lower-cased) document
    """
    # enumerate() was unused in the original; iterate the values directly
    return [hashing_trick(doc, n=self._hash_slots) for doc in X.str.lower()]