def preprocess_data(data,
                    use_loaded=True,
                    file_emb="./data/glove.840B.300d.txt",
                    max_num_words=50000,
                    max_len_seq=35,
                    emb_dim=300):
    # preprocess data
    file_processed_data = dir_processed + "data_processed.pkl"
    file_tokenizer = dir_processed + "tokenizer.pkl"
    file_label_index = dir_processed + "label_index.npy"

    if use_loaded:
        X, y, emb = pickle.load(open(file_processed_data, "rb"))
        tokenizer = tokenizer_from_json(
            open(file_tokenizer, "r", encoding="utf-8").read())
        label_encoder = LabelEncoder()
        label_encoder.classes_ = np.load(file_label_index)
        return X, y, emb, tokenizer, label_encoder

    cleaned_text = data["text"].apply(clean_text).values
    tokenizer = Tokenizer(num_words=max_num_words, oov_token='oov_token_placeholder')
    tokenizer.fit_on_texts(list(cleaned_text))
    tokenizer_json = tokenizer.to_json(ensure_ascii=False)
    with open(file_tokenizer, 'w', encoding='utf-8') as fout:
        fout.write(tokenizer_json)

    sequences = tokenizer.texts_to_sequences(cleaned_text)
    X = pad_sequences(sequences, maxlen=max_len_seq)
    word_index = tokenizer.word_index
    num_words = len(word_index)
    print('Found %s Words' % num_words)
    print(set(data["label"].values))

    label_encoder = LabelEncoder().fit(data["label"].values)
    np.save(file_label_index, label_encoder.classes_)
    print('Found %s Classes' % len(label_encoder.classes_))
    y = label_encoder.transform(data["label"].values)

    print('Loading Word Embeddings...')
    emb = (np.random.rand(min(num_words + 1, max_num_words), emb_dim) - 0.5) * 0.1  # +1 because idx 0 is not used
    with open(file_emb, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            if tokens[0] in word_index and word_index[tokens[0]] < max_num_words:
                emb[word_index[tokens[0]]] = np.asarray(tokens[1:], dtype='float32')

    pickle.dump((X, y, emb), open(file_processed_data, "wb"))
    return X, y, emb, tokenizer, label_encoder
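# --- Usage sketch (not from the original source). preprocess_data() expects a
# pandas DataFrame with "text" and "label" columns and relies on the module-level
# dir_processed and clean_text seen above; the CSV path below is an illustrative
# placeholder, not a file from the original project.
# import pandas as pd
# data = pd.read_csv("./data/train.csv")
# X, y, emb, tokenizer, label_encoder = preprocess_data(data, use_loaded=False)
# print(X.shape, y.shape, emb.shape)   # e.g. (n_samples, 35), (n_samples,), (vocab, 300)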
class ArticleThemeTokenizer:
    '''
    Keeps the list of themes in the same order as in the tokenizer; that order
    also corresponds to the index of each theme in the prediction vector.
    '''
    orderedThemes: List[str]
    themes_count: int
    tokenizer: Tokenizer

    def __init__(self, articles: Articles):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.themes())

        self.one_hot_matrix = self.tokenizer.texts_to_matrix(articles.themes())
        # Remove the first column, which contains only 0s (index 0 is reserved).
        self.one_hot_matrix = np.delete(arr=self.one_hot_matrix, obj=0, axis=1)

        # Create an ordered list of themes, as in the tokenizer.
        self.orderedThemes: List[str] = []
        for i in range(1, len(self.tokenizer.word_index) + 1):  # word_index starts at 1, 0 is reserved.
            self.orderedThemes.append(self.tokenizer.index_word[i])

        self.themes_count = len(self.tokenizer.word_index)

    def index_of_theme(self, theme: str):
        return self.tokenizer.word_index[theme] - 1

    def theme_at_index(self, index: int):
        return self.tokenizer.index_word[index + 1]

    def boolean_vector_to_themes(self, prediction_vector: List[bool]) -> List[str]:
        themes: List[str] = []
        for idx in range(0, len(prediction_vector)):
            if prediction_vector[idx]:
                # +1 because the first index (0) is reserved by default.
                themes.append(self.tokenizer.index_word[idx + 1])
        return themes

    def save(self, path: str):
        tokenizer_json = self.tokenizer.to_json()
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(tokenizer_json)
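# --- Usage sketch (not from the original source). Tokenizer indices start at 1
# (index 0 is reserved), so the class shifts by -1/+1 between tokenizer indices
# and prediction-vector columns. _FakeArticles is a minimal stand-in for the real
# Articles type, added here only to keep the example self-contained.
class _FakeArticles:
    def themes(self):
        return ["sport politics", "politics", "economy sport"]


_theme_tokenizer = ArticleThemeTokenizer(_FakeArticles())
assert _theme_tokenizer.theme_at_index(_theme_tokenizer.index_of_theme("sport")) == "sport"
print(_theme_tokenizer.boolean_vector_to_themes([True, False, True]))  # ['sport', 'economy'] with the default ordering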
class ArticleTextTokenizer:
    voc_size: int
    document_count: int
    sequences: List[List[Optional[Any]]]

    def __init__(self, articles: Articles, max_article_length: int):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.title_and_summary())

        self.max_article_length: int = max_article_length
        self.sequences = self.transform_to_sequences(articles)

        self.voc_size = len(self.tokenizer.word_index) + 1  # +1 because we pad with 0.
        self.document_count = self.tokenizer.document_count

    def transform_to_sequences(self, preprocessed_articles: Articles) -> List[List[Optional[Any]]]:
        """Transform the articles' content to padded vectors of length "max_article_length"."""
        matrix = self.tokenizer.texts_to_sequences(preprocessed_articles.title_and_summary())
        matrix = keras.preprocessing.sequence.pad_sequences(matrix,
                                                            value=0,
                                                            padding='post',
                                                            maxlen=self.max_article_length)
        return matrix

    def transform_to_sequence(self, preprocessed_article: Article):
        """Transform an article's content to a padded vector of length "max_article_length"."""
        vector = self.tokenizer.texts_to_sequences([preprocessed_article.title_and_summary()])
        vector = keras.preprocessing.sequence.pad_sequences(vector,
                                                            value=0,
                                                            padding='post',
                                                            maxlen=self.max_article_length)
        return vector

    def save(self, path: str):
        tokenizer_json = self.tokenizer.to_json()
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(tokenizer_json)
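# --- Usage sketch (not from the original source). voc_size is len(word_index) + 1
# so that index 0 stays free for padding; a typical consumer is an Embedding layer
# sized from it. `articles` stands for a real Articles collection, and the layer
# sizes below are illustrative, not taken from the original project.
# text_tokenizer = ArticleTextTokenizer(articles, max_article_length=200)
# embedding = keras.layers.Embedding(input_dim=text_tokenizer.voc_size,
#                                    output_dim=128,
#                                    mask_zero=True)   # 0 is the padding value used above
# embedded = embedding(text_tokenizer.sequences)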
class DataHandler:
    def __init__(self):
        self.formula_path = os.path.join(config.dataset_path, 'formulas')
        self.images_path = os.path.join(config.dataset_path, 'images')
        self.beg_token = '<BOS>'
        self.end_token = '<EOS>'
        self.unk_token = '<UNK>'
        self.tokenizer = None
        self.__fit_tokenizer()

    def __fit_tokenizer(self):
        if os.path.isfile(config.vocab_path):
            with open(config.vocab_path, 'r') as f:
                json_content = f.read()
            self.tokenizer = tokenizer_from_json(json_content)
        else:
            # Repeat the special tokens so they survive the num_words frequency cut-off.
            tmp_doc = (self.beg_token + ' ' + self.end_token + ' ') * 100
            docs = [tmp_doc, self.__read_raw_formulas('train')]
            num_tokens = config.vocab_size - 3  # reserve room for the BOS, EOS and UNK tokens
            self.tokenizer = Tokenizer(num_words=num_tokens,
                                       filters='\t\n',
                                       lower=False,
                                       oov_token=self.unk_token)
            self.tokenizer.fit_on_texts(docs)
            with open(config.vocab_path, 'w+') as f:
                f.write(self.tokenizer.to_json())

    def get_path(self, mode):
        formulas_path = os.path.join(self.formula_path, '{}_formulas.txt'.format(mode))
        images_folder = os.path.join(self.images_path, 'images_{}'.format(mode))
        return formulas_path, images_folder

    def __read_raw_formulas(self, mode, split=False):
        path = self.get_path(mode)[0]
        try:
            with open(path, 'r') as f:
                content = f.read()
            if split:
                lines = content.split('\n')
                if not lines[-1]:
                    lines = lines[:-1]
                return lines
            return content
        except OSError:
            return [] if split else ''

    def pad_token(self):
        return self.tokenizer.word_index[self.end_token]

    def start_token(self):
        return self.tokenizer.word_index[self.beg_token]

    def read_formulas(self, mode):
        lines = self.__read_raw_formulas(mode, split=True)
        for i in range(len(lines)):
            lines[i] = '{} {} {}'.format(self.beg_token, lines[i], self.end_token)
        result = self.tokenizer.texts_to_sequences(lines)
        return result

    def read_images(self, mode, index):
        dir_path = self.get_path(mode)[1]
        images_data = []
        for i in index:
            file_path = os.path.join(dir_path, str(i) + '.png')
            if os.path.isfile(file_path):
                image = imageio.imread(file_path)
                images_data.append(image)
        data = np.array(images_data)
        data = 255 - data  # invert so formulas are bright on a dark background
        return data

    def decode_formula(self, sequences):
        def normalize(formula):
            start_idx, end_idx = 0, len(formula)
            if formula[:6] == '<BOS> ':
                start_idx = 6
            try:
                end_idx = formula.index(self.end_token)
            except ValueError:
                pass
            return formula[start_idx:end_idx]

        sequences_list = sequences.tolist()
        formulas = self.tokenizer.sequences_to_texts(sequences_list)
        formulas = [normalize(formula) for formula in formulas]
        return formulas

    def plot_sample_sizes(self):
        lines = self.__read_raw_formulas('train', split=True)
        training_size = len(lines)
        lines += self.__read_raw_formulas('validation', split=True)
        validation_size = len(lines) - training_size
        print('Training set size: ', training_size)
        print('Validation set size: ', validation_size)

        sample_sizes = []
        for l in lines:
            sample_sizes += [len(l)]

        # Histogram of formula lengths.
        n, bins, patches = plt.hist(sample_sizes, 20, facecolor='g', alpha=0.75)
        plt.xlabel('length of formula')
        plt.ylabel('sample size')
        plt.title('Histogram of Length of formulas')
        plt.grid(True)
        plt.show()
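# --- Usage sketch (not from the original source). Assumes a `config` module that
# provides dataset_path, vocab_path and vocab_size, plus the folder layout used
# above (formulas/train_formulas.txt, images/images_train/...). The `model` below
# is hypothetical; only the DataHandler calls come from the class above.
# handler = DataHandler()
# formulas = handler.read_formulas('train')       # <BOS> ... <EOS> wrapped, integer-encoded
# images = handler.read_images('train', index=range(16))
# predictions = model.predict(images)             # hypothetical seq2seq model output
# print(handler.decode_formula(np.argmax(predictions, axis=-1)))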
class NNDatasetGenerator(object):
    def __init__(self, seed: int = 123):
        self.seed = seed
        self.train_file = None
        self.test_file = None
        self.val_file = None
        self.text_col_idx = None
        self.label_col_idx = None
        self.tokenizer = None
        self.vocab = None
        self.vocab_size = None
        self.num_classes = None
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.X_val = None
        self.y_val = None
        logging.basicConfig(
            format="%(asctime)s %(message)s",
            level=logging.DEBUG,
            datefmt="%Y-%m-%d %H:%M:%S",
        )

    def from_csv(
        self,
        train_file: str,
        test_file: str = None,
        val_file: str = None,
        val_size: float = 0.1,
        text_col_idx=0,
        label_col_idx=1,
        sep: str = ",",
        header=0,
        encoding: str = "utf8",
        preprocess_func=None,
        preprocess_ncore=2,
        ngram_range=(1, 3),
        max_features=20000,
        ds_max_seq=1000,
        ds_type="TensorDataset",
    ):
        logging.info("Starting Data Preparation...")
        start_time = time.time()
        self.train_file = train_file
        self.test_file = test_file
        self.val_file = val_file
        self.text_col_idx = text_col_idx
        self.label_col_idx = label_col_idx

        df = pd.read_csv(self.train_file, sep=sep, encoding=encoding, header=header)
        if preprocess_func is not None:
            df[df.columns[self.text_col_idx]] = parallelApply(
                df[df.columns[self.text_col_idx]], preprocess_func, preprocess_ncore)
        X = df[df.columns[self.text_col_idx]].tolist()
        y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
        del df

        logging.info("Adding 1-gram features")
        self.tokenizer = Tokenizer(num_words=max_features, lower=False, filters="")
        self.tokenizer.fit_on_texts(X)
        self.X_train = self.tokenizer.texts_to_sequences(X)

        if ngram_range[1] > 1:
            logging.info("Adding {}-gram features".format(ngram_range[1]))
            # Create the set of unique n-grams from the training set.
            ngram_set = set()
            for input_list in self.X_train:
                for i in range(2, ngram_range[1] + 1):
                    set_of_ngram = self.create_ngram_set(input_list, ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Dictionary mapping each n-gram token to a unique integer.
            # Integer values are greater than max_features in order
            # to avoid collision with existing features.
            start_index = max_features + 1
            token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
            indice_token = {token_indice[k]: k for k in token_indice}

            # max_features becomes the highest integer that can be found in the dataset.
            max_features = np.max(list(indice_token.keys())) + 1

            # Augment the input tokens with n-gram features.
            self.X_train = self.add_ngram(self.X_train, token_indice, ngram_range[1])

        self.X_train = sequence.pad_sequences(self.X_train, maxlen=ds_max_seq)
        self.y_train = y
        self.vocab_size = max_features

        logging.info("Building final vocab...")
        vocab_wrd_idx = set()
        for sent in self.X_train:
            for idx in sent:
                vocab_wrd_idx.add(idx)
        self.vocab = {
            self.tokenizer.index_word[i]: i
            for i in vocab_wrd_idx if i in self.tokenizer.index_word
        }
        self.num_classes = len(np.unique(self.y_train))
        del X, y
        gc.collect()

        if ds_type == "TensorDataset":
            train_ds = TensorDataset(
                torch.from_numpy(self.X_train).long(),
                torch.from_numpy(self.y_train).long(),
            )
        else:
            train_ds = NNDataset(self.X_train, self.y_train, max_seq=ds_max_seq)

        if self.test_file is not None:
            df = pd.read_csv(self.test_file, sep=sep, encoding=encoding, header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func, preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
            del df
            self.X_test = self.tokenizer.texts_to_sequences(X)
            if ngram_range[1] > 1:
                self.X_test = self.add_ngram(self.X_test, token_indice, ngram_range[1])
            self.X_test = sequence.pad_sequences(self.X_test, maxlen=ds_max_seq)
            self.y_test = y
            del X, y
            gc.collect()
            if ds_type == "TensorDataset":
                test_ds = TensorDataset(
                    torch.from_numpy(self.X_test).long(),
                    torch.from_numpy(self.y_test).long(),
                )
            else:
                test_ds = NNDataset(self.X_test, self.y_test, max_seq=ds_max_seq)

        if self.val_file is not None:
            df = pd.read_csv(self.val_file, sep=sep, encoding=encoding, header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func, preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
            del df
            self.X_val = self.tokenizer.texts_to_sequences(X)
            if ngram_range[1] > 1:
                self.X_val = self.add_ngram(self.X_val, token_indice, ngram_range[1])
            self.X_val = sequence.pad_sequences(self.X_val, maxlen=ds_max_seq)
            self.y_val = y
            del X, y
            gc.collect()
            if ds_type == "TensorDataset":
                val_ds = TensorDataset(
                    torch.from_numpy(self.X_val).long(),
                    torch.from_numpy(self.y_val).long(),
                )
            else:
                val_ds = NNDataset(self.X_val, self.y_val, max_seq=ds_max_seq)

        logging.info("Data Preparation Completed - Time elapsed: " + get_elapsed_time(start_time))

        if self.val_file is not None:
            if self.test_file is not None:
                return train_ds, test_ds, val_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds

    def create_ngram_set(self, input_list, ngram_value=2):
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))

    def add_ngram(self, sequences, token_indice, ngram_range=2):
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(new_list) - ngram_value + 1):
                    ngram = tuple(new_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)
        return new_sequences

    def to_numpy(self, dest_folder: str):
        logging.info("Starting Data Export...")
        start_time = time.time()
        if self.tokenizer is not None:
            # Tokenizer.to_json() returns a JSON string; write it out explicitly.
            with open(os.path.join(dest_folder, "tokenizer.json"), "w", encoding="utf-8") as f:
                f.write(self.tokenizer.to_json())
        if self.X_train is not None:
            np.savez_compressed(
                os.path.join(dest_folder, "train_data_nn.npz"),
                X=self.X_train,
                y=self.y_train,
            )
        if self.X_test is not None:
            np.savez_compressed(
                os.path.join(dest_folder, "test_data_nn.npz"),
                X=self.X_test,
                y=self.y_test,
            )
        if self.X_val is not None:
            np.savez_compressed(
                os.path.join(dest_folder, "val_data_nn.npz"),
                X=self.X_val,
                y=self.y_val,
            )
        logging.info("Data Export Completed - Time elapsed: " + get_elapsed_time(start_time))

    def from_numpy(
        self,
        train_data_file: str,
        test_data_file: str = None,
        val_data_file: str = None,
        ds_type="TensorDataset",
    ):
        logging.info("Starting Data Preparation...")
        start_time = time.time()

        # tokenizer_from_json() needs the JSON produced by to_numpy(); look for it
        # next to the training data file (assumption: same folder, default name).
        tokenizer_path = os.path.join(os.path.dirname(train_data_file), "tokenizer.json")
        if os.path.isfile(tokenizer_path):
            with open(tokenizer_path, "r", encoding="utf-8") as f:
                self.tokenizer = text.tokenizer_from_json(f.read())

        train_npz = np.load(train_data_file, allow_pickle=True)
        self.X_train = train_npz["X"].item()  # X is stored as a single pickled object inside the npz
        self.y_train = train_npz["y"]
        self.num_classes = len(np.unique(self.y_train))
        self.vocab_size = np.shape(self.X_train)[1]
        train_ds = CSRDataset(self.X_train, self.y_train)

        if test_data_file is not None:
            test_npz = np.load(test_data_file, allow_pickle=True)
            self.X_test = test_npz["X"].item()
            self.y_test = test_npz["y"]
            test_ds = CSRDataset(self.X_test, self.y_test)

        if val_data_file is not None:
            val_npz = np.load(val_data_file, allow_pickle=True)
            self.X_val = val_npz["X"].item()
            self.y_val = val_npz["y"]
            val_ds = CSRDataset(self.X_val, self.y_val)

        logging.info("Data Import Completed - Time elapsed: " + get_elapsed_time(start_time))

        if val_data_file is not None:
            if test_data_file is not None:
                return train_ds, val_ds, test_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds
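# --- Usage sketch (not from the original source). Assumes CSV files with the text
# in column 0 and an integer class label in column 1; the file names below are
# illustrative. Shows the from_csv() -> DataLoader path with the default
# TensorDataset output and optional caching via to_numpy().
# from torch.utils.data import DataLoader
# generator = NNDatasetGenerator(seed=42)
# train_ds, test_ds, val_ds = generator.from_csv(
#     "train.csv", test_file="test.csv", val_file="val.csv",
#     ngram_range=(1, 2), max_features=20000, ds_max_seq=400,
# )
# train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
# generator.to_numpy("./cache")   # optionally persist the padded arrays and tokenizer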