Example #1
def preprocess_data(data,
                    use_loaded=True,
                    file_emb="./data/glove.840B.300d.txt",
                    max_num_words=50000,
                    max_len_seq=35,
                    emb_dim=300):
    # Preprocess the raw data. `dir_processed` and `clean_text` are expected to
    # be defined at module level.
    file_processed_data = dir_processed + "data_processed.pkl"
    file_tokenizer = dir_processed + "tokenizer.pkl"
    file_label_index = dir_processed + "label_index.npy"
    if use_loaded:
        with open(file_processed_data, "rb") as fin:
            X, y, emb = pickle.load(fin)
        with open(file_tokenizer, "r", encoding="utf-8") as fin:
            tokenizer = tokenizer_from_json(fin.read())
        label_encoder = LabelEncoder()
        label_encoder.classes_ = np.load(file_label_index)
        return X, y, emb, tokenizer, label_encoder

    cleaned_text = data["text"].apply(clean_text).values
    tokenizer = Tokenizer(num_words=max_num_words,
                          oov_token='oov_token_placeholder')
    tokenizer.fit_on_texts(list(cleaned_text))
    tokenizer_json = tokenizer.to_json(ensure_ascii=False)
    with open(file_tokenizer, 'w', encoding='utf-8') as fout:
        fout.write(tokenizer_json)

    sequences = tokenizer.texts_to_sequences(cleaned_text)
    X = pad_sequences(sequences, maxlen=max_len_seq)
    word_index = tokenizer.word_index
    num_words = len(word_index)
    print('Found %s Words' % num_words)

    print(set(data["label"].values))
    label_encoder = LabelEncoder().fit(data["label"].values)
    np.save(file_label_index, label_encoder.classes_)
    print('Found %s Classes' % len(label_encoder.classes_))
    y = label_encoder.transform(data["label"].values)

    print('Loading Word Embeddings...')
    emb = (np.random.rand(min(num_words + 1, max_num_words), emb_dim) -
           0.5) * 0.1  # +1 because idx 0 is not used
    with open(file_emb, 'r', encoding='utf-8') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if word in word_index and word_index[word] < max_num_words:
                emb[word_index[word]] = np.asarray(tokens[1:], dtype='float32')

    with open(file_processed_data, "wb") as fout:
        pickle.dump((X, y, emb), fout)
    return X, y, emb, tokenizer, label_encoder
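
A minimal usage sketch for preprocess_data, keeping the assumptions the snippet already makes (a module-level dir_processed directory, a clean_text helper, and the GloVe file at the default path); the toy DataFrame below is purely illustrative.

import pandas as pd

# Hypothetical toy input; the function expects "text" and "label" columns.
data = pd.DataFrame({
    "text": ["the team won the final match", "parliament passed the new budget"],
    "label": ["sports", "politics"],
})

X, y, emb, tokenizer, label_encoder = preprocess_data(data, use_loaded=False)
print(X.shape, y.shape, emb.shape)          # padded sequences, labels, embedding matrix
print(label_encoder.inverse_transform(y))   # back to the original label strings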
Example #2
class ArticleThemeTokenizer:
    '''
    Wraps a Keras Tokenizer fitted on article themes. `orderedThemes` lists the
    themes in the same order as the tokenizer's vocabulary, which is also the
    order of the theme indices in a prediction vector.
    '''
    orderedThemes: List[str]
    themes_count: int
    tokenizer: Tokenizer

    def __init__(self, articles: Articles):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.themes())

        self.one_hot_matrix = self.tokenizer.texts_to_matrix(articles.themes())

        # Remove the first column, which contains only 0s (index 0 is reserved
        # by the tokenizer).
        self.one_hot_matrix = np.delete(arr=self.one_hot_matrix, obj=0, axis=1)

        # Build an ordered list of themes following the tokenizer's indexing
        # (word_index starts at 1; index 0 is reserved).
        self.orderedThemes: List[str] = []
        for i in range(1, len(self.tokenizer.word_index) + 1):
            self.orderedThemes.append(self.tokenizer.index_word[i])

        self.themes_count = len(self.tokenizer.word_index)

    def index_of_theme(self, theme: str):
        return self.tokenizer.word_index[theme] - 1

    def theme_at_index(self, index: int):
        return self.tokenizer.index_word[index + 1]

    def boolean_vector_to_themes(self,
                                 prediction_vector: List[bool]) -> List[str]:

        themes: List[str] = []

        for idx, flag in enumerate(prediction_vector):
            if flag:
                # +1 because the first index (0) is reserved by default.
                themes.append(self.tokenizer.index_word[idx + 1])

        return themes

    def save(self, path: str):
        tokenizer_json = self.tokenizer.to_json()
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(tokenizer_json)
class ArticleTextTokenizer:

    voc_size: int
    document_count: int
    sequences: List[List[Optional[Any]]]

    def __init__(self, articles: Articles, max_article_length: int):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.title_and_summary())
        self.max_article_length: int = max_article_length

        self.sequences = self.transform_to_sequences(articles)
        self.voc_size = len(
            self.tokenizer.word_index) + 1  # +1 because we pad with 0.
        self.document_count = self.tokenizer.document_count

    def transform_to_sequences(
            self,
            preprocessed_articles: Articles) -> List[List[Optional[Any]]]:
        """Transform articles content to a padded vector of length "max_article_length"."""
        matrix = self.tokenizer.texts_to_sequences(
            preprocessed_articles.title_and_summary())
        matrix = keras.preprocessing.sequence.pad_sequences(
            matrix, value=0, padding='post', maxlen=self.max_article_length)
        return matrix

    def transform_to_sequence(self, preprocessed_article: Article):
        """Transform a article content to a padded vector of length "max_article_length"."""
        vector = self.tokenizer.texts_to_sequences(
            [preprocessed_article.title_and_summary()])
        vector = keras.preprocessing.sequence.pad_sequences(
            vector, value=0, padding='post', maxlen=self.max_article_length)
        return vector

    def save(self, path: str):
        tokenizer_json = self.tokenizer.to_json()
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(tokenizer_json)
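
Both tokenizer wrappers above expect an Articles collection exposing themes() and title_and_summary(). A rough usage sketch with a hypothetical stand-in for that collection (the real Articles class lives elsewhere in the project):

class StubArticles:
    """Hypothetical stand-in for the project's Articles collection."""

    def themes(self):
        return ["economy politics", "sports"]

    def title_and_summary(self):
        return ["markets rally on rate decision", "local team wins the cup"]


articles = StubArticles()

theme_tokenizer = ArticleThemeTokenizer(articles)
print(theme_tokenizer.orderedThemes)         # themes in tokenizer order
print(theme_tokenizer.one_hot_matrix.shape)  # (n_articles, n_themes)

text_tokenizer = ArticleTextTokenizer(articles, max_article_length=20)
print(text_tokenizer.sequences.shape)        # (n_articles, 20), zero-padded
print(text_tokenizer.voc_size)               # vocabulary size incl. the padding index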
class DataHandler:
    def __init__(self):
        self.formula_path = os.path.join(config.dataset_path, 'formulas')
        self.images_path = os.path.join(config.dataset_path, 'images')

        self.beg_token = '<BOS>'
        self.end_token = '<EOS>'
        self.unk_token = '<UNK>'
        self.tokenizer = None

        self.__fit_tokenizer()

    def __fit_tokenizer(self):
        if os.path.isfile(config.vocab_path):
            with open(config.vocab_path, 'r') as f:
                json_content = f.read()
                self.tokenizer = tokenizer_from_json(json_content)
        else:
            # Repeat the special tokens so they get high counts and therefore
            # low indices in the fitted vocabulary.
            tmp_doc = (self.beg_token + ' ' + self.end_token + ' ') * 100
            docs = [tmp_doc, self.__read_raw_formulas('train')]
            num_tokens = config.vocab_size - 3  # reserve BOS, EOS and UNK slots
            self.tokenizer = Tokenizer(num_words=num_tokens,
                                       filters='\t\n',
                                       lower=False,
                                       oov_token=self.unk_token)
            self.tokenizer.fit_on_texts(docs)
            with open(config.vocab_path, 'w+') as f:
                f.write(self.tokenizer.to_json())

    def get_path(self, mode):
        formulas_path = os.path.join(self.formula_path,
                                     '{}_formulas.txt'.format(mode))
        images_folder = os.path.join(self.images_path,
                                     'images_{}'.format(mode))
        return formulas_path, images_folder

    def __read_raw_formulas(self, mode, split=False):
        path = self.get_path(mode)[0]
        try:
            with open(path, 'r') as f:
                content = f.read()
                if split:
                    lines = content.split('\n')
                    if not lines[-1]:
                        lines = lines[:-1]
                    return lines
                return content
        except OSError:
            return [] if split else ''

    def pad_token(self):
        return self.tokenizer.word_index[self.end_token]

    def start_token(self):
        return self.tokenizer.word_index[self.beg_token]

    def read_formulas(self, mode):
        lines = self.__read_raw_formulas(mode, split=True)
        lines = ['{} {} {}'.format(self.beg_token, line, self.end_token)
                 for line in lines]
        result = self.tokenizer.texts_to_sequences(lines)
        return result

    def read_images(self, mode, index):
        dir_path = self.get_path(mode)[1]
        images_data = []
        for i in index:
            file_path = os.path.join(dir_path, str(i) + '.png')
            if os.path.isfile(file_path):
                image = imageio.imread(file_path)
                images_data.append(image)
        data = np.array(images_data)
        data = 255 - data  # invert pixel intensities (white background -> 0)
        return data

    def decode_formula(self, sequences):
        def normalize(formula):
            # Strip the leading <BOS> marker and everything from <EOS> onwards.
            start_idx, end_idx = 0, len(formula)
            if formula.startswith(self.beg_token + ' '):
                start_idx = len(self.beg_token) + 1
            try:
                end_idx = formula.index(self.end_token)
            except ValueError:
                pass
            return formula[start_idx:end_idx]

        sequences_list = sequences.tolist()
        formulas = self.tokenizer.sequences_to_texts(sequences_list)
        formulas = [normalize(formula) for formula in formulas]
        return formulas

    def plot_sample_sizes(self):
        lines = self.__read_raw_formulas('train', split=True)
        training_size = len(lines)
        lines += self.__read_raw_formulas('validation', split=True)
        validation_size = len(lines) - training_size
        print('Training set size: ', training_size)
        print('Validation set size: ', validation_size)

        # Character length of each raw formula line.
        sample_sizes = [len(line) for line in lines]

        # the histogram of the data
        n, bins, patches = plt.hist(sample_sizes,
                                    20,
                                    facecolor='g',
                                    alpha=0.75)
        plt.xlabel('length of formula')
        plt.ylabel('sample size')
        plt.title('Histogram of Length of formulas')
        plt.grid(True)
        plt.show()
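
DataHandler above is driven by a module-level config object (dataset_path, vocab_path, vocab_size) and expects <mode>_formulas.txt files plus images_<mode> folders under the dataset path. A rough call sequence, assuming such a config and dataset layout are already in place:

from keras.preprocessing.sequence import pad_sequences

handler = DataHandler()                      # reloads or fits the formula vocabulary
train_seqs = handler.read_formulas('train')  # token ids, wrapped in <BOS> ... <EOS>
images = handler.read_images('train', index=[0, 1, 2])

padded = pad_sequences(train_seqs, value=handler.pad_token(), padding='post')
print(handler.decode_formula(padded)[:3])    # back to formula strings, markers stripped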
Example #5
class NNDatasetGenerator(object):
    def __init__(self, seed: int = 123):

        self.seed = seed
        self.train_file = None
        self.test_file = None
        self.val_file = None

        self.text_col_idx = None
        self.label_col_idx = None
        self.tokenizer = None
        self.vocab = None
        self.vocab_size = None
        self.num_classes = None

        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.X_val = None
        self.y_val = None

        logging.basicConfig(
            format="%(asctime)s %(message)s",
            level=logging.DEBUG,
            datefmt="%Y-%m-%d %H:%M:%S",
        )

    def from_csv(
        self,
        train_file: str,
        test_file: str = None,
        val_file: str = None,
        val_size: float = 0.1,
        text_col_idx=0,
        label_col_idx=1,
        sep: str = ",",
        header=0,
        encoding: str = "utf8",
        preprocess_func=None,
        preprocess_ncore=2,
        ngram_range=(1, 3),
        max_features=20000,
        ds_max_seq=1000,
        ds_type="TensorDataset",
    ):

        logging.info("Starting Data Preparation...")
        start_time = time.time()

        self.train_file = train_file
        self.test_file = test_file
        self.val_file = val_file

        self.text_col_idx = text_col_idx
        self.label_col_idx = label_col_idx

        df = pd.read_csv(self.train_file,
                         sep=sep,
                         encoding=encoding,
                         header=header)
        if preprocess_func is not None:
            df[df.columns[self.text_col_idx]] = parallelApply(
                df[df.columns[self.text_col_idx]], preprocess_func,
                preprocess_ncore)
        X = df[df.columns[self.text_col_idx]].tolist()
        y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)

        del df
        logging.info("Adding 1-gram features".format(ngram_range[1]))
        self.tokenizer = Tokenizer(num_words=max_features,
                                   lower=False,
                                   filters="")
        self.tokenizer.fit_on_texts(X)
        self.X_train = self.tokenizer.texts_to_sequences(X)

        if ngram_range[1] > 1:
            logging.info("Adding N-gram features".format(ngram_range[1]))
            # Create set of unique n-gram from the training set.
            ngram_set = set()
            for input_list in self.X_train:
                for i in range(2, ngram_range[1] + 1):
                    set_of_ngram = self.create_ngram_set(input_list,
                                                         ngram_value=i)
                    ngram_set.update(set_of_ngram)

            # Dictionary mapping n-gram token to a unique integer.
            # Integer values are greater than max_features in order
            # to avoid collision with existing features.
            start_index = max_features + 1
            token_indice = {
                v: k + start_index
                for k, v in enumerate(ngram_set)
            }
            indice_token = {token_indice[k]: k for k in token_indice}

            # max_features is the highest integer that could be found in the dataset.
            max_features = np.max(list(indice_token.keys())) + 1

            # Augmenting input tokens with n-grams features
            self.X_train = self.add_ngram(self.X_train, token_indice,
                                          ngram_range[1])

        self.X_train = sequence.pad_sequences(self.X_train, maxlen=ds_max_seq)
        self.y_train = y

        self.vocab_size = max_features
        logging.info("Building final vocab...")
        vocab_wrd_idx = {idx for sent in self.X_train for idx in sent}
        self.vocab = {
            self.tokenizer.index_word[i]: i
            for i in vocab_wrd_idx if i in self.tokenizer.index_word
        }
        # self.strt = start_index
        # if ngram_range[1] > 1:
        #     self._start = start_index
        #     a = [str(indice_token[i]) for i in range(start_index, len(vocab_wrd_idx)) if i in indice_token[i]]
        self.num_classes = len(np.unique(self.y_train))
        del X, y
        gc.collect()

        if ds_type == "TensorDataset":
            train_ds = TensorDataset(
                torch.from_numpy(self.X_train).long(),
                torch.from_numpy(self.y_train).long(),
            )
        else:
            train_ds = NNDataset(self.X_train,
                                 self.y_train,
                                 max_seq=ds_max_seq)

        if self.test_file is not None:
            df = pd.read_csv(self.test_file,
                             sep=sep,
                             encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
            del df
            self.X_test = self.tokenizer.texts_to_sequences(X)
            if ngram_range[1] > 1:
                self.X_test = self.add_ngram(self.X_test, token_indice,
                                             ngram_range[1])
            self.X_test = sequence.pad_sequences(self.X_test,
                                                 maxlen=ds_max_seq)
            self.y_test = y
            del X, y
            gc.collect()
            if ds_type == "TensorDataset":
                test_ds = TensorDataset(
                    torch.from_numpy(self.X_test).long(),
                    torch.from_numpy(self.y_test).long(),
                )
            else:
                test_ds = NNDataset(self.X_test,
                                    self.y_test,
                                    max_seq=ds_max_seq)

        if self.val_file is not None:
            df = pd.read_csv(self.val_file, sep=sep, encoding=encoding,
                             header=header)
            if preprocess_func is not None:
                df[df.columns[self.text_col_idx]] = parallelApply(
                    df[df.columns[self.text_col_idx]], preprocess_func,
                    preprocess_ncore)
            X = df[df.columns[self.text_col_idx]].tolist()
            y = df[df.columns[self.label_col_idx]].to_numpy(dtype=int)
            del df
            self.X_val = self.tokenizer.texts_to_sequences(X)
            if ngram_range[1] > 1:
                self.X_val = self.add_ngram(self.X_val, token_indice,
                                            ngram_range[1])
            self.X_val = sequence.pad_sequences(self.X_val, maxlen=ds_max_seq)
            self.y_val = y
            del X, y
            gc.collect()

            if ds_type == "TensorDataset":
                val_ds = TensorDataset(
                    torch.from_numpy(self.X_val).long(),
                    torch.from_numpy(self.y_val).long(),
                )
            else:
                val_ds = NNDataset(self.X_val, self.y_val, max_seq=ds_max_seq)

        logging.info("Data Preparation Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        if self.val_file is not None:
            if self.test_file is not None:
                return train_ds, test_ds, val_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds

    def create_ngram_set(self, input_list, ngram_value=2):
        return set(zip(*[input_list[i:] for i in range(ngram_value)]))

    def add_ngram(self, sequences, token_indice, ngram_range=2):
        new_sequences = []
        for input_list in sequences:
            new_list = input_list[:]
            for ngram_value in range(2, ngram_range + 1):
                for i in range(len(new_list) - ngram_value + 1):
                    ngram = tuple(new_list[i:i + ngram_value])
                    if ngram in token_indice:
                        new_list.append(token_indice[ngram])
            new_sequences.append(new_list)

        return new_sequences

    def to_numpy(self, dest_folder: str):

        logging.info("Starting Data Export...")
        start_time = time.time()

        if self.tokenizer is not None:
            # Tokenizer.to_json() returns a JSON string; write it out explicitly.
            with open(os.path.join(dest_folder, "tokenizer.json"), "w", encoding="utf-8") as f:
                f.write(self.tokenizer.to_json())

        if self.X_train is not None:
            np.savez_compressed(
                os.path.join(dest_folder, "train_data_nn.npz"),
                X=self.X_train,
                y=self.y_train,
            )

        if self.X_test is not None:
            np.savez_compressed(
                os.path.join(dest_folder, "test_data_nn.npz"),
                X=self.X_test,
                y=self.y_test,
            )

        if self.X_val is not None:
            np.savez_compressed(os.path.join(dest_folder, "val_data_nn.npz"),
                                X=self.X_val,
                                y=self.y_val)

        logging.info("Data Export Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

    def from_numpy(
        self,
        train_data_file: str,
        test_data_file: str = None,
        val_data_file: str = None,
        ds_type="TensorDataset",
    ):

        logging.info("Starting Data Preparation...")
        start_time = time.time()

        # Reload the tokenizer exported by to_numpy(); assumes tokenizer.json
        # sits next to the training data file.
        tokenizer_path = os.path.join(os.path.dirname(train_data_file), "tokenizer.json")
        with open(tokenizer_path, "r", encoding="utf-8") as f:
            self.tokenizer = text.tokenizer_from_json(f.read())

        train_npz = np.load(train_data_file, allow_pickle=True)
        self.X_train = train_npz["X"].item()
        self.y_train = train_npz["y"]

        self.num_classes = len(np.unique(self.y_train))
        self.vocab_size = np.shape(self.X_train)[1]

        train_ds = CSRDataset(self.X_train, self.y_train)

        if test_data_file is not None:
            test_npz = np.load(test_data_file, allow_pickle=True)
            self.X_test = test_npz["X"].item()
            self.y_test = test_npz["y"]

            test_ds = CSRDataset(self.X_test, self.y_test)

        if val_data_file is not None:
            val_npz = np.load(val_data_file, allow_pickle=True)
            self.X_val = val_npz["X"].item()
            self.y_val = val_npz["y"]

            val_ds = CSRDataset(self.X_val, self.y_val)

        logging.info("Data Import Completed - Time elapsed: " +
                     get_elapsed_time(start_time))

        if val_data_file is not None:
            if test_data_file is not None:
                return train_ds, val_ds, test_ds
            else:
                return train_ds, val_ds
        else:
            return train_ds
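
A rough end-to-end sketch for NNDatasetGenerator, assuming a CSV with the text in column 0 and an integer label in column 1 (file and folder names are placeholders). The last few lines illustrate the n-gram augmentation used inside from_csv on a toy sequence.

from torch.utils.data import DataLoader

gen = NNDatasetGenerator(seed=42)
train_ds, val_ds = gen.from_csv(
    train_file="train.csv",          # placeholder paths
    val_file="val.csv",
    ngram_range=(1, 2),              # add bigram features on top of unigrams
    max_features=20000,
    ds_max_seq=400,
)
gen.to_numpy("./exports")            # folder must exist; writes tokenizer.json and *_data_nn.npz
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)

# The n-gram augmentation on a toy sequence:
seq = [[1, 2, 3, 4]]
bigrams = gen.create_ngram_set(seq[0], ngram_value=2)   # {(1, 2), (2, 3), (3, 4)}
token_indice = {ng: 20001 + i for i, ng in enumerate(sorted(bigrams))}
print(gen.add_ngram(seq, token_indice, ngram_range=2))  # [[1, 2, 3, 4, 20001, 20002, 20003]]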