# NOTE: framework imports added for completeness; load_dataset, preprocess_dataset,
# build_vocabulary, load_fasttext, filter_embeddings, CNNModel and InferenceAPI
# are assumed to be project-local helpers.
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences


def main():
    # Hyperparameters and paths.
    batch_size = 128
    epochs = 100
    maxlen = 300
    model_path = "cnn_model.h5"
    num_words = 40000
    num_label = 2

    # Load and clean the raw review data.
    x, y = load_dataset("data/amazon_reviews_multilingual_JP_v1_00.tsv")
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Build the vocabulary on the training split only, then vectorize and pad.
    vocab = build_vocabulary(x_train, num_words)
    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

    # Restrict the pretrained fastText vectors to the model vocabulary.
    wv = load_fasttext("data/cc.ja.300.vec.gz")
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Build and train the CNN classifier.
    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["acc"])
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=callbacks,
              shuffle=True)

    # Reload the best checkpoint and evaluate on the held-out test set.
    model = load_model(model_path)
    api = InferenceAPI(model, vocab, preprocess_dataset)
    y_pred = api.predict_from_sequence(x_test)
    print("precision: {:.4f}".format(
        precision_score(y_test, y_pred, average="binary")))
    print("recall: {:.4f}".format(
        recall_score(y_test, y_pred, average="binary")))
    print("f1: {:.4f}".format(f1_score(y_test, y_pred, average="binary")))
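
# --- Hedged sketch (assumption, not the author's implementation) ---
# filter_embeddings above is a project-local helper; a minimal version might
# build an embedding matrix that keeps only vectors for words inside the model
# vocabulary, leaving out-of-vocabulary rows as zeros. The function name below
# is hypothetical; the 300-d size matches the fastText vectors used above.
import numpy as np


def filter_embeddings_sketch(wv, word_index, num_words, dim=300):
    """Map pretrained vectors onto the tokenizer's integer indices."""
    embeddings = np.zeros((num_words, dim))
    for word, i in word_index.items():
        if i >= num_words:
            continue  # indices beyond the vocabulary cap are dropped
        if word in wv:
            embeddings[i] = wv[word]  # OOV words keep a zero row
    return embeddings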
# NOTE: imports added for completeness; preprocess_dataset, build_vocabulary,
# filter_embeddings and CNNModel are assumed to be project-local helpers, and
# batch_size, epochs, maxlen, model_path and num_words are assumed to be
# module-level constants; num_label must be 4 here to match the four label
# columns selected below.
import pickle

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences


def train():
    # Load the labelled tweets and drop unusable rows.
    df_tweets = pd.read_csv("data/df_tweets", index_col=0)
    df_tweets["text"] = preprocess_dataset(df_tweets["text"])
    df_tweets = df_tweets.dropna(how='any')
    df_tweets = df_tweets.drop(df_tweets.index[df_tweets["Irrelevant"] == 1])

    x = df_tweets["text"]
    # y = df_tweets[["posi_and_nega", "posi", "nega", "neutral", "Irrelevant"]]
    y = df_tweets[["posi_and_nega", "posi", "nega", "neutral"]]
    y = np.asarray(y)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Build the vocabulary on the training split and persist the tokenizer
    # so inference can reuse exactly the same word-to-index mapping.
    vocab = build_vocabulary(x_train, num_words)
    with open('model/tokenizer.pickle', 'wb') as handle:
        pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

    x_train = vocab.texts_to_sequences(x_train)
    x_test = vocab.texts_to_sequences(x_test)
    x_train = pad_sequences(x_train, maxlen=maxlen, truncating="post")
    x_test = pad_sequences(x_test, maxlen=maxlen, truncating="post")

    # Restrict the pretrained word2vec vectors to the model vocabulary.
    wv = KeyedVectors.load("model/word2vec.model", mmap='r')
    wv = filter_embeddings(wv, vocab.word_index, num_words)

    # Multi-label setup: one sigmoid per label, hence binary cross-entropy.
    model = CNNModel(num_words, num_label, embeddings=wv).build()
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["acc"])
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.5,
              callbacks=callbacks,
              shuffle=True)
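
# --- Hedged usage sketch (assumption, not shown in the original) ---
# The inference side would reload the pickled tokenizer and the checkpoint
# saved by train(). predict_tweets is a hypothetical helper; model_path and
# maxlen are assumed to match the values used during training, and
# preprocess_dataset is the same project-local helper as above.
from tensorflow.keras.models import load_model


def predict_tweets(texts, model_path="cnn_model.h5", maxlen=300):
    """Return per-label sigmoid scores for a list of raw tweet texts."""
    with open('model/tokenizer.pickle', 'rb') as handle:
        vocab = pickle.load(handle)
    model = load_model(model_path)
    texts = preprocess_dataset(texts)
    seqs = vocab.texts_to_sequences(texts)
    seqs = pad_sequences(seqs, maxlen=maxlen, truncating="post")
    return model.predict(seqs)  # shape: (len(texts), num_label)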
# NOTE: imports added for completeness; ImageData, CNNModel, augment_folder
# and the constants pickle_path, png_path, augmented_pickle_path, image_dim,
# classes_count and fold_count are assumed to come from project-local modules.
import glob

import numpy as np
from PIL import Image


class DarkOCR:
    def __init__(self):
        print('DarkOCR initialization...')
        self.data = ImageData()
        # Reading data.
        self.data.read_origin_data(pickle_path)
        self.model = CNNModel(image_dim, image_dim, classes_count)
        self.models_fold = [
            CNNModel(image_dim, image_dim, classes_count)
            for i in range(fold_count)
        ]
        print('Complete')

    def show_origin_data(self, char='p'):
        # Visualization.
        self.data.show_origin_all_chars()
        self.data.show_origin_chars_data()
        self.data.show_origin_chars_data(char)

    def show_origin_data_statistics(self):
        self.data.show_origin_data_histogram()
        self.data.print_origin_labels_count()

    def save_data_set_to_png(self, path):
        self.data.save_data_set_to_png(path)

    def augment_folder(self, path, char_i=None, generated_count=50):
        pixels_mean = None
        if char_i is not None:
            path += '/' + str(char_i)
            pixels_mean_per_class = self.data.calc_pixels_mean()
            pixels_mean = pixels_mean_per_class[char_i]
        augment_folder(path,
                       generated_count=generated_count,
                       pixels_mean=pixels_mean)

    def fit_from_aug_folder(self, path=png_path):
        data_set = self.data.read_augmented_data_and_process(
            in_path=path, classes_count_int=4)
        self.fit(data_set)

    def fit_from_aug_pickle(self, aug_pickle_path=augmented_pickle_path,
                            test_fold=4):
        data_set = self.data.read_pickle(aug_pickle_path)
        self.fit(data_set, test_fold=test_fold)

    def fit(self, data_set, test_fold=4):
        (train_x, train_y), (test_x, test_y) = \
            self.data.from_processed_data_to_training_set(
                data_set=data_set, test_fold=test_fold, ignore_class=30)
        self.model.fit(train_x, train_y, test_x, test_y)

    def load_trained_models_group(self):
        print('Loading models group...')
        for fold in range(fold_count):
            self.models_fold[fold].load_model(fold=fold)
        print('Done')

    def predict(self, input_data):
        prediction = self.model.predict(input_data)
        return np.argmax(prediction, axis=1)

    def predict_from_fold(self, input_data, fold=4):
        prediction = self.models_fold[fold].predict(input_data)
        return np.argmax(prediction, axis=1)

    def predict_from_group(self, input_data):
        # Sum the per-fold class scores and take the consensus argmax.
        prediction_votes = np.zeros((len(input_data), classes_count))
        for fold in range(fold_count):
            prediction_votes += self.models_fold[fold].predict(input_data)
        return np.argmax(prediction_votes, axis=1)

    def predict_image(self, im, decode=False):
        # Convert to grayscale, scale to [0, 1] and add batch/channel axes.
        im = im.convert("L")
        ia = np.array(im, dtype='d')
        ia = ia / 255
        ia = ia.reshape(-1, image_dim, image_dim, 1)
        prediction = self.predict_from_group(ia)
        if decode:
            prediction = ImageData.decode(prediction[0])
        return prediction

    def evaluate_by_image_folder(self, path):
        correct_count = 0
        examples_count = 0
        answers_counter = [0] * classes_count
        correct_counter = [0] * classes_count
        for im_path in glob.glob(path + '/*.png'):
            # The ground-truth label is the character right after the
            # last '#' in the file name.
            hash_i = im_path.rfind("#")
            label = im_path[hash_i + 1]
            im = Image.open(im_path)
            prediction = self.predict_image(im, decode=True)
            if label == prediction:
                correct_count += 1
                correct_counter[ImageData.encode(label)] += 1
            else:
                print("WRONG!: {}, LABEL: {}, PREDICTION: {}".format(
                    im_path, label, prediction))
            answers_counter[ImageData.encode(prediction)] += 1
            examples_count += 1
        accuracy = 100 * correct_count / examples_count
        print('Results: {:.2f}%'.format(accuracy))
        for i in range(len(answers_counter)):
            print('{} ({}). correct: {}, answers count: {}'.format(
                i, ImageData.decode(i),
                correct_counter[i], answers_counter[i]))
        return accuracy
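
# --- Hedged usage sketch (assumption, not part of the original class) ---
# A typical session: load the per-fold checkpoints once, then classify a
# single PNG via the ensemble vote in predict_from_group. The sample file
# path is illustrative only; it follows the "#<label>" naming convention
# used by evaluate_by_image_folder.
if __name__ == "__main__":
    ocr = DarkOCR()
    ocr.load_trained_models_group()
    im = Image.open("data/example#p.png")  # hypothetical sample image
    print(ocr.predict_image(im, decode=True))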