def text_classification(self, num_classes: int = None, multi_label: bool = False, **kwargs) -> ak.TextClassifier:
    """Text classification.

    Args:
        num_classes (int, optional): Number of classes. Defaults to None.
        multi_label (bool, optional): Whether the target is multi-labeled. Defaults to False.

    Returns:
        ak.TextClassifier: AutoKeras text classification class.
    """
    return ak.TextClassifier(
        num_classes=num_classes,
        multi_label=multi_label,
        loss=self.loss,
        metrics=self.metrics,
        project_name=self.project_name,
        max_trials=self.max_trials,
        directory=self.directory,
        objective=self.objective,
        tuner=self.tuner,
        overwrite=self.overwrite,
        seed=self.seed,
        max_model_size=self.max_model_size,
        **kwargs,
    )

def experiment02(self):
    self.load_trainingset(False)
    model = ak.TextClassifier()
    model.fit(self.c_x_train, self.c_y_train)
    y_pred = model.predict(self.c_x_test)

def main():
    # Loads dataset.
    # FIXME [implement] >>
    """
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    x_train = x_train.reshape(x_train.shape + (1,))
    x_test = x_test.reshape(x_test.shape + (1,))
    """

    #--------------------
    # NOTE: this uses the legacy AutoKeras 0.x API (verbose, time_limit,
    # final_fit), which was removed in AutoKeras 1.x.
    clf = ak.TextClassifier(verbose=True)

    print('Fitting...')
    start_time = time.time()
    clf.fit(x_train, y_train, time_limit=12 * 60 * 60)  # time_limit in secs.
    print('\tElapsed time = {}'.format(time.time() - start_time))

    print('Final Fitting...')
    start_time = time.time()
    clf.final_fit(x_train, y_train, x_test, y_test, retrain=True)
    print('\tElapsed time = {}'.format(time.time() - start_time))

    print('Evaluating...')
    start_time = time.time()
    accuracy = clf.evaluate(x_test, y_test)
    print('\tElapsed time = {}'.format(time.time() - start_time))
    print('Accuracy =', accuracy * 100)

    print('Predicting...')
    start_time = time.time()
    predictions = clf.predict(x_test)
    print('\tElapsed time = {}'.format(time.time() - start_time))
    print('Predictions =', predictions)

def train_model(df):
    embeddings_index = get_embeddings_index()
    reviews = df['review']
    labels = df['label']
    num_words = len([word for sentence in reviews for word in sentence.split(' ')])

    # Get the longest sentence length for padding.
    # NOTE: Tokenizer's num_words caps the vocabulary size, not the sentence
    # length, so passing the max sentence length here is likely unintended.
    max_num_words = max([len(sentence.split()) for sentence in reviews])
    tokenizer = Tokenizer(num_words=max_num_words)
    tokenizer.fit_on_texts(reviews)
    x_train = tokenizer.texts_to_sequences(reviews)
    # x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    # y_train = to_categorical(np.asarray(labels))
    y_train = np.asarray(labels)

    word_index = tokenizer.word_index
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2
    id_to_word = {value: key for key, value in word_index.items()}

    # Convert the word indices back to words.
    validation_split = 0.2
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=validation_split, random_state=1)
    x_train = list(map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_train))
    x_test = list(map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_test))
    # np.str was removed in NumPy 1.24; use the builtin str dtype instead.
    x_train = np.array(x_train, dtype=str)
    x_test = np.array(x_test, dtype=str)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    # print('Found %s unique tokens.' % len(word_index))
    # data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    # labels = to_categorical(np.asarray(labels))
    # print('Shape of data tensor:', data.shape)
    # print('Shape of label tensor:', labels.shape)
    #
    # # Split the data into a training set and a validation set.
    # indices = np.arange(data.shape[0])
    # np.random.shuffle(indices)
    # data = data[indices]
    # labels = labels[indices]
    # x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=1)
    # x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=1)

    import autokeras as ak

    text_classifier = ak.TextClassifier(max_trials=10)
    # x = np.asarray(df['review'].values)
    # x = np.asarray(list(map(np.str_, x)))
    # y = np.asarray(df['label'].values)
    text_classifier.fit(x_train, y_train, epochs=5)
    predicted_y = text_classifier.predict(x_test)
    # Evaluate the best model with testing data.
    print(text_classifier.evaluate(x_test, y_test))

def test_text_classifier(tmp_path):
    (train_x, train_y), (test_x, test_y) = utils.imdb_raw()
    clf = ak.TextClassifier(
        directory=tmp_path,
        max_trials=2,
        seed=utils.SEED,
        metrics=['accuracy'],
        objective='accuracy',
    )
    clf.fit(train_x, train_y, epochs=2, validation_data=(test_x, test_y))
    clf.export_model()
    assert clf.predict(test_x).shape == (len(test_x), 1)
    assert clf.tuner._get_best_trial_epochs() == 2

def test_txt_clf_init_hp0_equals_hp_of_a_model(tmp_path):
    clf = ak.TextClassifier(directory=tmp_path)
    clf.inputs[0].shape = (1,)
    clf.outputs[0].in_blocks[0].output_shape = (10,)
    init_hp = task_specific.TEXT_CLASSIFIER[0]
    hp = kerastuner.HyperParameters()
    hp.values = copy.copy(init_hp)
    clf.tuner.hypermodel.build(hp)
    assert set(init_hp.keys()) == set(hp._hps.keys())

def main():
    (x_train, y_train), (x_test, y_test) = imdb_raw()
    clf = ak.TextClassifier(max_trials=10, directory='tmp_dir', overwrite=True)

    start_time = timeit.default_timer()
    clf.fit(x_train, y_train)
    stop_time = timeit.default_timer()

    accuracy = clf.evaluate(x_test, y_test)[1]
    print('Accuracy: {accuracy}%'.format(accuracy=round(accuracy * 100, 2)))
    print('Total time: {time} seconds.'.format(time=round(stop_time - start_time, 2)))

def genre_prediction():
    vectorizer = MultiVectorizer()
    genre_prediction = GenrePredictionModel(vectorizer=vectorizer)
    training_data_df, validation_data_df = genre_prediction.load_data(
        "data/film_data_lots.xlsx", no_sentences=True)

    clf = ak.TextClassifier(max_trials=4, multi_label=True)

    X_train = np.array(training_data_df["Subtitles"].tolist())
    y_train = genre_prediction.training_labels
    X_validation = np.array(validation_data_df["Subtitles"].tolist())
    y_validation = genre_prediction.validation_labels

    clf.fit(X_train, y_train, validation_data=(X_validation, y_validation))

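# With multi_label=True, AutoKeras expects each target row as a multi-hot
# binary vector (one column per class) rather than a single class id. A
# minimal sketch of preparing such labels with scikit-learn's
# MultiLabelBinarizer; the genre names below are invented for illustration.
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

# Each sample can belong to several genres at once.
raw_labels = [["drama", "romance"], ["comedy"], ["drama", "thriller"]]

# Binary indicator matrix of shape (num_samples, num_classes).
mlb = MultiLabelBinarizer()
y_multi_hot = mlb.fit_transform(raw_labels)
print(mlb.classes_)  # ['comedy' 'drama' 'romance' 'thriller']
print(y_multi_hot)   # rows of 0/1 indicators
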
def test_txt_clf_init_hp2_equals_hp_of_a_model(tmp_path):
    clf = ak.TextClassifier(directory=tmp_path)
    clf.inputs[0].shape = (1,)
    clf.inputs[0].batch_size = 6
    clf.inputs[0].num_samples = 1000
    clf.outputs[0].in_blocks[0].shape = (10,)
    clf.tuner.hypermodel.hypermodel.epochs = 1000
    clf.tuner.hypermodel.hypermodel.num_samples = 20000
    init_hp = task_specific.TEXT_CLASSIFIER[2]
    hp = keras_tuner.HyperParameters()
    hp.values = copy.copy(init_hp)
    clf.tuner.hypermodel.build(hp)
    assert set(init_hp.keys()) == set(hp._hps.keys())

def run_auto_keras():
    df = pd.read_csv("./data/02/emotions_full.csv", index_col=0)
    y = np.array(df["sentiment"].astype("str"))
    X = np.array(df["lemma"].astype("str"))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

    # Newer scikit-learn versions require keyword arguments here.
    weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(zip(range(len(weights)), weights))

    # Initialize the text classifier.
    model = ak.TextClassifier(
        overwrite=True,
        max_trials=4,
        metrics="accuracy",
        objective=kt.Objective("accuracy", direction="max"),
        loss="categorical_crossentropy",
    )

    early_stopping = tf.keras.callbacks.EarlyStopping(
        min_delta=0.001,  # minimum amount of change to count as an improvement
        patience=3,       # how many epochs to wait before stopping
        restore_best_weights=True,
    )

    model.fit(X_train, y_train, epochs=100, class_weight=class_weights,
              callbacks=[early_stopping])

    # Export as a Keras Model. export_model() returns the best pipeline found
    # during the search; the TextClassifier itself has no save() method.
    exported_model = model.export_model()
    print(type(exported_model))  # <class 'tensorflow.python.keras.engine.training.Model'>

    try:
        exported_model.save("./models/model_autokeras", save_format="tf")
    except Exception:
        exported_model.save("./models/model_autokeras.h5")

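# The saved model can be reloaded with Keras as long as ak.CUSTOM_OBJECTS is
# passed so AutoKeras's custom preprocessing layers resolve. A minimal sketch,
# assuming the ./models/model_autokeras directory written above; the sample
# sentence is invented for illustration.
import numpy as np
import autokeras as ak
from tensorflow.keras.models import load_model

loaded_model = load_model("./models/model_autokeras", custom_objects=ak.CUSTOM_OBJECTS)

# The reloaded model predicts directly on arrays of raw strings.
print(loaded_model.predict(np.array(["I am so happy today"])))
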
def test_text_classifier(tmp_path):
    train_x = utils.generate_text_data(num_instances=320)
    train_y = np.random.randint(0, 2, 320)
    test_x = train_x
    test_y = train_y
    clf = ak.TextClassifier(
        directory=tmp_path,
        max_trials=2,
        seed=utils.SEED,
        metrics=["accuracy"],
        objective="accuracy",
    )
    clf.fit(train_x, train_y, epochs=2, validation_data=(test_x, test_y), batch_size=6)
    clf.export_model()
    assert clf.predict(test_x).shape == (len(test_x), 1)
    assert clf.tuner._get_best_trial_epochs() <= 2

def main(
    input_filepath: str = typer.Argument(
        ..., help="Filepath to the TSV-formatted train dataset."),
    output_directory: str = typer.Argument(
        "./",
        help=("Directory to save the output generated during the search. The best model will be"
              " saved as 'output_directory/model_autokeras' or 'output_directory/model_autokeras.h5'"),
    ),
    max_trials: int = typer.Option(
        1000,
        help=("The maximum number of different Keras Models to try."
              " The search may finish before reaching the max_trials."),
    ),
):
    df = pd.read_csv(input_filepath, sep="\t", header=None, names=["text", "labels"])
    X = df["text"].values.astype(str)
    y = df["labels"].values

    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    clf = ak.TextClassifier(max_trials=max_trials, directory=output_directory, seed=RANDOM_STATE)
    clf.fit(X, y)

    model = clf.export_model()
    try:
        output_filepath = output_directory / "model_autokeras"
        model.save(output_filepath, save_format="tf")
    except ImportError:
        output_filepath = output_directory / "model_autokeras.h5"
        model.save(output_filepath)
    typer.secho(
        f"Best model saved to {output_filepath.absolute()}.",
        bold=True,
    )

def __init__(self, model_pars=None, data_pars=None, compute_pars=None, out_pars=None):
    ### Model Structure ################################
    if model_pars is None:
        self.model = None
        # __init__ must return None; `return self` here would raise a TypeError.
        return

    if model_pars["model_name"] == "text":
        # Initialize the text classifier. It tries max_trials different models.
        self.model = ak.TextClassifier(max_trials=model_pars['max_trials'])
    elif model_pars["model_name"] == "vision":
        # Initialize the image classifier.
        self.model = ak.ImageClassifier(max_trials=model_pars['max_trials'])
    elif model_pars["model_name"] == "tabular_classifier":
        # Initialize the structured data classifier.
        self.model = ak.StructuredDataClassifier(max_trials=model_pars['max_trials'])

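# Hypothetical usage of the wrapper above; the class name `Model` and the
# model_pars schema are assumptions inferred from the keys the dispatch reads.
import numpy as np

x_train = np.array(["good movie", "bad movie", "great film", "awful film"])
y_train = np.array([1, 0, 1, 0])

wrapper = Model(model_pars={"model_name": "text", "max_trials": 1})
wrapper.model.fit(x_train, y_train, epochs=1)
print(wrapper.model.predict(x_train))
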
    map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_train))
x_test = list(
    map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_test))
# np.str was removed in NumPy 1.24; use the builtin str dtype instead.
x_train = np.array(x_train, dtype=str)
x_test = np.array(x_test, dtype=str)

print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # <START> this film was just brilliant casting <UNK>

"""
The second step is to run the [TextClassifier](/text_classifier).
"""

import autokeras as ak

# Initialize the text classifier.
clf = ak.TextClassifier(max_trials=1)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))

"""
## Validation Data
By default, AutoKeras uses the last 20% of training data as validation data. As
shown in the example below, you can use `validation_split` to specify the
percentage.
"""

clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
)

x_test = np.array(test_data.data)
y_test = np.array(test_data.target)

print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # this film was just brilliant casting

"""
The second step is to run the [TextClassifier](/text_classifier). As a quick
demo, we set epochs to 2. You can also leave the epochs unspecified for an
adaptive number of epochs.
"""

import autokeras as ak

# Initialize the text classifier.
clf = ak.TextClassifier(overwrite=True, max_trials=1)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))

"""
## Validation Data
By default, AutoKeras uses the last 20% of training data as validation data. As
shown in the example below, you can use `validation_split` to specify the
percentage.
"""

clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
)

# Separate labels and features.
X_train = df_train['Sentence']
y_train = df_train['Polarity']

# Load testing data.
print('Reading test set...', end='')
df_test = pd.read_csv("data/sentiment_test.csv")
print('Done.')

# Separate labels and features.
X_test = df_test['Sentence']
y_test = df_test['Polarity']

# Instantiate the classifier object.
classifier = ak.TextClassifier(max_trials=30, seed=42)

# Clean up sentence data using a custom tokenizer; np.str was removed in
# NumPy 1.24, so convert with the builtin str dtype for autokeras.
X_train_clean = np.array(X_train.apply(spacy_tokenizer_string), dtype=str)
X_test_clean = np.array(X_test.apply(spacy_tokenizer_string), dtype=str)

# Convert datatypes for compatibility with autokeras.
y_train_clean = np.array(y_train)
y_test_clean = np.array(y_test)

# Fit the autokeras classifier.
classifier.fit(X_train_clean, y_train_clean, epochs=5)

# Extract the best model from the search.
# Note that due to some bugs in autokeras, the best model needs to be extracted
# by pausing execution using debug mode and recording the model layers and
# hyperparameters.

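# In current AutoKeras releases the debug-mode workaround above should not be
# needed: export_model() returns the best pipeline as a plain Keras model whose
# layers and hyperparameters can be inspected directly. A minimal sketch,
# assuming the fitted `classifier` from above.
best_model = classifier.export_model()

# Inspect the architecture and per-layer hyperparameters.
best_model.summary()
for layer in best_model.layers:
    print(layer.name, layer.get_config())
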
def get_auto_model(self):
    return ak.TextClassifier(max_trials=10, directory=self.tmp_dir, overwrite=True)

    map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_train))
x_test = list(
    map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_test))
# np.str was removed in NumPy 1.24; use the builtin str dtype instead.
x_train = np.array(x_train, dtype=str)
x_test = np.array(x_test, dtype=str)

print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # <START> this film was just brilliant casting <UNK>

"""
The second step is to run the [TextClassifier](/text_classifier).
"""

import autokeras as ak

# Initialize the text classifier.
clf = ak.TextClassifier(overwrite=True, max_trials=1)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))

"""
## Validation Data
By default, AutoKeras uses the last 20% of training data as validation data. As
shown in the example below, you can use `validation_split` to specify the
percentage.
"""

clf.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
)

word_to_id = tf.keras.datasets.imdb.get_word_index()
word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

x_train = list(
    map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_train))
x_test = list(
    map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test))
# np.str was removed in NumPy 1.24; use the builtin str dtype instead.
x_train = np.array(x_train, dtype=str)
x_test = np.array(x_test, dtype=str)
return (x_train, y_train), (x_test, y_test)


# Prepare the data.
(x_train, y_train), (x_test, y_test) = imdb_raw()
print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # <START> this film was just brilliant casting <UNK>

# Initialize the TextClassifier.
clf = ak.TextClassifier(max_trials=3)
# Search for the best model.
clf.fit(x_train, y_train, epochs=2)
# Evaluate on the testing data.
print("Accuracy: {accuracy}".format(accuracy=clf.evaluate(x_test, y_test)))

def test_txt_clf_fit_call_auto_model_fit(fit, tmp_path):
    auto_model = ak.TextClassifier(directory=tmp_path, seed=utils.SEED)
    auto_model.fit(x=np.array(["a b c", "b b c"]), y=np.array([1, 2]))
    assert fit.is_called

def test_imdb_accuracy_over_84(tmp_path):
    (x_train, y_train), (x_test, y_test) = utils.imdb_raw(num_instances=None)
    clf = ak.TextClassifier(max_trials=2, directory=tmp_path)
    clf.fit(x_train, y_train, epochs=2)
    accuracy = clf.evaluate(x_test, y_test)[1]
    assert accuracy >= 0.84

def test_imdb_accuracy_over_92(tmp_path):
    (x_train, y_train), (x_test, y_test) = imdb_raw(num_instances=None)
    clf = ak.TextClassifier(max_trials=3, directory=tmp_path)
    clf.fit(x_train, y_train, batch_size=6, epochs=1)
    accuracy = clf.evaluate(x_test, y_test)[1]
    assert accuracy >= 0.92

def task_api():
    (x_train, y_train), (x_test, y_test) = imdb_raw()
    clf = ak.TextClassifier(max_trials=3, seed=5)
    clf.fit(x_train, y_train, validation_split=0.2)
    return clf.evaluate(x_test, y_test)

def test_text_classifier(tmp_dir):
    (train_x, train_y), (test_x, test_y) = imdb_raw()
    clf = ak.TextClassifier(directory=tmp_dir, max_trials=2)
    clf.fit(train_x, train_y, epochs=2, validation_split=0.2)
    assert clf.predict(test_x).shape == (len(test_x), 1)

x_test = list(
    map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test)
)
# np.str was removed in NumPy 1.24; use the builtin str dtype instead.
x_train = np.array(x_train, dtype=str)
x_test = np.array(x_test, dtype=str)
return (x_train, y_train), (x_test, y_test)


# Prepare the data.
(x_train, y_train), (x_test, y_test) = reuters_raw()
print(x_train.shape)  # (8982,)
print(y_train.shape)  # (8982, 1)
print(x_train[0][:50])  # <START> <UNK> <UNK> said as a result of its decemb

# Initialize the TextClassifier.
clf = ak.TextClassifier(
    max_trials=5,
    overwrite=True,
)
# Use EarlyStopping to avoid overfitting.
cbs = [
    tf.keras.callbacks.EarlyStopping(patience=3),
]
# Search for the best model. Note the keyword is `callbacks`, not `callback`.
clf.fit(x_train, y_train, epochs=10, callbacks=cbs)
# Evaluate on the testing data.
print("Accuracy: {accuracy}".format(accuracy=clf.evaluate(x_test, y_test)))

def test_text_classifier(tmp_path):
    (train_x, train_y), (test_x, test_y) = utils.imdb_raw()
    clf = ak.TextClassifier(directory=tmp_path, max_trials=2, seed=utils.SEED)
    clf.fit(train_x, train_y, epochs=1, validation_data=(test_x, test_y))
    clf.export_model()
    assert clf.predict(test_x).shape == (len(test_x), 1)

def test_text_classifier(tmp_dir):
    (train_x, train_y), (test_x, test_y) = common.imdb_raw()
    clf = ak.TextClassifier(directory=tmp_dir, max_trials=2, seed=common.SEED)
    clf.fit(train_x, train_y, epochs=1, validation_data=(test_x, test_y))
    assert clf.predict(test_x).shape == (len(test_x), 1)

#     batch_size=batch_size)
# for x, y in train_data:
#     for i, a in enumerate(x.numpy()):
#         for j, b in enumerate(record_x):
#             if a == b:
#                 print('*')
#                 assert record_y[j] == y.numpy()[i]

# import numpy as np
# x_train = []
# y_train = []
# for x, y in train_data:
#     for a in x.numpy():
#         x_train.append(a)
#     for a in y.numpy():
#         y_train.append(a)
# x_train = np.array(x_train)
# y_train = np.array(y_train)

# train_data = train_data.shuffle(1000, seed=123, reshuffle_each_iteration=False)

clf = ak.TextClassifier(overwrite=True, max_trials=2)
# clf.fit(train_data, validation_data=test_data)
# clf.fit(train_data, validation_data=train_data)
clf.fit(train_data, validation_data=val_data)
# clf.fit(x_train, y_train)
# clf.fit(train_data)
print(clf.evaluate(test_data))

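# The train_data / val_data / test_data objects above are not defined in this
# snippet. A minimal, self-contained sketch of building equivalent
# tf.data.Dataset pipelines; the toy strings and labels are invented for
# illustration.
import numpy as np
import tensorflow as tf
import autokeras as ak

texts = np.array(["good movie", "bad movie", "great film", "awful film"] * 8)
labels = np.array([1, 0, 1, 0] * 8)

# TextClassifier accepts datasets of (string_tensor, label) batches.
train_data = tf.data.Dataset.from_tensor_slices((texts, labels)).batch(4)
val_data = tf.data.Dataset.from_tensor_slices((texts, labels)).batch(4)
test_data = tf.data.Dataset.from_tensor_slices((texts, labels)).batch(4)

clf = ak.TextClassifier(overwrite=True, max_trials=1)
clf.fit(train_data, validation_data=val_data, epochs=1)
print(clf.evaluate(test_data))
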