def run(df):
    glove_dir = basepath + "glove.6B"
    model_name = "books_sup"
    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100

    X = df["text"]
    y = df["label"]
    y_true = df["label"]

    labels, label_to_index, index_to_label = get_distinct_labels(df)
    y_one_hot = make_one_hot(y, label_to_index)
    # y = np.array(y)

    print("Fitting tokenizer...")
    tokenizer = fit_get_tokenizer(X, max_words)
    # print("Getting tokenizer")
    # tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, X_test, y_test = create_train_dev_test(
        X, labels=y_one_hot, tokenizer=tokenizer, max_sentences=max_sentences,
        max_sentence_length=max_sentence_length, max_words=max_words)

    print("Creating Embedding matrix...")
    embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    # print("Getting Embedding matrix...")
    # embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))

    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences,
                output_size=len(y_train[0]), embedding_matrix=embedding_matrix)

    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])

    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100,
              batch_size=256, callbacks=[es])

    print("****************** CLASSIFICATION REPORT ********************")
    pred = model.predict(X_test)
    true_labels = get_from_one_hot(y_test, index_to_label)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(true_labels, pred_labels))
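The snippets in this collection lean on small helpers such as make_one_hot and get_from_one_hot that are defined elsewhere in their repositories. A minimal sketch of what they presumably do, inferred only from how they are called here; the _sketch suffix marks these as assumptions, not the original implementations:

import numpy as np

def make_one_hot_sketch(y, label_to_index):
    # Map string labels to one-hot rows, one column per label.
    one_hot = np.zeros((len(y), len(label_to_index)), dtype="float32")
    for i, label in enumerate(y):
        one_hot[i, label_to_index[label]] = 1.0
    return one_hot

def get_from_one_hot_sketch(pred, index_to_label):
    # Map one-hot (or probability) rows back to label strings via argmax.
    return [index_to_label[idx] for idx in np.argmax(pred, axis=-1)]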
def train_classifier(df, labels, label_term_dict, label_adult_dict, label_actor_dict, label_actress_dict,
                     label_producer_dict, label_writer_dict, label_director_dict, label_composer_dict,
                     label_cinematographer_dict, label_editor_dict, label_prod_designer_dict, label_dir_adult_dict,
                     label_dir_actor_dict, label_dir_actress_dict, label_dir_producer_dict, label_dir_writer_dict,
                     label_dir_composer_dict, label_dir_cinematographer_dict, label_dir_editor_dict,
                     label_dir_prod_designer_dict, label_actor_actress_dict, label_to_index, index_to_label,
                     model_name, soft=False):
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "imdb/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100

    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    X, y, y_true = get_train_data(df, labels, label_term_dict, label_adult_dict, label_actor_dict,
                                  label_actress_dict, label_producer_dict, label_writer_dict, label_director_dict,
                                  label_composer_dict, label_cinematographer_dict, label_editor_dict,
                                  label_prod_designer_dict, label_dir_adult_dict, label_dir_actor_dict,
                                  label_dir_actress_dict, label_dir_producer_dict, label_dir_writer_dict,
                                  label_dir_composer_dict, label_dir_cinematographer_dict, label_dir_editor_dict,
                                  label_dir_prod_designer_dict, label_actor_actress_dict, tokenizer, label_to_index,
                                  soft=soft)

    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    if not soft:
        y_vec = make_one_hot(y, label_to_index)
        print(classification_report(y_true, y))
    else:
        y_vec = np.array(y)
        y_argmax = np.argmax(y, axis=-1)
        y_str = []
        for i in y_argmax:
            y_str.append(index_to_label[i])
        print(classification_report(y_true, y_str))

    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, _, _ = create_train_dev(X, labels=y_vec, tokenizer=tokenizer,
                                                            max_sentences=max_sentences,
                                                            max_sentence_length=max_sentence_length,
                                                            max_words=max_words, val=False)

    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))

    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)

    print("Compiling model...")
    model.summary()
    if not soft:
        model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
    else:
        model.compile(loss=kullback_leibler_divergence, optimizer='adam', metrics=['acc'])

    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])

    # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
    # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
    #                         tokenizer=tokenizer)
    # pred = model.predict(X_label_all)
    # pred_labels = get_from_one_hot(pred, index_to_label)
    # print(classification_report(y_true, pred_labels))

    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))

    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred
def train_classifier(df, labels, label_term_dict, label_author_dict, label_pub_dict, label_year_dict,
                     label_author_pub_dict, label_pub_year_dict, label_author_year_dict, label_to_index,
                     index_to_label, model_name, clf, use_gpu, old=True, soft=False):
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "books/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100

    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    if old:
        X, y, y_true = get_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict,
                                      label_year_dict, label_author_pub_dict, label_pub_year_dict,
                                      label_author_year_dict, tokenizer, label_to_index, soft=soft, clf=clf)
        if clf == "BERT":
            df_orig = pickle.load(open(basepath + dataset + "df.pkl", "rb"))
            X = list(df_orig.iloc[X]["text"])
    else:
        X, y, y_true = get_confident_train_data(df, labels, label_term_dict, label_author_dict, label_pub_dict,
                                                label_year_dict, label_author_pub_dict, label_pub_year_dict,
                                                label_author_year_dict, tokenizer)

    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    if not soft:
        y_vec = make_one_hot(y, label_to_index)
        print(classification_report(y_true, y))
    else:
        y_vec = np.array(y)
        y_argmax = np.argmax(y, axis=-1)
        y_str = []
        for i in y_argmax:
            y_str.append(index_to_label[i])
        print(classification_report(y_true, y_str))

    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)

    if clf == "HAN":
        print("Splitting into train, dev...")
        X_train, y_train, X_val, y_val, _, _ = create_train_dev(X, labels=y_vec, tokenizer=tokenizer,
                                                                max_sentences=max_sentences,
                                                                max_sentence_length=max_sentence_length,
                                                                max_words=max_words, val=False)
        print("Getting Embedding matrix...")
        embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))

        print("Initializing model...")
        model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                    embedding_matrix=embedding_matrix)

        print("Compiling model...")
        model.summary()
        if not soft:
            model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])
        else:
            model.compile(loss=kullback_leibler_divergence, optimizer='adam', metrics=['acc'])

        print("model fitting - Hierarchical attention network...")
        es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
        mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc',
                             mode='max', verbose=1, save_weights_only=True, save_best_only=True)
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])

        # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
        # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
        #                         tokenizer=tokenizer)
        # pred = model.predict(X_label_all)
        # pred_labels = get_from_one_hot(pred, index_to_label)
        # print(classification_report(y_true, pred_labels))

        print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
        X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                          tokenizer=tokenizer)
        y_true_all = df["label"]
        pred = model.predict(X_all)
        pred_labels = get_from_one_hot(pred, index_to_label)

        print("Dumping the model...")
        model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
        model.save(dump_dir + "model_" + model_name + ".h5")
    elif clf == "BERT":
        y_vec = []
        for lbl_ in y:
            y_vec.append(label_to_index[lbl_])
        model = train_bert(X, y_vec, use_gpu)
        y_true_all = []
        for lbl_ in df.label:
            y_true_all.append(label_to_index[lbl_])
        predictions = test(model, df_orig["text"], y_true_all, use_gpu)
        for i, p in enumerate(predictions):
            if i == 0:
                pred = p
            else:
                pred = np.concatenate((pred, p))
        pred_labels = []
        for p in pred:
            pred_labels.append(index_to_label[p.argmax(axis=-1)])
        y_true_all = df["label"]
    elif clf == "CNN":
        y_vec = []
        for lbl_ in y:
            y_vec.append(label_to_index[lbl_])
        y_true_all = []
        for lbl_ in df.label:
            y_true_all.append(label_to_index[lbl_])
        pred_idxs, pred, true_idxs = train_cnn(X, y_vec, df["text"], y_true_all, use_gpu)
        pred_labels = []
        for p in pred_idxs:
            pred_labels.append(index_to_label[p])
        y_true_all = []
        for p in true_idxs:
            y_true_all.append(index_to_label[p])
    else:
        raise ValueError("clf can only be HAN or BERT or CNN")

    print(classification_report(y_true_all, pred_labels))
    return pred_labels, pred
        labels=y_one_hot, tokenizer=tokenizer, max_sentences=max_sentences,
        max_sentence_length=max_sentence_length, max_words=max_words)

    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(
        open(basepath + dataset + "embedding_matrix_topk_dict.pkl", "rb"))

    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences,
                output_size=len(y_train[0]), embedding_matrix=embedding_matrix)

    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])

    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5',
                         monitor='val_acc', mode='max', verbose=1,
def train_classifier(df, labels, label_term_dict, label_to_index, index_to_label, dataset_path):
    print("Going to train classifier..")
    basepath = dataset_path
    model_name = "conwea"
    dump_dir = basepath + "models/" + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000

    tokenizer = pickle.load(open(dataset_path + "tokenizer.pkl", "rb"))
    X, y, y_true = generate_pseudo_labels(df, labels, label_term_dict, tokenizer)
    y_one_hot = make_one_hot(y, label_to_index)

    print("Fitting tokenizer...")
    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val = create_train_dev(
        X, labels=y_one_hot, tokenizer=tokenizer, max_sentences=max_sentences,
        max_sentence_length=max_sentence_length, max_words=max_words)

    print("Creating Embedding matrix...")
    embedding_matrix = pickle.load(
        open(dataset_path + "embedding_matrix.pkl", "rb"))

    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences,
                output_size=len(y_train[0]), embedding_matrix=embedding_matrix)

    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])

    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5',
                         monitor='val_acc', mode='max', verbose=1,
                         save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100,
              batch_size=256, callbacks=[es, mc])

    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["sentence"], max_sentences=max_sentences,
                      max_sentence_length=max_sentence_length, tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))

    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels
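Every snippet above feeds documents through a prep_data helper before prediction. A minimal sketch of what it presumably does, assuming NLTK sentence splitting and a fitted Keras Tokenizer; names with the _sketch suffix and the use of nltk are assumptions, not the original implementation:

import numpy as np
from nltk import tokenize

def prep_data_sketch(texts, max_sentences, max_sentence_length, tokenizer):
    # Tokenize each document into sentences, convert words to indices, and pad
    # to a fixed (max_sentences, max_sentence_length) tensor for the HAN input.
    data = np.zeros((len(texts), max_sentences, max_sentence_length), dtype="int32")
    for i, text in enumerate(texts):
        sentences = tokenize.sent_tokenize(text)[:max_sentences]
        seqs = tokenizer.texts_to_sequences(sentences)
        for j, seq in enumerate(seqs):
            for k, tok in enumerate(seq[:max_sentence_length]):
                data[i, j, k] = tok
    return data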
def train_weight_classifier(df, labels, label_term_dict, label_author_dict, label_conf_dict, label_to_index,
                            index_to_label, model_name, AND=True):
    basepath = "/data4/dheeraj/metaguide/"
    dataset = "dblp/"
    # glove_dir = basepath + "glove.6B"
    dump_dir = basepath + "models/" + dataset + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + dataset + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000
    embedding_dim = 100

    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))
    X, y, y_true, weights = get_weighted_train_data(df, labels, label_term_dict, label_author_dict, label_conf_dict,
                                                    tokenizer, label_to_index, AND=AND)

    print("****************** CLASSIFICATION REPORT FOR TRAINING DATA ********************")
    # df_train = create_training_df(X, y, y_true)
    # df_train.to_csv(basepath + dataset + "training_label.csv")
    y_vec = make_one_hot(y, label_to_index)
    print(classification_report(y_true, y))
    # y = np.array(y)

    # print("Fitting tokenizer...")
    # tokenizer = fit_get_tokenizer(X, max_words)
    print("Getting tokenizer")
    tokenizer = pickle.load(open(basepath + dataset + "tokenizer.pkl", "rb"))

    print("Splitting into train, dev...")
    X_train, y_train, X_val, y_val, weights_train, _ = create_train_dev_weights(X, labels=y_vec, weights=weights,
                                                                                tokenizer=tokenizer,
                                                                                max_sentences=max_sentences,
                                                                                max_sentence_length=max_sentence_length,
                                                                                max_words=max_words)

    # print("Creating Embedding matrix...")
    # embedding_matrix = create_embedding_matrix(glove_dir, tokenizer, embedding_dim)
    print("Getting Embedding matrix...")
    embedding_matrix = pickle.load(open(basepath + dataset + "embedding_matrix.pkl", "rb"))

    print("Initializing model...")
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)

    print("Compiling model...")
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])

    print("model fitting - Hierarchical attention network...")
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc],
              sample_weight=np.array(weights_train))

    # print("****************** CLASSIFICATION REPORT FOR DOCUMENTS WITH LABEL WORDS ********************")
    # X_label_all = prep_data(texts=X, max_sentences=max_sentences, max_sentence_length=max_sentence_length,
    #                         tokenizer=tokenizer)
    # pred = model.predict(X_label_all)
    # pred_labels = get_from_one_hot(pred, index_to_label)
    # print(classification_report(y_true, pred_labels))

    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************")
    X_all = prep_data(texts=df["abstract"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels))

    print("Dumping the model...")
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred
# Loop through all the words in the word_index and, where possible,
# replace the random initialization with the GloVe vector.
for word, index in tqdm(word_tokenizer.word_index.items()):
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

#####################################################
#                  Model Training                   #
#####################################################
logger.info("Training the model.")
han_model = HAN(
    MAX_WORDS_PER_SENT,
    MAX_SENT,
    embedding_matrix,
    word_encoding_dim=100,
    sentence_encoding_dim=100,
)

loss = tf.keras.losses.BinaryCrossentropy(name="loss")
# loss = WeightedBinaryCrossEntropy(pos_weight=442475 / 89972, name="loss")
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

han_model.compile(
    optimizer=opt,
    loss=loss,
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name="acc"),
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.AUC(name="pr_auc", curve="PR"),
    ],
# Loop through all the words in the word_index and, where possible,
# replace the random initialization with the GloVe vector.
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

#####################################################
#                  Model Training                   #
#####################################################
logger.info("Training the model.")
han_model = HAN(MAX_WORDS_PER_SENT, MAX_SENT, 2, embedding_matrix,
                word_encoding_dim=100, sentence_encoding_dim=100)
han_model.summary()
han_model.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['acc'])

checkpoint_saver = ModelCheckpoint(
    filepath='./tmp/model.{epoch:02d}-{val_loss:.2f}.hdf5',
    verbose=1, save_best_only=True)

han_model.fit(X_train,
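The two fragments above assume an `embeddings` dictionary and a pre-initialized `embedding_matrix`. A minimal sketch of how these are typically prepared from a GloVe text file; the file path and the 100-dimensional embedding size are assumptions, not taken from the original code:

import numpy as np

# Assumed GloVe file path and embedding size; adjust to the actual setup.
embeddings = {}
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        embeddings[values[0]] = np.asarray(values[1:], dtype="float32")

# Random initialization; rows for words found in GloVe are overwritten
# by the lookup loop shown above.
vocab_size = len(word_tokenizer.word_index) + 1
embedding_matrix = np.random.normal(0, 0.1, size=(vocab_size, 100))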
def train_classifier(df, labels, label_term_dict, label_to_index, index_to_label, dataset_path): print("Going to train classifier..") basepath = dataset_path model_name = "conwea" dump_dir = basepath + "models/" + model_name + "/" tmp_dir = basepath + "checkpoints/" + model_name + "/" os.makedirs(dump_dir, exist_ok=True) os.makedirs(tmp_dir, exist_ok=True) max_sentence_length = 100 #TODO what is max sentences??? max_sentences = 15 max_words = 20000 tokenizer = pickle.load(open(dataset_path + "tokenizer.pkl", "rb")) X, y, y_true = generate_pseudo_labels(df, labels, label_term_dict, tokenizer) #y_one_hot = make_one_hot(y, label_to_index) y_one_hot = np.array(y) #code too see distribution of labels twodmatrix = np.stack(y, axis=0) labelcounts = np.sum(twodmatrix, axis=0) plt.bar(range(0, 13), labelcounts) plt.title('PSEUDOLABEL DISTRIBUTION') plt.show() print("Fitting tokenizer...") print("Splitting into train, dev...") X_train, y_train, X_val, y_val = create_train_dev( X, labels=y_one_hot, tokenizer=tokenizer, max_sentences=max_sentences, max_sentence_length=max_sentence_length, max_words=max_words) print("Creating Embedding matrix...") embedding_matrix = pickle.load( open(dataset_path + "embedding_matrix.pkl", "rb")) print("Initializing model...") model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]), embedding_matrix=embedding_matrix) print("Compiling model...") model.summary() model.compile(loss="binary_crossentropy", optimizer='adam', metrics=[TopKCategoricalAccuracy(k=3)]) print("model fitting - Hierachical attention network...") es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3) mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor=TopKCategoricalAccuracy(k=3), mode='max', verbose=1, save_weights_only=True, save_best_only=True) model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=1, batch_size=256, callbacks=[es, mc]) print( "****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************" ) X_all = prep_data(texts=df["sentence"], max_sentences=max_sentences, max_sentence_length=max_sentence_length, tokenizer=tokenizer) y_true_all = df["label"] #pred now is an array as long as the classes pred = model.predict(X_all) #i need to convert this to binary 0,1 array #code to see prediction distribution twodmatrix = np.stack(y, axis=0) labelcounts = np.sum(twodmatrix, axis=0) plt.bar(range(0, 13), labelcounts) plt.title('NN PREDICTION DISTRIBUTION') plt.show() # one-hot-encoding of predictions based on >0,5> thresh for recall and accuracy lsprecrec = (pred > 0.5).astype(int) #array of strings of predicted labels( with hard threshold for seeding words) #pred usualy. 
trying lsprecrec for lower threshold pred_labels = get_from_one_hot(lsprecrec, index_to_label) y_true_allnp = np.array(y_true_all) #this is to fix the error of different dimensions y_true_allnp = np.array([np.array(x) for x in y_true_allnp]) from sklearn.metrics import confusion_matrix for i, l in enumerate(label_to_index.keys()): if sum(y_true_allnp.T[i]) == 0: print('no {l} in dataset') if sum(lsprecrec.T[i]) == 0: print("no {} ever predicted".format(l)) tn, fp, fn, tp = confusion_matrix(y_true_allnp.T[i], lsprecrec.T[i]).ravel() precision = tp / (tp + fp) recall = tp / (tp + fn) print('{} : precision {}, recall: {}'.format(l, precision, recall)) topk1_accuracypseudo = TopKCategoricalAccuracy( k=1, name="top_k1_categorical_accuracy", dtype=None) topk2_accuracypseudo = TopKCategoricalAccuracy( k=2, name="top_k2_categorical_accuracy", dtype=None) topk3_accuracypseudo = TopKCategoricalAccuracy( k=3, name="top_k3_categorical_accuracy", dtype=None) topk1_accuracypseudo.update_state(y_true=y_true, y_pred=y_one_hot) topk2_accuracypseudo.update_state(y_true=y_true, y_pred=y_one_hot) topk3_accuracypseudo.update_state(y_true=y_true, y_pred=y_one_hot) print("ACCURACY PSEUDO LABELS") print("K1: ", topk1_accuracypseudo.result().numpy()) print("K2: ", topk2_accuracypseudo.result().numpy()) print("K3: ", topk3_accuracypseudo.result().numpy()) #keras top-k accuracy on nn prediction topk1_accuracy = TopKCategoricalAccuracy( k=1, name="top_k1_categorical_accuracy", dtype=None) topk2_accuracy = TopKCategoricalAccuracy( k=2, name="top_k2_categorical_accuracy", dtype=None) topk3_accuracy = TopKCategoricalAccuracy( k=3, name="top_k3_categorical_accuracy", dtype=None) topk1_accuracy.update_state(y_true=y_true_allnp.astype(np.float64), y_pred=pred) topk2_accuracy.update_state(y_true=y_true_allnp.astype(np.float64), y_pred=pred) topk3_accuracy.update_state(y_true=y_true_allnp.astype(np.float64), y_pred=pred) print("ACCURACY NN PREDICTION") print("K1: ", topk1_accuracy.result().numpy()) print("K2: ", topk2_accuracy.result().numpy()) print("K3: ", topk3_accuracy.result().numpy()) #print(classification_report(y_true_all, pred_labels)) print("Dumping the model...") # model.save_weights(dump_dir + "model_weights_" + model_name + ".h5") # model.save(dump_dir + "model_" + model_name + ".h5") return pred_labels
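The per-class precision/recall loop above computes one confusion matrix per label by hand. A hedged alternative, shown only as a sketch: sklearn's precision_recall_fscore_support can return the same per-class numbers for multilabel indicator arrays in a single call and avoids division-by-zero surprises.

from sklearn.metrics import precision_recall_fscore_support

# y_true_allnp and lsprecrec are the binary indicator arrays built above.
precision, recall, f1, support = precision_recall_fscore_support(
    y_true_allnp, lsprecrec, average=None, zero_division=0)
for i, l in enumerate(label_to_index.keys()):
    print("{} : precision {:.3f}, recall: {:.3f}".format(l, precision[i], recall[i]))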
def train_classifier(df, tokenizer, embedding_matrix, labels, motpat_label_motifs_dict, label_to_index,
                     index_to_label, index_word, dataset_path, config):
    def generate_pseudo_labels(df, labels, motpat_label_motifs_dict, tokenizer, index_word, config):
        y = []
        X = []
        for index, row in df.iterrows():
            count_dict = {}
            flag = 0
            for mot_pat in motpat_label_motifs_dict:
                label_motifs_dict = motpat_label_motifs_dict[mot_pat]
                if len(label_motifs_dict) == 0:
                    continue
                if mot_pat == "phrase":
                    tokens = tokenizer.texts_to_sequences([row["text"]])[0]
                    words = []
                    for tok in tokens:
                        words.append(index_word[tok])
                    for l in labels:
                        if len(label_motifs_dict[l]) == 0:
                            continue
                        seed_words = set(label_motifs_dict[l].keys())
                        int_words = list(set(words).intersection(seed_words))
                        for word in int_words:
                            flag = 1
                            try:
                                count_dict[l] += label_motifs_dict[l][word]
                            except KeyError:
                                count_dict[l] = label_motifs_dict[l][word]
                else:
                    size = len(mot_pat)
                    if size == 1:
                        first = mot_pat[0]
                        entities = get_entity_from_col(row[first], first, config)
                    elif size == 2:
                        first = mot_pat[0]
                        second = mot_pat[1]
                        first_ents = get_entity_from_col(row[first], first, config)
                        second_ents = get_entity_from_col(row[second], second, config)
                        if first == second:
                            entities = set(itertools.combinations(first_ents, 2))
                        else:
                            entities = set(itertools.product(first_ents, second_ents))
                    else:
                        raise Exception(
                            "Motif patterns of size more than 2 not yet handled but can be easily extended.")
                    for l in labels:
                        if len(label_motifs_dict[l]) == 0:
                            continue
                        seed_entities = set(label_motifs_dict[l].keys())
                        int_ents = list(entities.intersection(seed_entities))
                        for ent in int_ents:
                            flag = 1
                            try:
                                count_dict[l] += label_motifs_dict[l][ent]
                            except KeyError:
                                count_dict[l] = label_motifs_dict[l][ent]
            if flag:
                lbl = max(count_dict, key=count_dict.get)
                if not lbl:
                    continue
                y.append(lbl)
                X.append(row["text"])
        return X, y

    basepath = dataset_path
    model_name = "meta"
    dump_dir = basepath + "models/" + model_name + "/"
    tmp_dir = basepath + "checkpoints/" + model_name + "/"
    os.makedirs(dump_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    max_sentence_length = 100
    max_sentences = 15
    max_words = 20000

    print("Generating pseudo-labels", flush=True)
    X, y = generate_pseudo_labels(df, labels, motpat_label_motifs_dict, tokenizer, index_word, config)
    y_vec = make_one_hot(y, label_to_index)

    print("Splitting into train, dev...", flush=True)
    X_train, y_train, X_val, y_val = create_train_dev(
        X, labels=y_vec, tokenizer=tokenizer, max_sentences=max_sentences,
        max_sentence_length=max_sentence_length, max_words=max_words)

    print("Initializing model...", flush=True)
    model = HAN(max_words=max_sentence_length, max_sentences=max_sentences, output_size=len(y_train[0]),
                embedding_matrix=embedding_matrix)

    print("Compiling model...", flush=True)
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['acc'])

    print("model fitting - Hierarchical attention network...", flush=True)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    mc = ModelCheckpoint(filepath=tmp_dir + 'model.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_acc', mode='max',
                         verbose=1, save_weights_only=True, save_best_only=True)
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=256, callbacks=[es, mc])

    print("****************** CLASSIFICATION REPORT FOR All DOCUMENTS ********************", flush=True)
    X_all = prep_data(texts=df["text"], max_sentences=max_sentences, max_sentence_length=max_sentence_length,
                      tokenizer=tokenizer)
    y_true_all = df["label"]
    pred = model.predict(X_all)
    pred_labels = get_from_one_hot(pred, index_to_label)
    print(classification_report(y_true_all, pred_labels), flush=True)

    print("Dumping the model...", flush=True)
    model.save_weights(dump_dir + "model_weights_" + model_name + ".h5")
    model.save(dump_dir + "model_" + model_name + ".h5")
    return pred_labels, pred