# Compare pairwise label correlations between the training targets and the
# (binarized) test predictions; write a human-readable report and a dict.
# Assumes Y_train was loaded earlier in this script (outside this chunk).
Y_test = np.load("Objects/Y_test.npy")
# Threshold soft predictions to hard 0/1 labels.
Y_test[Y_test >= 0.5] = 1
Y_test[Y_test < 0.5] = 0
# Standardize each column so that Y.T @ Y / n is the Pearson correlation
# matrix of the labels.
# NOTE(review): a constant column (std == 0) would yield NaNs here -- e.g. a
# label never predicted positive in the test set; confirm inputs vary.
Y_train = (Y_train - np.mean(Y_train, axis=0, keepdims=True)) / np.std(
    Y_train, axis=0, keepdims=True)
Y_test = (Y_test - np.mean(Y_test, axis=0, keepdims=True)) / np.std(
    Y_test, axis=0, keepdims=True)
R_train = Y_train.T.dot(Y_train) / Y_train.shape[0]
R_test = Y_test.T.dot(Y_test) / Y_test.shape[0]
labels = pickle.load("Objects/labels.list")
D = {}
# Context manager guarantees the report file is closed even on error, and
# avoids shadowing the builtin name `file`.
with open("Out/corrs.txt", "w") as f:
    f.write(
        "Correlations in training set and in test predictions respectively:\n\n")
    # Upper triangle only: each unordered label pair is reported once.
    for i in range(R_train.shape[0]):
        for j in range(i + 1, R_train.shape[1]):
            D[labels[i] + ", " + labels[j]] = [R_train[i, j], R_test[i, j]]
            f.write(labels[i] + ", " + labels[j] + ": " +
                    str(np.round(R_train[i, j], 5)) + ", " +
                    str(np.round(R_test[i, j], 5)) + "\n")
pickle.save("Objects/corrs.dict", D)
# --- tail of text_to_vector (its `def` line precedes this chunk) ---
# Embed one token with fastText into row i of the per-comment window matrix.
        x[i, :] = ft_model.get_word_vector(word).astype('float32')
    return x


def df_to_data(df):
    """Convert a dataframe to a 3-D float32 array of NN inputs.

    Each comment in df['comment_text'] is embedded by text_to_vector into a
    (window_length, n_features) matrix; the matrices are stacked along axis 0,
    giving an array of shape (len(df), window_length, n_features).
    """
    x = np.zeros((len(df), window_length, n_features), dtype='float32')
    for i, comment in enumerate(df['comment_text'].values):
        x[i, :] = text_to_vector(comment)
    return x


# Embed both splits and persist everything needed by the downstream scripts.
print("Building embedded data . . . ")
X_train = df_to_data(train)
X_test = df_to_data(test)
labels = [
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
]
# Target matrix: one column per label, in the order listed above.
Y_train = train[labels].values
print("Saving embedded data, target values, and labels . . . ")
np.save("Objects/X_train.npy", X_train)
np.save("Objects/X_test.npy", X_test)
np.save("Objects/Y_train.npy", Y_train)
# NOTE(review): `pickle` appears to be a project Utils wrapper exposing
# save(path, obj) -- the stdlib module has no `save`; confirm against Utils.
pickle.save("Objects/labels.list", labels)
print("Done!")
# Train a one-vs-rest linear SVM on all label columns at once and record
# per-label accuracy / ROC-AUC on the training and validation splits, then
# build the submission file from predictions on the held-out test matrix.
num_tasks = Y_train.shape[1]

time_1 = time.time()
classifier = OneVsRestClassifier(LinearSVC())
classifier.fit(X_train, Y_train)
time_2 = time.time()

results = {}
results["train_time"] = time_2 - time_1

# Training-set metrics, one entry per label column.
pred = classifier.predict(X_train)
results["acc"] = np.array(
    [accuracy_score(Y_train[:, i], pred[:, i]) for i in range(num_tasks)])
results["acc_mean"] = np.mean(results["acc"])
# NOTE(review): ROC-AUC is computed on hard 0/1 predictions rather than on
# decision scores, which understates AUC; consider decision_function output.
results["roc"] = np.array(
    [roc_auc_score(Y_train[:, i], pred[:, i]) for i in range(num_tasks)])
results["roc_mean"] = np.mean(results["roc"])

# Validation-set metrics.
pred = classifier.predict(X_test)
results["val_acc"] = np.array(
    [accuracy_score(Y_test[:, i], pred[:, i]) for i in range(num_tasks)])
results["val_acc_mean"] = np.mean(results["val_acc"])
results["val_roc"] = np.array(
    [roc_auc_score(Y_test[:, i], pred[:, i]) for i in range(num_tasks)])
results["val_roc_mean"] = np.mean(results["val_roc"])

# Rebind X_test/Y_test to the Kaggle test set and write the submission CSV.
X_test = sp.sparse.load_npz("Objects/X_test.npz")
Y_test = classifier.predict(X_test)
submission = pd.read_csv("../Data/sample_submission.csv")
submission.iloc[:, 1:] = Y_test
submission.to_csv("Objects/submission.csv", index=False)

# time_0 is set at the top of the script (outside this chunk).
results["script_time"] = time.time() - time_0
pickle.save("Objects/results.dict", results)
import numpy as np

from Utils import pickle

# Report class imbalance: fraction of positive examples per label in the
# training targets vs. the (binarized) test predictions.
Y_train = np.load("Objects/Y_train.npy")
Y_test = np.load("Objects/Y_test.npy")
# Threshold soft predictions to hard 0/1 labels.
Y_test[Y_test >= 0.5] = 1
Y_test[Y_test < 0.5] = 0

# Column means of 0/1 matrices == fraction of positives per label.
p_train = np.mean(Y_train, axis=0)
p_test = np.mean(Y_test, axis=0)

labels = pickle.load("Objects/labels.list")

D = {}
# Context manager guarantees the report file is closed even on error, and
# avoids shadowing the builtin name `file`.
with open("Out/imbalance.txt", "w") as f:
    f.write("Percentage of comments toxic in training set and in test predictions respectively:\n\n")
    for i in range(p_train.shape[0]):
        D[labels[i]] = [p_train[i], p_test[i]]
        f.write(labels[i] + ": " + str(np.round(p_train[i], 5)) + ", " +
                str(np.round(p_test[i], 5)) + "\n")
pickle.save("Objects/imbalance.dict", D)
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse  # make sp.sparse available as an attribute of sp
from sklearn.feature_extraction.text import TfidfVectorizer

from Utils import pickle

# Build unigram+bigram TF-IDF features for the train/test comment text and
# persist the sparse matrices, targets, fitted vectorizer, and label names.
# Fix: the original never imported numpy/scipy, so np.save and
# sp.sparse.save_npz raised NameError at runtime.
data = pd.read_csv("../Data/train.csv")
comments = data["comment_text"].tolist()

# Capped vocabulary; terms appearing in fewer than 5 documents are dropped.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=200000, min_df=5)
X_train = vectorizer.fit_transform(comments)
sp.sparse.save_npz("Objects/X_train.npz", X_train)

# Columns from index 2 onward are taken as the label columns -- presumably
# the six toxicity labels; verify against train.csv's header.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
Y_train = data.iloc[:, 2:].values
np.save("Objects/Y_train.npy", Y_train)

data = pd.read_csv("../Data/test.csv")
comments = data["comment_text"].tolist()
# Transform only: reuse the vocabulary/IDF weights fitted on the train split.
X_test = vectorizer.transform(comments)
sp.sparse.save_npz("Objects/X_test.npz", X_test)

# NOTE(review): "tdidf" looks like a typo for "tfidf", but the path is kept
# as-is since other scripts may load this exact filename.
pickle.save("Objects/tdidf.tok", vectorizer)

labels = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
pickle.save("Objects/labels.list", labels)