def _prepare_data(self, data_path, test_size, random_state):
    """Load the ticket dataset and build encoded train/test splits.

    Reads a pickled DataFrame, drops unlabelled rows, cleans the ticket
    text, one-hot encodes the labels, performs a stratified train/test
    split, and fits/applies the text encoder. Results are stored as
    attributes on ``self`` rather than returned.

    Args:
        data_path (str): File path to the pickled data.
        test_size (float): Fraction of the data held out for the test set.
        random_state (int): Seed for the reproducible train/test split.
    """
    cleaner = CleanText()
    frame = pd.read_pickle(data_path)
    # Keep only rows that actually carry an issue label.
    frame = frame[frame['issue'] != '']
    frame['clean_text'] = frame['ticket_text'].apply(cleaner.prepare_text)

    # Class weights are computed on the full (pre-split) label column.
    class_weights = self._weights_helper(frame['issue'])
    texts, raw_labels = frame['clean_text'], frame['issue']
    one_hot = pd.get_dummies(raw_labels)

    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        one_hot,
        test_size=test_size,
        random_state=random_state,
        stratify=one_hot)

    # Fit encoder vocabulary/length on the training split only; the test
    # split is encoded with test_data=True so it reuses the fitted state.
    encoder = EncodeText()
    seq_length = encoder.max_length(X_train)
    vocab = encoder.vocab_size(X_train)
    X_train = encoder.encode_text(X_train)
    X_test = encoder.encode_text(X_test, test_data=True)

    self.weights = class_weights
    self.labels = one_hot
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test
    self.length = seq_length
    self.vocab_size = vocab
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, MaxPooling1D, Conv1D, concatenate
from tensorflow.keras import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from numpy import array
import pandas as pd

# Load the raw tickets and restrict to the three issue classes of interest.
df = pd.read_pickle('./data.pkl')
ct = CleanText()
encoder = EncodeText()
df = df[df.issue.str.contains('cant_add_bank|refund_e_|transactions_not_importing')]
df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

# Integer-encode the string labels, then expand to one-hot vectors.
trainLines, trainLabels = df['clean_text'], df['issue']
lb = LabelEncoder()
transformed_labels = to_categorical(lb.fit_transform(trainLabels))

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    trainLines,
    transformed_labels,
    test_size=0.2,
    random_state=42,
    stratify=transformed_labels)

# Fit sequence length and vocabulary on the training split only; encode
# the test split against that fitted state (test_data=True).
length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
X_test = encoder.encode_text(X_test, test_data=True)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
import numpy as np
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.layers import Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import AUC
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight

# Load the raw tickets and add a cleaned-text column.
# NOTE(review): `pd`, `CleanText`, and `Tokenizer` are assumed to be in
# scope from elsewhere in this file — confirm against the full module.
df = pd.read_pickle('./data.pkl')
clean = CleanText()
df['clean_text'] = df['ticket_text'].apply(lambda x: clean.prepare_text(x))


def create_tokenizer(lines):
    """Fit and return a Keras Tokenizer over the given text lines."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def max_length(lines):
    """Return the word count of the longest line (whitespace-tokenized)."""
    return max(len(s.split()) for s in lines)


def encode_text(tokenizer, lines, length):
    """Encode text lines as fixed-length integer sequences.

    Args:
        tokenizer: A fitted Keras Tokenizer.
        lines: Iterable of text strings to encode.
        length (int): Target sequence length; shorter sequences are
            post-padded with zeros.

    Returns:
        2-D array of shape (len(lines), length).
    """
    encoded = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    # Fix: the padded sequences were computed but never returned, so
    # every caller received None.
    return padded
# Restore the fitted text encoder and its saved configuration, then
# re-encode the held-out test split against that state.
# NOTE(review): `encoder`, `X_test`, `y_test`, `load_model`, and `np` are
# assumed to be defined earlier in this file — confirm against the full module.
encoder.load_encoder('./encoder_files/encoder.pkl')
encoder.load_encoder_variables('./encoder_files/encoder_variables.json')
X_test = encoder.encode_text(X_test, test_data=True)

# Load the three trained classifiers from disk.
cnn = load_model('./model_files/cnn_classification_model.h5')
rnn = load_model('./model_files/rnn_classification_model.h5')
hybrid = load_model('./model_files/hybrid_attention_classification_model.h5')

# Smoke-test each model on a single hand-written ticket: clean it, encode
# it, and look up the predicted class label.
clean = CleanText()
test_text = ['''I cant get my morgan stanley account to connect to EveryDollar. If I cant get it to connect, Im going to need to get a refund. Its the only value I get from the app''']
tt = [clean.prepare_text(t) for t in test_text]
tt = encoder.encode_text(tt, test_data=True)

# The bare indexing expressions below are notebook-style output cells:
# they display the predicted column name interactively but have no effect
# when this is run as a plain script.
cnn_res = cnn.predict(tt)
y_test.columns[np.argmax(cnn_res)]
rnn_res = rnn.predict(tt)
y_test.columns[np.argmax(rnn_res)]
hybrid_res = hybrid.predict(tt)
y_test.columns[np.argmax(hybrid_res)]

# Score the CNN on the full test set and binarize with a 0.5 threshold.
cnn_res = cnn.predict(X_test)
cnn_res_t = (cnn_res > .5)