def _prepare_data(self, data_path, test_size, random_state):
    """Load the ticket dataset and build encoded train/test splits.

    Reads a pickled DataFrame, drops rows with an empty 'issue' label,
    cleans the raw ticket text, one-hot encodes the labels, and produces
    stratified, integer-encoded train/test splits. Nothing is returned;
    results are stored on the instance: weights, labels, X_train, X_test,
    y_train, y_test, length, vocab_size.

    Args:
        data_path (str): File path to the pickled DataFrame; must contain
            'ticket_text' and 'issue' columns.
        test_size (float): Fraction of the data to hold out for the test set.
        random_state (int): Seed for the reproducible train/test split.
    """
    ct = CleanText()
    df = pd.read_pickle(data_path)
    # Rows without a label cannot be used for supervised training.
    df = df[df['issue'] != '']
    # Pass the bound method directly — no lambda wrapper needed.
    df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)
    # Class weights are computed before splitting so they reflect the full
    # label distribution. (_weights_helper is defined elsewhere in this class.)
    weights = self._weights_helper(df['issue'])
    train_lines, train_labels = df['clean_text'], df['issue']
    labels = pd.get_dummies(train_labels)
    # Stratify on the one-hot labels to keep class balance in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        train_lines, labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels)
    encoder = EncodeText()
    # Fit sequence length and vocabulary on the training split only, then
    # encode the test split with test_data=True to avoid data leakage.
    length = encoder.max_length(X_train)
    vocab_size = encoder.vocab_size(X_train)
    X_train = encoder.encode_text(X_train)
    X_test = encoder.encode_text(X_test, test_data=True)
    self.weights = weights
    self.labels = labels
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test
    self.length = length
    self.vocab_size = vocab_size
from encode_text import EncodeText
from clean_text import CleanText
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, MaxPooling1D, Conv1D, concatenate
from tensorflow.keras import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from numpy import array
import pandas as pd

# Load the raw ticket data and keep only the three issue categories this
# model is trained to classify.
df = pd.read_pickle('./data.pkl')

ct = CleanText()
encoder = EncodeText()

df = df[df.issue.str.contains('cant_add_bank|refund_e_|transactions_not_importing')]
# Pass the bound method directly — no lambda wrapper needed.
df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

trainLines, trainLabels = df['clean_text'], df['issue']

# Integer-encode the string labels, then one-hot encode them for the
# softmax output layer.
lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)

# Stratified split keeps the class balance in both train and test sets.
X_train, X_test, y_train, y_test = train_test_split(trainLines,
                                                    transformed_labels,
                                                    test_size=.2,
                                                    random_state=42,
                                                    stratify=transformed_labels)

# Fit sequence length and vocabulary on the training split only so no
# test-set statistics leak into the encoder.
length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
# FIX: FastAPI, BaseModel and load_model were used below but never imported,
# which raises NameError the moment this module is loaded.
from fastapi import FastAPI
from pydantic import BaseModel
from tensorflow.keras.models import load_model
from clean_text import CleanText
from encode_text import EncodeText
import uvicorn
import numpy as np
import pickle

app = FastAPI()


class StockIn(BaseModel):
    # Request payload: the raw support-ticket text to classify.
    ticket_text: str


class StockOut(BaseModel):
    # Response payload: the predicted issue label.
    result: str


# encode text class — restore the vocabulary/length fitted at training time
encoder = EncodeText()
encoder.load_encoder_variables('./encoder_variables.json')

# clean text class
ct = CleanText()

# labels
# NOTE: pickle.load is only acceptable here because this file is produced by
# our own training pipeline — never unpickle untrusted data.
label_binarizer_path = './label_binarizer.pkl'
with open(label_binarizer_path, 'rb') as handle:
    lb = pickle.load(handle)
labels = list(lb.classes_)

model = load_model('./classification_model.h5')
import pandas as pd
from numpy import array
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from clean_text import CleanText
from encode_text import EncodeText
from conv_net import ConvNet

# Load the pickled ticket DataFrame and clean the raw text.
df = pd.read_pickle('./data.pkl')

ct = CleanText()
encoder = EncodeText()

# Pass the bound method directly — no lambda wrapper needed.
df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

model = ConvNet()

trainLines, trainLabels = df['clean_text'], df['issue']

# One-hot encode the string labels for the softmax output layer.
lb = LabelBinarizer()
transformed_labels = lb.fit_transform(trainLabels)

# Stratified split preserves class balance across train and test sets.
X_train, X_test, y_train, y_test = train_test_split(trainLines,
                                                    transformed_labels,
                                                    test_size=.2,
                                                    random_state=42,
                                                    stratify=transformed_labels)

# Fit sequence length and vocabulary on the training split only, then encode
# the test split with test_data=True to avoid leakage.
length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
X_test = encoder.encode_text(X_test, test_data=True)

encoder.save_encoder('./encoder.pkl')
# NOTE(review): the serving code loads './encoder_variables.json' — confirm
# that save_encoder_variables appends the '.json' extension to this path.
encoder.save_encoder_variables('./encoder_variables')
from tensorflow.keras.models import load_model
import pandas as pd
from clean_text import CleanText
from encode_text import EncodeText
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score

# Load the held-out test set: 'clean_text' is the model input, and every
# other column is treated as part of the label matrix.
df = pd.read_pickle('./test_data.pkl')
X_test, y_test = df['clean_text'], df.loc[:, df.columns != 'clean_text']

# Restore the encoder fitted at training time so the test text is encoded
# with the same vocabulary and sequence length as the training data.
encoder = EncodeText()
encoder.load_encoder('./encoder_files/encoder.pkl')
encoder.load_encoder_variables('./encoder_files/encoder_variables.json')
X_test = encoder.encode_text(X_test, test_data=True)

# Load the three trained classifiers to be compared against each other.
cnn = load_model('./model_files/cnn_classification_model.h5')
rnn = load_model('./model_files/rnn_classification_model.h5')
hybrid = load_model('./model_files/hybrid_attention_classification_model.h5')

clean = CleanText()

# Smoke-test input: one raw ticket run through the same clean -> encode
# pipeline used at training time, then scored by the CNN model.
test_text = ['''I cant get my morgan stanley account to connect to EveryDollar. If I cant get it to connect, Im going to need to get a refund. Its the only value I get from the app''']
tt = [clean.prepare_text(t) for t in test_text]
tt = encoder.encode_text(tt, test_data=True)
cnn_res = cnn.predict(tt)