def _prepare_data(self, data_path, test_size, random_state):
    """Load the ticket dataset and build train/test splits for model training.

    Reads a pickled DataFrame, drops rows with an empty ``issue`` label,
    cleans the raw ticket text, one-hot encodes the labels, performs a
    stratified train/test split, and encodes the text sequences.

    Results are stored on the instance: ``weights``, ``labels``,
    ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``length``,
    ``vocab_size``.

    Args:
        data_path (str): File path to the pickled DataFrame.
        test_size (float): Fraction of the data to use for the test set.
        random_state (int): Seed for randomly splitting data for train
            and test sets.
    """
    ct = CleanText()
    df = pd.read_pickle(data_path)
    # .copy() prevents pandas SettingWithCopyWarning (and a possible silent
    # no-op write) when 'clean_text' is assigned onto the filtered frame.
    df = df[df['issue'] != ''].copy()
    # Pass the bound method directly; wrapping it in a lambda adds nothing.
    df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

    # Class weights computed from the label distribution (see _weights_helper).
    weights = self._weights_helper(df['issue'])

    trainLines, trainLabels = df['clean_text'], df['issue']
    labels = pd.get_dummies(trainLabels)

    # Stratify on the one-hot labels so class proportions match in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        trainLines, labels, test_size=test_size,
        random_state=random_state, stratify=labels)

    encoder = EncodeText()
    # Sequence length and vocabulary are fitted on the training split only.
    length = encoder.max_length(X_train)
    vocab_size = encoder.vocab_size(X_train)
    X_train = encoder.encode_text(X_train)
    # test_data=True presumably reuses the tokenizer fitted on X_train —
    # confirm against EncodeText.encode_text.
    X_test = encoder.encode_text(X_test, test_data=True)

    self.weights = weights
    self.labels = labels
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test
    self.length = length
    self.vocab_size = vocab_size
# Data preparation script: load support-ticket data, clean the text,
# encode labels, and build a stratified train/test split.

from numpy import array
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import metrics
from tensorflow.keras.layers import (Input, Dense, Flatten, Dropout,
                                     Embedding, MaxPooling1D, Conv1D,
                                     concatenate)
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

from clean_text import CleanText
from encode_text import EncodeText

df = pd.read_pickle('./data.pkl')

ct = CleanText()
encoder = EncodeText()

# Keep only the three issue categories this model targets.
# .copy() prevents pandas SettingWithCopyWarning (and a possible silent
# no-op write) when 'clean_text' is assigned onto the filtered frame.
df = df[df.issue.str.contains(
    'cant_add_bank|refund_e_|transactions_not_importing')].copy()
# Pass the bound method directly; wrapping it in a lambda adds nothing.
df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

trainLines, trainLabels = df['clean_text'], df['issue']

# Integer-encode the string labels, then one-hot encode them for training.
lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)

# Stratified 80/20 split keeps class proportions equal in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    trainLines, transformed_labels, test_size=.2,
    random_state=42, stratify=transformed_labels)

# Sequence length and vocabulary are fitted on the training split only.
length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)