Example #1
    def _prepare_data(self, data_path, test_size, random_state):
        """Loads data and prepares for training

        Args:
            data_path (str): File path to the data
            test_size (float): Percent of the data to use for the test set
            random_state (int): Seed for randomly splitting data for train and test sets
        """
        ct = CleanText()

        # Load the pickled DataFrame and drop rows without a labeled issue
        df = pd.read_pickle(data_path)
        df = df[df['issue'] != '']

        # Clean the raw ticket text for modeling
        df['clean_text'] = df['ticket_text'].apply(
            lambda x: ct.prepare_text(x))

        # Compute class weights from the label distribution
        weights = self._weights_helper(df['issue'])

        # Separate text and labels, then one-hot encode the labels
        trainLines, trainLabels = df['clean_text'], df['issue']
        labels = pd.get_dummies(trainLabels)

        # Stratified split keeps the class distribution consistent across train and test
        X_train, X_test, y_train, y_test = train_test_split(
            trainLines,
            labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels)

        # Fit encoding parameters on the training text, then encode both splits
        encoder = EncodeText()
        length = encoder.max_length(X_train)
        vocab_size = encoder.vocab_size(X_train)
        X_train = encoder.encode_text(X_train)
        X_test = encoder.encode_text(X_test, test_data=True)

        # Store the prepared data and metadata on the instance
        self.weights = weights
        self.labels = labels
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.length = length
        self.vocab_size = vocab_size
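
The _weights_helper method referenced above is not shown in this example. A minimal sketch of what such a helper could look like, assuming it maps each issue label to a weight inversely proportional to its frequency (here via scikit-learn's compute_class_weight; the original class's implementation may differ):

    def _weights_helper(self, labels):
        """Hypothetical sketch: balanced class weights keyed by label."""
        import numpy as np
        from sklearn.utils.class_weight import compute_class_weight

        classes = np.unique(labels)
        weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
        return dict(zip(classes, weights))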
Example #2
from encode_text import EncodeText
from clean_text import CleanText
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, MaxPooling1D, Conv1D, concatenate
from tensorflow.keras import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from numpy import array
import pandas as pd

# Load the pickled ticket DataFrame and set up the text cleaning and encoding helpers
df = pd.read_pickle('./data.pkl')
ct = CleanText()
encoder = EncodeText()

# Keep only the three target issue classes
df = df[df.issue.str.contains('cant_add_bank|refund_e_|transactions_not_importing')]

# Clean the raw ticket text for modeling
df['clean_text'] = df['ticket_text'].apply(lambda x: ct.prepare_text(x))

trainLines, trainLabels = df['clean_text'], df['issue']

# Integer-encode the labels, then convert them to one-hot vectors
lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)

# Stratified 80/20 train/test split, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    trainLines, transformed_labels, test_size=.2, random_state=42, stratify=transformed_labels)

# Fit encoding parameters (max sequence length, vocabulary size) on the training text
length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
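
Example #2 ends before the model is built, but its imports (Conv1D, MaxPooling1D, concatenate) suggest the script goes on to encode X_test and define a convolutional text classifier. A minimal sketch of such a continuation, reusing the imports already at the top of the script and the test_data flag seen in Example #1; the kernel sizes, filter counts, and training settings are illustrative assumptions, not the original script's values:

# Encode the test set with the vocabulary fitted on the training text
X_test = encoder.encode_text(X_test, test_data=True)

# Hypothetical multichannel CNN: one embedding feeding three conv branches with different kernel sizes
inputs = Input(shape=(length,))
embedding = Embedding(vocab_size, 100)(inputs)

channels = []
for kernel_size in (4, 6, 8):
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation='relu')(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    channels.append(Flatten()(pool))

merged = concatenate(channels)
dense = Dense(10, activation='relu')(merged)
outputs = Dense(y_train.shape[1], activation='softmax')(dense)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=[metrics.CategoricalAccuracy()])
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))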