Example #1
    def _prepare_data(self, data_path, test_size, random_state):
        """Loads data and prepares for training

        Args:
            data_path (str): File path to the data
            test_size (float): Percent of the data to use for the test set
            random_state (int): Seed for randomly splitting data for train and test sets
        """
        ct = CleanText()

        # Load the pickled tickets and drop rows without a labeled issue
        df = pd.read_pickle(data_path)
        df = df[df['issue'] != '']

        df['clean_text'] = df['ticket_text'].apply(
            lambda x: ct.prepare_text(x))

        # Class weights to offset label imbalance
        weights = self._weights_helper(df['issue'])

        trainLines, trainLabels = df['clean_text'], df['issue']
        labels = pd.get_dummies(trainLabels)

        X_train, X_test, y_train, y_test = train_test_split(
            trainLines,
            labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels)

        # Fit the encoder on the training text only, then apply it to the test text
        encoder = EncodeText()
        length = encoder.max_length(X_train)
        vocab_size = encoder.vocab_size(X_train)
        X_train = encoder.encode_text(X_train)
        X_test = encoder.encode_text(X_test, test_data=True)

        self.weights = weights
        self.labels = labels
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.length = length
        self.vocab_size = vocab_size
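
The `_weights_helper` method referenced above is not shown in this example. The following is a minimal sketch of what such a helper might return, assuming sklearn-style balanced class weights keyed by class index; the implementation is an assumption, not the project's own code.

    def _weights_helper(self, labels):
        """Hypothetical sketch: balanced class weights keyed by class index."""
        from sklearn.utils.class_weight import compute_class_weight
        import numpy as np

        classes = np.unique(labels)
        weights = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
        # Keras' class_weight argument expects a {class_index: weight} mapping
        return dict(enumerate(weights))
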
Example #2
from encode_text import EncodeText
from clean_text import CleanText
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, MaxPooling1D, Conv1D, concatenate
from tensorflow.keras import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from numpy import array
import pandas as pd

df = pd.read_pickle('./data.pkl')
ct = CleanText()
encoder = EncodeText()

df = df[df.issue.str.contains('cant_add_bank|refund_e_|transactions_not_importing')]

df['clean_text'] = df['ticket_text'].apply(lambda x: ct.prepare_text(x))

trainLines, trainLabels = df['clean_text'], df['issue']

lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)

X_train, X_test, y_train, y_test = train_test_split(
    trainLines, transformed_labels, test_size=0.2, random_state=42, stratify=transformed_labels)


length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
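
The snippet stops after encoding the training text and never uses the Keras layer imports. Below is a minimal sketch of how the pipeline might continue; the embedding dimension, filter count, kernel size, dropout rate, epochs, and batch size are assumptions rather than the original project's values.

X_test = encoder.encode_text(X_test, test_data=True)

# Simple single-channel Conv1D classifier over the encoded tickets
inputs = Input(shape=(length,))
embedding = Embedding(vocab_size, 100)(inputs)
conv = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding)
pool = MaxPooling1D(pool_size=2)(conv)
flat = Flatten()(pool)
drop = Dropout(0.5)(flat)
outputs = Dense(transformed_labels.shape[1], activation='softmax')(drop)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))
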
Example #3
from fastapi import FastAPI
from pydantic import BaseModel
from tensorflow.keras.models import load_model
from clean_text import CleanText
from encode_text import EncodeText
import uvicorn
import numpy as np
import pickle

app = FastAPI()

class StockIn(BaseModel):
    ticket_text: str

class StockOut(BaseModel):
    result: str

# Text encoder, restored from its saved settings
encoder = EncodeText()

encoder.load_encoder_variables('./encoder_variables.json')

# Text cleaner
ct = CleanText()

# Label binarizer and its class names
label_binarizer_path = './label_binarizer.pkl'
with open(label_binarizer_path, 'rb') as handle:
    lb = pickle.load(handle)

labels = list(lb.classes_)

model = load_model('./classification_model.h5')
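
The example ends after loading the artifacts, without a route that serves predictions. The sketch below ties the pieces together using the request/response models defined above; the `/predict` path, the host/port, and the exact response shape are assumptions.

@app.post('/predict', response_model=StockOut)
def predict(payload: StockIn):
    # Clean and encode the incoming ticket the same way as at training time
    text = ct.prepare_text(payload.ticket_text)
    encoded = encoder.encode_text([text], test_data=True)
    # Map the highest-probability class index back to its label name
    probs = model.predict(encoded)
    result = labels[int(np.argmax(probs, axis=1)[0])]
    return StockOut(result=result)

if __name__ == '__main__':
    uvicorn.run(app, host='0.0.0.0', port=8000)
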
Example #4
import pandas as pd
from numpy import array
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from clean_text import CleanText
from encode_text import EncodeText
from conv_net import ConvNet

df = pd.read_pickle('./data.pkl')
ct = CleanText()
encoder = EncodeText()

df['clean_text'] = df['ticket_text'].apply(lambda x: ct.prepare_text(x))

model = ConvNet()

trainLines, trainLabels = df['clean_text'], df['issue']

lb = LabelBinarizer()
transformed_labels = lb.fit_transform(trainLabels)

X_train, X_test, y_train, y_test = train_test_split(
    trainLines, transformed_labels, test_size=0.2, random_state=42, stratify=transformed_labels)


length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
X_test = encoder.encode_text(X_test, test_data=True)

encoder.save_encoder('./encoder.pkl')
encoder.save_encoder_variables('./encoder_variables')
Example #5
from tensorflow.keras.models import load_model
import pandas as pd
from clean_text import CleanText
from encode_text import EncodeText
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score

df = pd.read_pickle('./test_data.pkl')

X_test, y_test = df['clean_text'], df.loc[:, df.columns != 'clean_text']

encoder = EncodeText()

encoder.load_encoder('./encoder_files/encoder.pkl')
encoder.load_encoder_variables('./encoder_files/encoder_variables.json')

X_test = encoder.encode_text(X_test, test_data=True)

cnn = load_model('./model_files/cnn_classification_model.h5')
rnn = load_model('./model_files/rnn_classification_model.h5')
hybrid = load_model('./model_files/hybrid_attention_classification_model.h5')

clean = CleanText()

test_text = ['''I cant get my morgan stanley account to connect to EveryDollar. If I cant get it to connect, 
    Im going to need to get a refund. Its the only value I get from the app''']

tt = [clean.prepare_text(t) for t in test_text]
tt = encoder.encode_text(tt, test_data=True)

cnn_res = cnn.predict(tt)
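
The script only runs the CNN on a single hand-written ticket and never uses the imported metrics. A minimal sketch of scoring the three loaded models on the held-out set follows, assuming the non-text columns of test_data.pkl are one-hot label indicators; the weighted averaging is an assumption.

for name, m in [('cnn', cnn), ('rnn', rnn), ('hybrid', hybrid)]:
    probs = m.predict(X_test)
    preds = np.argmax(probs, axis=1)
    true = np.argmax(y_test.values, axis=1)
    print(name,
          'f1:', f1_score(true, preds, average='weighted'),
          'roc_auc:', roc_auc_score(y_test.values, probs, average='weighted'))

# Map the single-ticket prediction back to a label name
print(y_test.columns[int(np.argmax(cnn_res, axis=1)[0])])
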