Example #1
    # Excerpt from a classifier class; assumes module-level imports along
    # these lines (CleanText and EncodeText are project-local helpers, so
    # the module paths here are an assumption):
    #   import pandas as pd
    #   from sklearn.model_selection import train_test_split
    #   from clean_text import CleanText
    #   from encode_text import EncodeText
    def _prepare_data(self, data_path, test_size, random_state):
        """Loads the ticket data and prepares it for training.

        Args:
            data_path (str): File path to the pickled DataFrame
            test_size (float): Fraction of the data to hold out as the test set
            random_state (int): Seed for the random train/test split
        """
        ct = CleanText()

        # Load the tickets and drop rows with no labeled issue
        df = pd.read_pickle(data_path)
        df = df[df['issue'] != '']

        # Normalize the raw ticket text before encoding
        df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

        # Per-class weights derived from the label distribution
        weights = self._weights_helper(df['issue'])

        trainLines, trainLabels = df['clean_text'], df['issue']
        labels = pd.get_dummies(trainLabels)  # one-hot encode the issue labels

        # Stratified split keeps the class proportions in both sets
        X_train, X_test, y_train, y_test = train_test_split(
            trainLines,
            labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels)

        # Derive sequence length and vocabulary from the training split only,
        # then encode both splits (test_data=True presumably reuses the
        # vocabulary fit on the training data)
        encoder = EncodeText()
        length = encoder.max_length(X_train)
        vocab_size = encoder.vocab_size(X_train)
        X_train = encoder.encode_text(X_train)
        X_test = encoder.encode_text(X_test, test_data=True)

        # Stash everything the model-building and training steps need
        self.weights = weights
        self.labels = labels
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.length = length
        self.vocab_size = vocab_size
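
A minimal usage sketch of the method above (the TicketClassifier class name
and the file path are illustrative assumptions, not part of the source):

clf = TicketClassifier()  # hypothetical class that defines _prepare_data
clf._prepare_data('tickets.pkl', test_size=0.2, random_state=42)
print(clf.X_train.shape, clf.y_train.shape)  # encoded sequences, one-hot labels

The same pipeline follows as a flat script, restricted to three specific issue
types and using LabelEncoder + to_categorical in place of pd.get_dummies:
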
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import (Input, Embedding, Conv1D, Dropout,
                                     MaxPooling1D, Flatten, Dense)
from tensorflow.keras.models import Model
from tensorflow.keras import metrics

# Project-local helpers used above; the module paths here are an assumption
from clean_text import CleanText
from encode_text import EncodeText

ct = CleanText()
encoder = EncodeText()

# Load the tickets and keep only the three issue types of interest
df = pd.read_pickle(data_path)  # data_path as in the method above
df = df[df.issue.str.contains('cant_add_bank|refund_e_|transactions_not_importing')]

df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

trainLines, trainLabels = df['clean_text'], df['issue']

# Integer-encode the three issue labels, then one-hot them for
# categorical_crossentropy
lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)

X_train, X_test, y_train, y_test = train_test_split(
    trainLines,
    transformed_labels,
    test_size=0.2,
    random_state=42,
    stratify=transformed_labels)

# Fit the tokenizer on the training split, then encode both splits
length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
X_test = encoder.encode_text(X_test, test_data=True)

# 1D CNN over the embedded token sequence
inputs1 = Input(shape=(length,))
embedding1 = Embedding(vocab_size, 100)(inputs1)  # 100-dim learned embeddings
conv1 = Conv1D(filters=32, kernel_size=4, activation='relu')(embedding1)
drop1 = Dropout(0.5)(conv1)
pool1 = MaxPooling1D(pool_size=2)(drop1)
flat1 = Flatten()(pool1)

dense1 = Dense(64, activation='relu')(flat1)
outputs = Dense(3, activation='softmax')(dense1)  # one unit per issue class

model = Model(inputs=inputs1, outputs=outputs)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', metrics.AUC()])
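
From here, training might proceed as sketched below; the epoch count, batch
size, and use of the test split for validation are illustrative choices, not
taken from the source. The class weights computed by _prepare_data could be
passed to the same call via Keras's class_weight argument.

model.fit(X_train, y_train,
          epochs=10,       # illustrative
          batch_size=32,   # illustrative
          validation_data=(X_test, y_test))
loss, acc, auc = model.evaluate(X_test, y_test)  # loss plus the two compiled metrics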