def train_n_classes(self):
    """Train and evaluate a multi-class Naive Bayes text classifier at several data sizes.

    For each per-class sample size (100, 200, 300) the method:

    1. prints the nominal train/test sizes,
    2. reloads the data with ``self.loadData(size)`` and splits it with
       ``self.train_and_test_split(0.75)``,
    3. maps every class name in ``self.data['class_name']`` to an integer id
       in order of first appearance,
    4. fits a TF-IDF vectorizer on the training texts and a ``MultinomialNB``
       on the resulting matrix,
    5. hands the true and predicted test labels to ``self.metric``.

    Returns nothing; results are reported through ``print`` and ``self.metric``.
    """
    stop_words = LoadData().loadStopWords()

    for per_class in (100, 200, 300):  # size per class
        # NOTE(review): the factor 5 is presumably the number of classes in
        # the data set -- confirm against loadData.
        nominal_train = math.floor(per_class * 0.75) * 5
        nominal_test = math.ceil(per_class * 0.25) * 5
        print('Training size:', nominal_train, 'Test size:', nominal_test)

        self.loadData(per_class)
        vectorizer = TfidfVectorizer(stop_words=stop_words)
        self.train_and_test_split(0.75)

        # Assign consecutive integer ids to class names as they first appear.
        label_ids = {}
        for name in self.data['class_name']:
            label_ids.setdefault(name, len(label_ids))

        # The vocabulary is learned from the training texts only; the test
        # texts are projected onto it.
        train_matrix = vectorizer.fit_transform(self.train_data['data'])
        train_labels = [label_ids[name] for name in self.train_data['class_name']]
        test_matrix = vectorizer.transform(self.test_data['data'])
        test_labels = [label_ids[name] for name in self.test_data['class_name']]

        classifier = MultinomialNB()
        classifier.fit(train_matrix, train_labels)
        predictions = classifier.predict(test_matrix)

        self.metric(test_labels, predictions)
        print('---------------------------------------------------')
def train(self, csv_path='bbc-text.csv', split_ratio=0.75):
    """Train and evaluate a Naive Bayes text classifier on a CSV data set.

    Loads the data with ``self.loadDataCSV``, splits it with
    ``self.train_and_test_split``, fits a TF-IDF vectorizer on the training
    texts and a ``MultinomialNB`` on the resulting matrix, then passes the
    true and predicted test labels to ``self.metric``. Class names are used
    as labels directly, without integer encoding.

    Args:
        csv_path: Path handed unchanged to ``self.loadDataCSV``. Defaults to
            ``'bbc-text.csv'``, the file this method previously hard-coded.
        split_ratio: Value handed unchanged to ``self.train_and_test_split``.
            Defaults to ``0.75``, the previously hard-coded value; presumably
            the fraction of rows used for training -- confirm against
            ``train_and_test_split``.

    Returns nothing; results are reported through ``self.metric``.
    """
    stop_words = LoadData().loadStopWords()

    self.loadDataCSV(csv_path)
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    self.train_and_test_split(split_ratio)

    # The vocabulary is learned from the training texts only; the test texts
    # are projected onto it.
    train_matrix = vectorizer.fit_transform(self.train_data['data'])
    train_labels = self.train_data['class_name']
    test_matrix = vectorizer.transform(self.test_data['data'])
    test_labels = self.test_data['class_name']

    classifier = MultinomialNB()
    predictions = classifier.fit(train_matrix, train_labels).predict(test_matrix)
    self.metric(test_labels, predictions)
def train_1_class(self):
    """Train and evaluate a binary ('business' vs. rest) Naive Bayes classifier.

    For each per-class sample size (100, 200) the method:

    1. prints the nominal train/test sizes,
    2. reloads the data with ``self.loadData(size)``,
    3. keeps the first ``size`` rows and replaces everything after them with
       ``size`` rows drawn at random from that tail, so the data set holds
       ``2 * size`` rows,
    4. splits with ``self.train_and_test_split(0.75)``, fits a TF-IDF
       vectorizer and a ``MultinomialNB`` on labels encoded as
       1 (``'business'``) / 0 (anything else),
    5. hands the true and predicted test labels to ``self.metric``.

    Assumes the first ``size`` rows produced by ``self.loadData`` are the
    ``'business'`` class and the remaining rows belong to the other classes
    -- TODO confirm against ``loadData``.

    Returns nothing; results are reported through ``print`` and ``self.metric``.
    """
    # Function-scope import keeps this block self-contained.
    import random

    positive_label = 'business'
    stop_words = LoadData().loadStopWords()
    train_sizes = [100, 200]  # size per class

    for train_size in train_sizes:
        print('Training size:', math.floor(train_size * 0.75) * 2,
              'Test size:', math.ceil(train_size * 0.25) * 2)

        self.loadData(train_size)
        vectorizer = TfidfVectorizer(stop_words=stop_words)

        # balance classes
        pool_labels = self.data['class_name'][train_size:]
        pool_docs = self.data['data'][train_size:]
        pool_size = len(pool_labels)
        if pool_size >= train_size:
            # Draw WITHOUT replacement: a document picked twice could land in
            # both the training and the test split and inflate the metrics.
            picked = random.sample(range(pool_size), train_size)
        else:
            # Too few rows to fill the quota without repeats; keep the
            # previous with-replacement behavior (still raises on an empty
            # pool, as before).
            picked = random.choices(range(pool_size), k=train_size)
        sampled_labels = [pool_labels[i] for i in picked]
        sampled_docs = [pool_docs[i] for i in picked]

        # Replace the tail in place so self.data keeps the same list objects.
        del self.data['data'][train_size:]
        del self.data['class_name'][train_size:]
        self.data['class_name'].extend(sampled_labels)
        self.data['data'].extend(sampled_docs)

        self.train_and_test_split(0.75)

        # The vocabulary is learned from the training texts only; the test
        # texts are projected onto it.
        train_matrix = vectorizer.fit_transform(self.train_data['data'])
        train_labels = [
            1 if name == positive_label else 0
            for name in self.train_data['class_name']
        ]
        test_matrix = vectorizer.transform(self.test_data['data'])
        test_labels = [
            1 if name == positive_label else 0
            for name in self.test_data['class_name']
        ]

        classifier = MultinomialNB()
        predictions = classifier.fit(train_matrix, train_labels).predict(test_matrix)
        self.metric(test_labels, predictions)
        print('---------------------------------------------------')