Ejemplo n.º 1
0
 def train_n_classes(self):
     """Fit and evaluate Naive Bayes on TF-IDF features for several sizes.

     For each per-class size the data is reloaded through
     ``self.loadData``, split with ``self.train_and_test_split(0.75)``,
     and class names are mapped to integer ids in order of first
     appearance. A ``MultinomialNB`` model is fitted on the TF-IDF
     matrix of the training texts, and the true and predicted test
     labels are handed to ``self.metric``.

     NOTE(review): the sizes printed up front multiply by 5, presumably
     the number of classes returned by ``loadData`` -- confirm.
     """
     loader = LoadData()
     stop_words = loader.loadStopWords()
     for per_class in (100, 200, 300):  # size per class
         n_train = math.floor(per_class * 0.75) * 5
         n_test = math.ceil(per_class * 0.25) * 5
         print('Training size:', n_train, 'Test size:', n_test)
         self.loadData(per_class)
         vectorizer = TfidfVectorizer(stop_words=stop_words)
         self.train_and_test_split(0.75)

         # Assign consecutive integer ids in order of first appearance.
         label_ids = {}
         for name in self.data['class_name']:
             label_ids.setdefault(name, len(label_ids))

         # The vocabulary is learned from the training texts only.
         features_train = vectorizer.fit_transform(self.train_data['data'])
         labels_train = [
             label_ids[name] for name in self.train_data['class_name']
         ]
         features_test = vectorizer.transform(self.test_data['data'])
         labels_test = [
             label_ids[name] for name in self.test_data['class_name']
         ]

         model = MultinomialNB()
         fitted = model.fit(features_train, labels_train)
         predicted = fitted.predict(features_test)
         self.metric(labels_test, predicted)
         print('---------------------------------------------------')
Ejemplo n.º 2
0
 def train(self):
     """Fit and evaluate Naive Bayes on the ``bbc-text.csv`` data set.

     Loads the CSV through ``self.loadDataCSV``, splits it with
     ``self.train_and_test_split(0.75)``, and turns the texts into
     TF-IDF features using the stop words from ``LoadData``. A
     ``MultinomialNB`` model is fitted with the raw class names as
     labels, and the true and predicted test labels are passed to
     ``self.metric``.
     """
     loader = LoadData()
     stop_words = loader.loadStopWords()
     self.loadDataCSV('bbc-text.csv')
     vectorizer = TfidfVectorizer(stop_words=stop_words)
     self.train_and_test_split(0.75)

     # The vocabulary is learned from the training texts only.
     train_part = self.train_data
     features_train = vectorizer.fit_transform(train_part['data'])
     labels_train = train_part['class_name']

     test_part = self.test_data
     features_test = vectorizer.transform(test_part['data'])
     labels_test = test_part['class_name']

     model = MultinomialNB()
     fitted = model.fit(features_train, labels_train)
     predicted = fitted.predict(features_test)
     self.metric(labels_test, predicted)
Ejemplo n.º 3
0
    def train_1_class(self):
        """Fit and evaluate a binary 'business' vs. rest Naive Bayes model.

        For each per-class size the data is reloaded through
        ``self.loadData``. The first ``train_size`` entries are kept
        as-is, and the remaining entries are reduced to a random subset
        of ``train_size`` items so both groups have the same size. The
        balanced data is split with ``self.train_and_test_split(0.75)``
        and converted to TF-IDF features. A ``MultinomialNB`` model is
        fitted with label 1 for ``'business'`` and 0 otherwise, and the
        true and predicted test labels are passed to ``self.metric``.

        NOTE(review): this assumes the first ``train_size`` entries of
        ``self.data`` are the 'business' documents -- confirm against
        ``loadData``.
        """
        loader = LoadData()
        stop_words = loader.loadStopWords()
        train_sizes = [100, 200]  # size per class
        for train_size in train_sizes:
            print('Training size:',
                  math.floor(train_size * 0.75) * 2, 'Test size:',
                  math.ceil(train_size * 0.25) * 2)
            self.loadData(train_size)
            vect = TfidfVectorizer(stop_words=stop_words)

            # Balance classes: keep the first `train_size` entries and
            # draw `train_size` entries from everything after them.
            rest_labels = self.data['class_name'][train_size:]
            rest_docs = self.data['data'][train_size:]
            pool = range(len(rest_labels))
            if len(pool) >= train_size:
                # Sample WITHOUT replacement so no document is duplicated;
                # duplicates could land in both the train and test split
                # and inflate the reported scores.
                picked = random.sample(pool, k=train_size)
            else:
                # Not enough distinct items: fall back to sampling with
                # replacement, as the original implementation always did.
                picked = random.choices(pool, k=train_size)
            sampled_labels = [rest_labels[i] for i in picked]
            sampled_docs = [rest_docs[i] for i in picked]

            # Replace the tail in place so the existing list objects
            # held in self.data are the ones that get updated.
            self.data['data'][train_size:] = sampled_docs
            self.data['class_name'][train_size:] = sampled_labels

            self.train_and_test_split(0.75)
            X_train = vect.fit_transform(self.train_data['data'])
            Y_train = [
                1 if name == 'business' else 0
                for name in self.train_data['class_name']
            ]
            X_test = vect.transform(self.test_data['data'])
            Y_test = [
                1 if name == 'business' else 0
                for name in self.test_data['class_name']
            ]
            nb = MultinomialNB()
            Y_pred = nb.fit(X_train, Y_train).predict(X_test)
            self.metric(Y_test, Y_pred)
            print('---------------------------------------------------')