Beispiel #1
0
 def test_get_bow_and_categories(self):
     """Check the dimensions of the training bag-of-words split.

     Calls ``get_bow_and_categories`` with ``max_features=5000`` and
     expects a training matrix of shape (25000, 5000) together with
     25000 category labels. The second (test) split is unpacked but
     not inspected.
     """
     vocabulary_size = 5000
     expected_rows = 25000

     dataset = Imdb(config.DATASETS_FOLDER)
     train_split, (_, _) = dataset.get_bow_and_categories(
         max_features=vocabulary_size)
     bow_matrix, labels = train_split

     self.assertEqual(bow_matrix.shape, (expected_rows, vocabulary_size))
     self.assertEqual(len(labels), expected_rows)
Beispiel #2
0
    def __init__(self, image_set, year, use_diff=False):
        """Set up a PASCAL VOC image database for one year / image-set pair.

        Args:
            image_set: Image-set name; it is concatenated into the dataset
                name and stored as ``self._image_set``.
            year: Dataset year as a string; it is concatenated into the
                dataset name and into the ``VOC<year>`` data directory.
            use_diff: When true, ``'_diff'`` is appended to the dataset
                name and the flag is recorded in ``self.config['use_diff']``.

        Raises:
            AssertionError: If the devkit directory, or the ``VOC<year>``
                directory inside it, does not exist.
        """
        name = 'voc_' + year + '_' + image_set
        if use_diff:
            name += '_diff'
        Imdb.__init__(self, name)
        self._year = year
        self._image_set = image_set
        self._devkit_path = self._get_default_path()
        self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year)

        # Validate the directories before anything tries to read from them,
        # so a missing devkit is reported with the message below instead of
        # surfacing as a failure while loading the image-set index.
        # `assert` is kept (rather than `raise`) so the exception type seen
        # by existing callers does not change.
        assert os.path.exists(self._devkit_path), \
          'VOCdevkit path does not exist: {}'.format(self._devkit_path)
        assert os.path.exists(self._data_path), \
          'Path does not exist: {}'.format(self._data_path)

        self._classes = (
            '__background__',  # always index 0
            'aeroplane',
            'bicycle',
            'bird',
            'boat',
            'bottle',
            'bus',
            'car',
            'cat',
            'chair',
            'cow',
            'diningtable',
            'dog',
            'horse',
            'motorbike',
            'person',
            'pottedplant',
            'sheep',
            'sofa',
            'train',
            'tvmonitor')
        # Map each class name to its position in the class sequence.
        self._class_to_ind = dict(zip(self.classes, range(self.num_classes)))
        self._image_ext = '.jpg'
        self._image_index = self._load_image_set_index()
        # Default to roidb handler
        self._roidb_handler = self.gt_roidb
        # Random per-instance string; 'use_salt' below is the related option.
        self._salt = str(uuid.uuid4())
        self._comp_id = 'comp4'

        # PASCAL specific config options
        self.config = {
            'cleanup': True,
            'use_salt': True,
            'use_diff': use_diff,
            'matlab_eval': False,
            'rpn_file': None
        }
Beispiel #3
0
    def rpn_roidb(self):
        """Return the RPN roidb, merged with ground truth where it is used.

        For a ``'test'`` image set of any year other than 2007 the RPN
        roidb is loaded with ``None`` in place of ground truth. In every
        other case the ground-truth roidb is loaded, passed to the RPN
        loader, and the two are combined with ``Imdb.merge_roidbs``.
        """
        # The year is converted before the image set is inspected, so a
        # non-numeric year raises regardless of the image set.
        if int(self._year) != 2007 and self._image_set == 'test':
            return self._load_rpn_roidb(None)

        ground_truth = self.gt_roidb()
        proposals = self._load_rpn_roidb(ground_truth)
        return Imdb.merge_roidbs(ground_truth, proposals)
Beispiel #4
0
# Accuracy: 
    0.76628 (with 5k words)
    0.80664 (with 50k words)
    0.81732 (with 250k words)
    0.81828 (with 500k words)
    0.81892 (with entire vocab)
"""

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import config
from datasets.imdb import Imdb

# Load the train and test splits from the fastText-based loader
print('Loading dataset...', flush=True)
dataset = Imdb(config.DATASETS_FOLDER)
train_split, test_split = dataset.get_bof_fasttext_wiki_news_300d_1M()
train_features, train_labels = train_split
test_features, test_labels = test_split

# Fit a logistic regression classifier with default settings
print('Training model...', flush=True)
classifier = LogisticRegression()
classifier.fit(train_features, train_labels)

# Evaluate on the held-out split and report the accuracy
predicted_labels = classifier.predict(test_features)
acc = accuracy_score(test_labels, predicted_labels)
print(f'Accuracy: {acc}', flush=True)
Beispiel #5
0
"""

from torch import FloatTensor as T
from torch.autograd import Variable as V
from torch.nn import CrossEntropyLoss
from torch import nn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

import config
from datasets.imdb import Imdb

# Load the bag-of-words splits, limited to 5000 features
print('Loading dataset...')
imdb = Imdb(config.DATASETS_FOLDER)
train_split, test_split = imdb.get_bow_and_categories(max_features=5000)
train_x_bow, train_y = train_split
test_x_bow, test_y = test_split

# Wrap the data in torch Variables that do not track gradients.
# The feature matrices are densified with .toarray() first; the labels
# are converted to long after being wrapped as float tensors.
train_x_bow = V(T(train_x_bow.toarray()), requires_grad=False)
test_x_bow = V(T(test_x_bow.toarray()), requires_grad=False)
train_y = V(T(train_y), requires_grad=False).long()
test_y = V(T(test_y), requires_grad=False).long()

# Mean and standard deviation of the training set along dimension 0
train_mean, train_std = train_x_bow.mean(0), train_x_bow.std(0)

# Normalize train and test sets
Beispiel #6
0
 def test_get_texts_and_categories(self):
     """Check the size of the training texts and their category labels.

     Expects ``get_texts_and_categories`` to return 25000 training
     texts and 25000 matching category labels. The second (test) split
     is unpacked but not inspected.
     """
     expected_count = 25000

     dataset = Imdb(config.DATASETS_FOLDER)
     train_split, (_, _) = dataset.get_texts_and_categories()
     texts, labels = train_split

     self.assertEqual(len(texts), expected_count)
     self.assertEqual(len(labels), expected_count)
Beispiel #7
0
# Accuracy
  0.85144 (with 5k words)
  0.87032 (with the entire vocab)

"""

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import config
from datasets.imdb import Imdb

# Load the bag-of-words splits, limited to 5000 features
print('Loading dataset...')
dataset = Imdb(config.DATASETS_FOLDER)
train_split, test_split = dataset.get_bow_and_categories(max_features=5000)
train_features, train_labels = train_split
test_features, test_labels = test_split

# Fit a logistic regression classifier with default settings
print('Training model...')
classifier = LogisticRegression()
classifier.fit(train_features, train_labels)

# Evaluate on the held-out split and report the accuracy
predicted_labels = classifier.predict(test_features)
acc = accuracy_score(test_labels, predicted_labels)
print(f'Accuracy: {acc}')