def _preprocess_test_data(self):
        """Write the sentiment-annotated copy of the test set if it is missing.

        The target file is ``cheatTestData<ext>`` (resolved through
        ``data_path``). When that file already exists nothing is done.
        Otherwise ``testData<ext>`` is loaded, a ``sentiment`` column is
        derived by mapping every ``id`` through ``sentiment_from_id``, and the
        result is saved as CSV with a header row and without the index.

        Returns:
            None.
        """
        output_file = data_path('cheatTestData{}'.format(csv_extension()))
        if not os.path.isfile(output_file):
            source_name = 'testData{}'.format(csv_extension())
            frame = self._load_data(source_name)
            review_ids = frame['id']
            frame['sentiment'] = review_ids.map(sentiment_from_id)
            frame.to_csv(output_file, index=False, header=True)
    def _preprocess_test_data(self):
        """Create ``cheatTestData<ext>`` from ``testData<ext>`` when absent.

        If the output file is already on disk the method returns immediately.
        Otherwise the test data is loaded via ``self._load_data``, each row's
        ``id`` is passed through ``sentiment_from_id`` to fill a new
        ``sentiment`` column, and the frame is written out as CSV (header
        included, index omitted).

        Returns:
            None.
        """
        destination = data_path('cheatTestData{}'.format(csv_extension()))
        already_built = os.path.isfile(destination)
        if already_built:
            return

        raw_name = 'testData{}'.format(csv_extension())
        labelled = self._load_data(raw_name)
        labelled['sentiment'] = labelled['id'].map(sentiment_from_id)
        labelled.to_csv(destination, header=True, index=False)
 def _load_all_data(self, review_group):
     """Load one review group from ``alldata-id.txt`` into a DataFrame.

     Every line of the file is one review. The first whitespace-separated
     token of a line is dropped (presumably a document tag -- confirm against
     the file producer) and the remaining tokens are re-joined with single
     spaces to form the review text. Group and label are derived purely from
     the zero-based line number: consecutive blocks of 25000 lines are
     training, testing, unlabeled, unlabeled; consecutive blocks of 12500
     lines get sentiment 1, 0, 1, 0, 0, 0, 0, 0.

     Args:
         review_group: ``'training'``, ``'testing'`` or ``'unlabeled'``.

     Returns:
         pandas.DataFrame built from records with keys ``id`` (1-based line
         number), ``sentiment`` and ``review``, restricted to
         ``review_group``.

     Raises:
         KeyError: if ``review_group`` is not one of the known groups.
         IndexError: if the file holds more than 100000 lines.
     """
     # Local import keeps this block self-contained; io.open accepts
     # ``encoding`` on both Python 2 and Python 3.
     import io

     split_names = ('training', 'testing', 'unlabeled', 'unlabeled')
     labels = (1, 0, 1, 0, 0, 0, 0, 0)
     if review_group not in split_names:
         raise KeyError(review_group)

     records = []
     # Decode explicitly as UTF-8 (the encoding to_unicode assumes) instead of
     # relying on the platform's default locale encoding.
     with io.open(data_path('alldata-id.txt'), encoding='utf-8') as lines:
         for line_no, line in enumerate(lines):
             split = split_names[line_no // 25000]
             if split != review_group:
                 # Only the requested group is returned, so skip the rest
                 # rather than materialising every review in memory.
                 continue
             words = gensim.utils.to_unicode(line).split()[1:]
             records.append(dict(id=line_no + 1,
                                 sentiment=labels[line_no // 12500],
                                 review=' '.join(words)))
     return pandas.DataFrame(records)
 def _load_all_data(self, review_group):
     """Load one review group from ``alldata-id.txt`` into a DataFrame.

     Every line of the file is one review. The first whitespace-separated
     token of a line is dropped (presumably a document tag -- confirm against
     the file producer) and the remaining tokens are re-joined with single
     spaces to form the review text. Group and label are derived purely from
     the zero-based line number: consecutive blocks of 25000 lines are
     training, testing, unlabeled, unlabeled; consecutive blocks of 12500
     lines get sentiment 1, 0, 1, 0, 0, 0, 0, 0.

     Args:
         review_group: ``'training'``, ``'testing'`` or ``'unlabeled'``.

     Returns:
         pandas.DataFrame built from records with keys ``id`` (1-based line
         number), ``sentiment`` and ``review``, restricted to
         ``review_group``.

     Raises:
         KeyError: if ``review_group`` is not one of the known groups.
         IndexError: if the file holds more than 100000 lines.
     """
     # Local import keeps this block self-contained; io.open accepts
     # ``encoding`` on both Python 2 and Python 3.
     import io

     split_names = ('training', 'testing', 'unlabeled', 'unlabeled')
     labels = (1, 0, 1, 0, 0, 0, 0, 0)
     if review_group not in split_names:
         raise KeyError(review_group)

     records = []
     # Decode explicitly as UTF-8 (the encoding to_unicode assumes) instead of
     # relying on the platform's default locale encoding.
     with io.open(data_path('alldata-id.txt'), encoding='utf-8') as lines:
         for line_no, line in enumerate(lines):
             split = split_names[line_no // 25000]
             if split != review_group:
                 # Only the requested group is returned, so skip the rest
                 # rather than materialising every review in memory.
                 continue
             words = gensim.utils.to_unicode(line).split()[1:]
             records.append(
                 dict(id=line_no + 1,
                      sentiment=labels[line_no // 12500],
                      review=' '.join(words)))
     return pandas.DataFrame(records)
 def _load_data(self, filename):
     """Read a CSV file from the data directory.

     Args:
         filename: name passed to ``data_path`` to locate the file.

     Returns:
         pandas.DataFrame parsed with the first row used as the header.
     """
     csv_file = data_path(filename)
     frame = pandas.read_csv(csv_file, header=0)
     return frame
 def _load_data(self, filename):
     """Load ``filename`` (resolved via ``data_path``) as a DataFrame.

     The file is parsed by ``pandas.read_csv`` with row 0 treated as the
     column header.

     Args:
         filename: name of the CSV file to resolve and read.

     Returns:
         The resulting pandas.DataFrame.
     """
     location = data_path(filename)
     return pandas.read_csv(location, header=0)