def _preprocess_test_data(self):
    """Write the sentiment-annotated copy of the test data, if missing.

    The target file is ``cheatTestData<ext>`` (resolved through
    ``data_path``), where ``<ext>`` is whatever ``csv_extension()`` returns.
    When that file already exists nothing is done.  Otherwise
    ``testData<ext>`` is loaded, a ``sentiment`` column is filled by mapping
    every ``id`` value through ``sentiment_from_id``, and the frame is
    written out as CSV with a header row and without the index.

    Returns:
        None.
    """
    labelled_path = data_path('cheatTestData{}'.format(csv_extension()))
    if not os.path.isfile(labelled_path):
        frame = self._load_data('testData{}'.format(csv_extension()))
        # One label per row, derived only from the row's ``id`` value.
        labels = frame['id'].map(sentiment_from_id)
        frame['sentiment'] = labels
        frame.to_csv(labelled_path, index=False, header=True)
def _load_all_data(self, review_group):
    """Load one review group from ``alldata-id.txt`` as a DataFrame.

    Group and sentiment are derived purely from the zero-based line number:
    consecutive blocks of 25000 lines are 'training', 'testing',
    'unlabeled', 'unlabeled', and consecutive blocks of 12500 lines carry
    the sentiment values 1, 0, 1, 0, 0, 0, 0, 0.

    Args:
        review_group: One of 'training', 'testing' or 'unlabeled'.

    Returns:
        A pandas.DataFrame built from one dict per review of the requested
        group, with keys ``id`` (one-based line number), ``sentiment`` and
        ``review`` (the line's whitespace-separated tokens, minus the first
        one, re-joined with single spaces).

    Raises:
        KeyError: If ``review_group`` is not one of the known groups.
        IndexError: If the file holds more than 100000 lines.
    """
    # NOTE(review): this file contains a second, identical definition of
    # this method; if both sit in the same class the later one wins --
    # confirm and remove the duplicate.
    group_by_block = ('training', 'testing', 'unlabeled', 'unlabeled')
    sentiment_by_block = (1, 0, 1, 0, 0, 0, 0, 0)

    # Fail before touching the file; the exception type matches the dict
    # lookup that previously rejected unknown groups after the full read.
    if review_group not in group_by_block:
        raise KeyError(review_group)

    docs = []
    # Binary mode: gensim.utils.to_unicode performs the UTF-8 decode, so the
    # result no longer depends on the platform's default text encoding.
    with open(data_path('alldata-id.txt'), 'rb') as lines:
        for line_no, line in enumerate(lines):
            # Looked up for every line so that an over-long file still
            # raises IndexError, as before.
            if group_by_block[line_no // 25000] != review_group:
                continue  # only the requested group is kept in memory
            tokens = gensim.utils.to_unicode(line).split()
            # The first token is dropped -- presumably the in-file document
            # tag; verify against the file format.
            words = tokens[1:]
            docs.append(dict(id=line_no + 1,
                             sentiment=sentiment_by_block[line_no // 12500],
                             review=' '.join(words)))
    return pandas.DataFrame(docs)
def _load_all_data(self, review_group):
    """Load one review group from ``alldata-id.txt`` as a DataFrame.

    Group and sentiment are derived purely from the zero-based line number:
    consecutive blocks of 25000 lines are 'training', 'testing',
    'unlabeled', 'unlabeled', and consecutive blocks of 12500 lines carry
    the sentiment values 1, 0, 1, 0, 0, 0, 0, 0.

    Args:
        review_group: One of 'training', 'testing' or 'unlabeled'.

    Returns:
        A pandas.DataFrame built from one dict per review of the requested
        group, with keys ``id`` (one-based line number), ``sentiment`` and
        ``review`` (the line's whitespace-separated tokens, minus the first
        one, re-joined with single spaces).

    Raises:
        KeyError: If ``review_group`` is not one of the known groups.
        IndexError: If the file holds more than 100000 lines.
    """
    # NOTE(review): this file contains a second, identical definition of
    # this method; if both sit in the same class the later one wins --
    # confirm and remove the duplicate.
    group_by_block = ('training', 'testing', 'unlabeled', 'unlabeled')
    sentiment_by_block = (1, 0, 1, 0, 0, 0, 0, 0)

    # Fail before touching the file; the exception type matches the dict
    # lookup that previously rejected unknown groups after the full read.
    if review_group not in group_by_block:
        raise KeyError(review_group)

    docs = []
    # Binary mode: gensim.utils.to_unicode performs the UTF-8 decode, so the
    # result no longer depends on the platform's default text encoding.
    with open(data_path('alldata-id.txt'), 'rb') as lines:
        for line_no, line in enumerate(lines):
            # Looked up for every line so that an over-long file still
            # raises IndexError, as before.
            if group_by_block[line_no // 25000] != review_group:
                continue  # only the requested group is kept in memory
            tokens = gensim.utils.to_unicode(line).split()
            # The first token is dropped -- presumably the in-file document
            # tag; verify against the file format.
            words = tokens[1:]
            docs.append(dict(id=line_no + 1,
                             sentiment=sentiment_by_block[line_no // 12500],
                             review=' '.join(words)))
    return pandas.DataFrame(docs)
def _load_data(self, filename):
    """Read a CSV file from the data directory into a DataFrame.

    Args:
        filename: Name of the file; it is resolved through ``data_path``.

    Returns:
        The pandas.DataFrame produced by ``pandas.read_csv``, with the
        first row of the file used as the column header.
    """
    csv_path = data_path(filename)
    frame = pandas.read_csv(csv_path, header=0)
    return frame