def _preprocess_test_data(self):
    """Create the sentiment-annotated test CSV if it is not on disk yet.

    The target is ``data_path('cheatTestData' + csv_extension())``. When
    that file already exists nothing is done. Otherwise the file
    ``'testData' + csv_extension()`` is loaded through ``self._load_data``,
    a ``'sentiment'`` column is derived by mapping ``sentiment_from_id``
    over the ``'id'`` column, and the result is written to the target as
    CSV with a header row and without the index.

    Returns:
        None.
    """
    labelled_name = 'cheatTestData{}'.format(csv_extension())
    labelled_csv = data_path(labelled_name)
    if not os.path.isfile(labelled_csv):
        raw_name = 'testData{}'.format(csv_extension())
        frame = self._load_data(raw_name)
        # The label is not read from a column; it is computed from the id.
        frame['sentiment'] = frame['id'].map(sentiment_from_id)
        frame.to_csv(labelled_csv, index=False, header=True)
def _convert_to_export(self):
    """Build the combined export frame, or load it from the HDF cache.

    The cache file is ``cache_path('<preprocessing_name><csv_extension>.hdf')``.
    If it exists, its ``'export'`` key is read and returned unchanged.

    Otherwise the training, unlabeled and testing sets are loaded and each
    is tagged with bookkeeping columns:

    * ``use_for_classifier_training`` -- True only for the training set.
    * ``use_for_score_calculation`` -- True only for the testing set.
    * ``predict_sentiment`` -- True only for the testing set.
    * ``predicted_sentiment`` -- initialised to None everywhere.
    * ``sentiment`` -- set to None on the unlabeled set.

    The three frames are handed to ``self._post_process`` and the result
    is written to the cache file (mode ``'w'``) under key ``'export'``.

    Returns:
        The object read from the cache, or the value returned by
        ``self._post_process`` (presumably a pandas DataFrame, since
        ``to_hdf`` is called on it).
    """
    destination = cache_path('{}{}.hdf'.format(
        self.mp.preprocessing_name(), csv_extension()))
    if os.path.exists(destination):
        # Cache hit: skip loading and tagging entirely.
        return pandas.read_hdf(destination, 'export')

    training_data = self._training_data().copy()
    training_data['use_for_classifier_training'] = True
    training_data['use_for_score_calculation'] = False
    training_data['predict_sentiment'] = False
    training_data['predicted_sentiment'] = None

    unlabeled_training_data = self._unlabeled_training_data().copy()
    unlabeled_training_data['use_for_classifier_training'] = False
    unlabeled_training_data['use_for_score_calculation'] = False
    unlabeled_training_data['sentiment'] = None
    unlabeled_training_data['predict_sentiment'] = False
    unlabeled_training_data['predicted_sentiment'] = None

    if not use_different_source():
        # The default source reads the annotated test CSV, so make sure
        # it has been generated before loading it.
        self._preprocess_test_data()

    # Work on a copy, like the other two sets, so the flag columns are
    # never written onto the object handed back by the loader.
    testing_data = self._testing_data().copy()
    testing_data['use_for_classifier_training'] = False
    testing_data['use_for_score_calculation'] = True
    testing_data['predict_sentiment'] = True
    testing_data['predicted_sentiment'] = None

    reviews = self._post_process(
        [training_data, unlabeled_training_data, testing_data])
    reviews.to_hdf(destination, 'export', mode='w')
    return reviews
def _convert_to_export(self):
    """Return the combined review frame, using the HDF cache when present.

    The cache location is
    ``cache_path('<preprocessing_name><csv_extension>.hdf')``. When that
    file exists its ``'export'`` key is read and returned as is.

    On a cache miss the training, unlabeled and testing sets are loaded,
    copied, and given these columns:

    * ``use_for_classifier_training`` -- True for training data only.
    * ``use_for_score_calculation`` -- True for testing data only.
    * ``predict_sentiment`` -- True for testing data only.
    * ``predicted_sentiment`` -- None for all three sets.
    * ``sentiment`` -- None for the unlabeled set.

    The list of three frames is passed to ``self._post_process``; its
    result is stored in the cache file (mode ``'w'``, key ``'export'``).

    Returns:
        Either the cached object or the value produced by
        ``self._post_process`` (presumably a pandas DataFrame, given the
        ``to_hdf`` call on it).
    """
    destination = cache_path('{}{}.hdf'.format(self.mp.preprocessing_name(), csv_extension()))
    if os.path.exists(destination):
        # Already exported once: reuse the stored frame.
        return pandas.read_hdf(destination, 'export')

    training_data = self._training_data().copy()
    training_data['use_for_classifier_training'] = True
    training_data['use_for_score_calculation'] = False
    training_data['predict_sentiment'] = False
    training_data['predicted_sentiment'] = None

    unlabeled_training_data = self._unlabeled_training_data().copy()
    unlabeled_training_data['use_for_classifier_training'] = False
    unlabeled_training_data['use_for_score_calculation'] = False
    unlabeled_training_data['sentiment'] = None
    unlabeled_training_data['predict_sentiment'] = False
    unlabeled_training_data['predicted_sentiment'] = None

    if not use_different_source():
        # The default testing loader reads the annotated test CSV;
        # generate it first if it is missing.
        self._preprocess_test_data()

    # Copy before tagging, matching the training and unlabeled sets, so
    # the loader's own object is left untouched.
    testing_data = self._testing_data().copy()
    testing_data['use_for_classifier_training'] = False
    testing_data['use_for_score_calculation'] = True
    testing_data['predict_sentiment'] = True
    testing_data['predicted_sentiment'] = None

    reviews = self._post_process([training_data, unlabeled_training_data, testing_data])
    reviews.to_hdf(destination, 'export', mode='w')
    return reviews
def _testing_data(self):
    """Load the testing set from the currently selected source.

    Returns:
        ``self._load_all_data('testing')`` when ``use_different_source()``
        is truthy; otherwise the result of ``self._load_data`` for the
        file ``'cheatTestData' + csv_extension()``.
    """
    if not use_different_source():
        file_name = 'cheatTestData{}'.format(csv_extension())
        return self._load_data(file_name)
    return self._load_all_data('testing')
def _training_data(self):
    """Load the labeled training set from the currently selected source.

    Returns:
        ``self._load_all_data('training')`` when ``use_different_source()``
        is truthy; otherwise the result of ``self._load_data`` for the
        file ``'labeledTrainData' + csv_extension()``.
    """
    if not use_different_source():
        file_name = 'labeledTrainData{}'.format(csv_extension())
        return self._load_data(file_name)
    return self._load_all_data('training')
def save_reviews_with_topics(self, reviews):
    """Overwrite the cached export file with ``reviews``.

    Args:
        reviews: Object exposing ``to_hdf`` (presumably a pandas
            DataFrame -- confirm against callers).

    The file is ``cache_path('<preprocessing_name><csv_extension>.hdf')``;
    it is opened with mode ``'w'`` and the data is stored under the key
    ``'export'``.
    """
    preprocessing = self.mp.preprocessing_name()
    extension = csv_extension()
    file_name = '{}{}.hdf'.format(preprocessing, extension)
    target = cache_path(file_name)
    reviews.to_hdf(target, 'export', mode='w')
def _unlabeled_training_data(self):
    """Load the unlabeled training set from the currently selected source.

    Returns:
        ``self._load_all_data('unlabeled')`` when ``use_different_source()``
        is truthy; otherwise the result of ``self._load_data`` for the
        file ``'unlabeledTrainData' + csv_extension()``.
    """
    if not use_different_source():
        file_name = 'unlabeledTrainData{}'.format(csv_extension())
        return self._load_data(file_name)
    return self._load_all_data('unlabeled')
def save_reviews_with_topics(self, reviews):
    """Store ``reviews`` in the HDF cache file, replacing its contents.

    Args:
        reviews: Object whose ``to_hdf`` method is called (presumably a
            pandas DataFrame -- confirm against callers).

    The destination is built from ``self.mp.preprocessing_name()`` and
    ``csv_extension()`` with an ``.hdf`` suffix and resolved through
    ``cache_path``. Data is written under key ``'export'`` with mode
    ``'w'``.
    """
    cache_file_name = '{}{}.hdf'.format(
        self.mp.preprocessing_name(),
        csv_extension(),
    )
    hdf_location = cache_path(cache_file_name)
    reviews.to_hdf(hdf_location, 'export', mode='w')