def test_best_solution(self, dataset_key='tweets', regression='linear_regression1', classification='naive_bayes1'):
    """Run one regression and one classification model on the 'final' dataset
    and combine their predictions into a single solution.

    dataset_key    -- key into self.datasets used to look the models up.
    regression     -- model key of the regression model (first match is used).
    classification -- model key of the classifier (first match is used).

    The combined rows are handed to discretize_solution with file_out set to
    DATASET_PATH + 'final_solution.dat'.
    """
    lookup_dataset = self.datasets[dataset_key]
    manager = ModelManager()
    regressor = manager.get_models(dataset=lookup_dataset, model_key=regression)[0]
    classifier = manager.get_models(dataset=lookup_dataset, model_key=classification,
                                    model_type='classifier')[0]

    target = self.datasets['final']

    # Point both models at prediction files named after the 'final' dataset
    # before either of them is tested.
    for model in (regressor, classifier):
        model.prediction_file = (PREDICTION_PATH + model.name + '_' +
                                 target.dataset_key + '_prediction.dat')

    regressor.test_model(test_data=target.test_data_regression,
                         empty_solution=target.empty_solution)
    classifier.test_model(test_data=target.test_data_classification,
                          empty_solution=target.empty_solution)

    regression_rows = read_sheet(file_name=regressor.prediction_file)
    classification_rows = read_sheet(file_name=classifier.prediction_file)

    # Pair the two prediction sheets row by row. map() is kept on purpose so
    # the pairing behaves exactly as before under this file's Python version.
    combiner = Solution()
    combined = map(combiner.combine_classification_regression,
                   regression_rows, classification_rows)

    discretize_solution(prediction_in=combined,
                        file_out=DATASET_PATH + 'final_solution.dat')
def test_model(self, test_data, empty_solution, evaluate=False):
    """Apply the serialized model to test_data and store its predictions.

    test_data      -- data passed to Evaluation and to its test_model call.
    empty_solution -- file read with read_sheet; each row must provide the
                      'userid' and 'tweetid' keys.
    evaluate       -- when truthy, also save the evaluation summary to
                      self.evaluation_file (unless that file already exists).

    Nothing is recomputed when self.prediction_file already exists, and the
    method returns early when self.model_file is missing.

    Raises IndexError when there are fewer predictions than rows in the
    empty solution sheet.
    """
    model_weka = None
    evaluation = None

    if os.path.isfile(self.prediction_file):
        print('Model ' + self.name + ' already tested.')
    elif not os.path.isfile(self.model_file):
        print('Impossible testing this model. It should be trained first.')
        return
    else:
        print('Starting to test_model model ' + self.name + '.')
        model_weka = Classifier(jobject=serialization.read(self.model_file))
        evaluation = Evaluation(data=test_data)
        evaluation.test_model(classifier=model_weka, data=test_data)
        predictions = evaluation.predictions()
        rows = read_sheet(file_name=empty_solution)

        # Pair row i with prediction i by index. The previous pop(0) per row
        # shifted the whole remaining sequence each time (quadratic overall);
        # indexing keeps the same pairing and the same IndexError on shortage.
        solutions = [
            [row['userid'], row['tweetid'], predictions[index].predicted()]
            for index, row in enumerate(rows)
        ]
        write_the_solution_file(solutions, self.prediction_file)
        print('Model ' + self.name + ' tested.')

    if evaluate:
        if os.path.isfile(self.evaluation_file):
            print('Model ' + self.name + ' already evaluated.')
            return
        if model_weka is None:
            # Predictions were already on disk, so no evaluation was run
            # above; load the model and evaluate it now.
            model_weka = Classifier(jobject=serialization.read(self.model_file))
            evaluation = Evaluation(data=test_data)
            evaluation.test_model(classifier=model_weka, data=test_data)
        save_file(file_name=self.evaluation_file, content=evaluation.to_summary())
        print('Model ' + self.name + ' evaluated.')
def create_solution(self, dataset, force, solutions_file=None):
    """Build the solution file for this regression/classification setup.

    dataset        -- dataset passed to ModelManager.get_models.
    force          -- when False, an existing self.solution_file is kept and
                      the method returns without doing anything else.
    solutions_file -- iterable of sheet paths; only read when
                      self.regression == 'ranking' and no classifier is used.

    Three cases are handled:
      * self.classification == 'None': a single regression model is
        discretized straight from its prediction file; several models (or
        the 'ranking' sheets) are merged with self._combine_regressions.
      * self.classification == 'voting': the first regression model is
        combined with every classifier via
        self._combine_classifications_regression.
      * otherwise: the first regression model is combined with the first
        classifier via self.combine_classification_regression.
    """
    manager = ModelManager()

    # Equality test (rather than `not force`) is intentional: only values
    # equal to False skip the rebuild.
    if os.path.isfile(self.solution_file) and force == False:
        print('Solution ' + self.name + ' ' + self.classification + ' ' + self.regression + ' already created.')
        return

    if self.classification == 'None':
        candidates = manager.get_models(dataset=dataset, model_key=self.regression)
        if len(candidates) == 1:
            discretize_solution(file_in=candidates[0].prediction_file,
                                file_out=self.solution_file)
        else:
            if self.regression == 'ranking':
                print(solutions_file)
                sheets = [read_sheet(file_name=path) for path in solutions_file]
                per_model = map(self._order, sheets)
            else:
                per_model = [read_sheet(file_name=candidate.prediction_file)
                             for candidate in candidates]
            # One positional argument per model: rows are merged position-wise.
            merged = map(self._combine_regressions, *per_model)
            discretize_solution(prediction_in=merged, file_out=self.solution_file)
    else:
        regressors = manager.get_models(dataset=dataset, model_key=self.regression)
        classifiers = manager.get_models(dataset=dataset, model_key=self.classification,
                                         model_type='classifier')
        regression_rows = read_sheet(file_name=regressors[0].prediction_file)

        if self.classification == 'voting':
            votes = [read_sheet(file_name=voter.prediction_file) for voter in classifiers]
            combined = map(self._combine_classifications_regression,
                           regression_rows, *votes)
        else:
            classification_rows = read_sheet(file_name=classifiers[0].prediction_file)
            combined = map(self.combine_classification_regression,
                           regression_rows, classification_rows)

        discretize_solution(prediction_in=combined, file_out=self.solution_file)

    print('Solution ' + self.name + ' ' + self.classification + ' ' + self.regression + ' created.')
def test_solution(self):
    """Produce an all-zero baseline solution.

    Every row of DATASET_PATH + 'empty_real_solution.dat' is copied with its
    'userid' and 'tweetid' and an 'engagement' of 0.0; the result is passed
    to discretize_solution with file_out DATASET_PATH + 'teste_zeros.dat'.
    """
    template_rows = read_sheet(file_name=DATASET_PATH + 'empty_real_solution.dat')
    zero_rows = [
        {'userid': entry['userid'], 'tweetid': entry['tweetid'], 'engagement': 0.0}
        for entry in template_rows
    ]
    discretize_solution(prediction_in=zero_rows,
                        file_out=DATASET_PATH + 'teste_zeros.dat')
def order_solution():
    """Re-order 'neural_solution.dat' and save it as 'neural_solution2.dat'.

    Rows are sorted by user id, then engagement, then tweet id, each in
    descending order, and written as [userid, tweetid, engagement] lists.
    Both files live under DATASET_PATH.
    """
    def descending_key(entry):
        # Negating each numeric field turns the ascending sort into a
        # descending one on all three criteria.
        return (-int(entry['userid']),
                -float(entry['engagement']),
                -int(entry['tweetid']))

    ranked = list(read_sheet(DATASET_PATH + 'neural_solution.dat'))
    ranked.sort(key=descending_key)

    output_rows = [[entry['userid'], entry['tweetid'], entry['engagement']]
                   for entry in ranked]

    write_the_solution_file(output_rows, DATASET_PATH + 'neural_solution2.dat')