def evaluate_statistical(self): """Evaluates the system using 10-fold cross validation, returning a dictionary containing the number of correct results per-fold in each class.""" trainer = Trainer(self.profiles, self.profile_type, self.converter, self.network) training_set = trainer.generate_training_set() profiles = numpy.array(list(self.profiles)) data = numpy.array(training_set.data) labels = numpy.array(training_set.labels) fold_iterator = cross_validation.StratifiedKFold(labels, n_folds=10, shuffle=True, random_state=42) official_profile_pairs = ((x['name'], self.profile_type(x['profile'], x['posts'])) for x in self.profiles if x['label'] == 2) affiliate_profile_pairs = ((x['name'], self.profile_type(x['profile'], x['posts'])) for x in self.profiles if x['label'] == 1) official_profiles = defaultdict(list) for name, profile in official_profile_pairs: official_profiles[name].append(profile) affiliate_profiles = defaultdict(list) for name, profile in affiliate_profile_pairs: affiliate_profiles[name].append(profile) fold = 1 # This assumes we're just using Random Forest (i.e. one classifier) # Ugly hack for now. classification_results = { 'official_correct': [], 'affiliate_correct': [] } for train, test in fold_iterator: classifiers = initialize_classifiers() training_data = data[train] training_labels = labels[train] test_set = itertools.compress(profiles[test], labels[test]) company_names = set(x['name'] for x in test_set) print 'Test set', fold, '-', len(company_names), 'companies.' for classifier in classifiers: classifier_name = classifier['type'] c = classifier['classifier'] trained = c.fit(training_data, training_labels) system = SingleNetworkSearcher( classifier=trained, searchengine=self.search_engine, profile_converter=self.converter, network=self.network) number_of_workers = int(multiprocessing.cpu_count() * 0.75) worker_pool = ProcessingPool(number_of_workers) all_results = worker_pool.map(system.query, company_names) for idx, name in enumerate(company_names): official_results = official_profiles[name] affiliate_results = affiliate_profiles[name] results = all_results[idx] classified_official = results.official classified_affiliate = results.affiliate marked_official_handles = [x['profile'].handle.lower() for x in classified_official] marked_affiliate_handles = [x['profile'].handle.lower() for x in classified_affiliate] official_handles = [x.handle.lower() for x in official_results] affiliate_handles = [x.handle.lower() for x in affiliate_results] official_correct = 0 for handle in marked_official_handles: if handle in official_handles: official_correct += 1 affiliate_correct = 0 for handle in marked_affiliate_handles: if handle in affiliate_handles: affiliate_correct += 1 classification_results['official_correct'].append(official_correct) classification_results['affiliate_correct'].append(affiliate_correct) fold += 1 return classification_results
def evaluate(self): """Evaluates the system using 10-fold cross validation, returning a dictionary of results keyed by classifier type.""" trainer = Trainer(self.profiles, self.profile_type, self.converter, self.network) training_set = trainer.generate_training_set() profiles = numpy.array(list(self.profiles)) data = numpy.array(training_set.data) labels = numpy.array(training_set.labels) fold_iterator = cross_validation.StratifiedKFold(labels, n_folds=10, shuffle=True, random_state=42) official_profile_pairs = ((x['name'], self.profile_type(x['profile'], x['posts'])) for x in self.profiles if x['label'] == 2) affiliate_profile_pairs = ((x['name'], self.profile_type(x['profile'], x['posts'])) for x in self.profiles if x['label'] == 1) official_profiles = defaultdict(list) for name, profile in official_profile_pairs: official_profiles[name].append(profile) affiliate_profiles = defaultdict(list) for name, profile in affiliate_profile_pairs: affiliate_profiles[name].append(profile) classification_results = defaultdict(list) fold = 1 for train, test in fold_iterator: classifiers = initialize_classifiers() training_data = data[train] training_labels = labels[train] test_set = itertools.compress(profiles[test], labels[test]) company_names = set(x['name'] for x in test_set) print 'Test set', fold, '-', len(company_names), 'companies.' for classifier in classifiers: classifier_name = classifier['type'] c = classifier['classifier'] trained = c.fit(training_data, training_labels) system = SingleNetworkSearcher( classifier=trained, searchengine=self.search_engine, profile_converter=self.converter, network=self.network) number_of_workers = int(multiprocessing.cpu_count() * 0.75) worker_pool = ProcessingPool(number_of_workers) all_results = worker_pool.map(system.query, company_names) combined_official_results = [] combined_affiliate_results = [] for idx, name in enumerate(company_names): official_results = official_profiles[name] affiliate_results = affiliate_profiles[name] results = all_results[idx] classified_official = results.official classified_affiliate = results.affiliate classified_unrelated = results.unrelated marked_official_handles = [x['profile'].handle.lower() for x in classified_official] marked_affiliate_handles = [x['profile'].handle.lower() for x in classified_affiliate] marked_unrelated_handles = [x['profile'].handle.lower() for x in classified_unrelated] official_handles = [x.handle.lower() for x in official_results] affiliate_handles = [x.handle.lower() for x in affiliate_results] official_counts = MetricCalculator.count_positives( actual_handles=official_handles, marked_positive_handles=marked_official_handles, marked_negative_handles=(marked_affiliate_handles + marked_unrelated_handles)) combined_official_results.append(official_counts) affiliate_counts = MetricCalculator.count_positives( actual_handles=affiliate_handles, marked_positive_handles=marked_affiliate_handles, marked_negative_handles=(marked_unrelated_handles + marked_official_handles)) combined_affiliate_results.append(affiliate_counts) official_metrics = MetricCalculator.fold_metrics( combined_official_results) affiliate_metrics = MetricCalculator.fold_metrics( combined_affiliate_results) result = { 'official': official_metrics, 'affiliate': affiliate_metrics } classification_results[classifier_name].append(result) fold += 1 return classification_results