def model_selection(feature_selector, classifier, dataset, write_filename=None):
    """Run model selection over every classifier/selector parameter combination.

    For each classifier parameter set (from ``config['model_classifier_params']``)
    and each feature-selector parameter set (from ``config['model_selector_params']``),
    fits ``classifier`` on the selector's training data and records timed fit
    statistics, accuracy and f1-score metrics.

    Parameters
    ----------
    feature_selector : object exposing attach_dataset/select/training_data/
        eval/eval_set/desc (project-defined selector interface).
    classifier : object exposing fit/desc (project-defined classifier interface).
    dataset : dataset handed to the feature selector; shape/schema is defined
        by the selector implementation.
    write_filename : str or None, optional
        Base name for per-run stats files; the classifier and selector
        descriptions are appended per run. When None, stats go to console only.

    Returns
    -------
    Stats
        Aggregate containing one classifier-stats entry per classifier param set.
    """
    feature_selector.attach_dataset(dataset)
    feature_sel_name = feature_selector.__class__.__name__
    classifier_name = classifier.__class__.__name__
    base_filename = write_filename

    # Execution mode: 'manytomany' crosses every selector param with every
    # classifier param; 'one2one' pairs each classifier param with the next
    # selector param drawn from an iterator.
    exec_mode = 'manytomany'
    if '_exec_mode' in config.get('model_classifier_params'):
        exec_mode = config.get('model_classifier_params').get('_exec_mode')

    # Selector params come either as an iterator under '_iter', as a single
    # param dict, or default to one empty dict when unconfigured.
    selector_cfg = config.get('model_selector_params').get(feature_sel_name)
    if '_iter' in selector_cfg:
        _sel_params = selector_cfg.get('_iter')
    else:
        _sel_params = [selector_cfg] if selector_cfg is not None else [{}]

    classifier_cfg = config.get('model_classifier_params').get(classifier_name)
    cl_params_list = classifier_cfg if classifier_cfg is not None else [{}]

    all_classifier_stats = Stats()
    for cl_param in cl_params_list:
        classifier_stats = Stats()
        if exec_mode == 'one2one':
            # Builtin next() instead of the Python-2-only .next() method.
            sel_params = [next(_sel_params)]
        else:
            sel_params = _sel_params
        for sel_param in sel_params:
            feature_selector.select(sel_param)
            best = []
            f_idx = 0
            for data in feature_selector.training_data():
                f_idx += 1
                if isinstance(data, types.GeneratorType):
                    # data is itself a stream of train pairs: fit each pair and
                    # let the selector accumulate the per-pair evaluation, then
                    # take the set-level stats.
                    for pair in data:
                        # Bind `pair` as a default to avoid late-binding if the
                        # callable were ever stored rather than run immediately.
                        stats = Stats.run_timed(lambda pair=pair: classifier.fit(pair, cl_param))
                        feature_selector.eval(stats)
                    stats = feature_selector.eval_set()
                else:
                    stats = Stats.run_timed(lambda: classifier.fit(data, cl_param))
                    # Record the selected feature names (row index of data[0];
                    # presumably a pandas object — axes[0] is its index).
                    stats.add_metric(list(data[0].axes[0].values), '_features')
                if base_filename is not None:
                    write_filename = base_filename + '_' + classifier.desc() + '_' + feature_selector.desc()
                classifier_stats.add_classifier_stat(stats)
                print('Finished model selection with classifer "{}" with {} feature selector "{}"'.format(classifier.desc(), f_idx, feature_selector.desc()))
                stats.set_printheader(stat_header({'FeatureCount': len(stats.metrics['_features'][0]), 'Set': f_idx}, classifier.desc()))
                # Optional write filter; a threshold such as
                #   cond = lambda s: s.conf_based_stats()[1] > .70 and (1 - s.conf_based_stats()[2]) < .35
                # was previously used here but is disabled.
                cond = None
                stats.mystats(filename=write_filename, cond=cond)
                # Thresholding for "good" feature sets is disabled: as written,
                # `best` is reset each iteration and holds only the latest set.
                best = [stats.metrics['_features'][0]]
                stats.add_metric(stats.conf_based_stats()[0], 'Accuracy')
                stats.add_metric(stats.conf_based_stats()[3], 'f1-score')
            # Persist the retained feature set(s) for this selector param.
            if write_filename is not None:
                best_filename = config.get('best_features_file') + '_' + utils.replace_with_(write_filename)
            else:
                best_filename = utils.replace_with_(config.get('best_features_file'))
            if len(best) > 0:
                utils.save_string_data(os.path.join(config.get('output_dir'), best_filename), best)
        all_classifier_stats.add_classifiers_stat(classifier_stats, classifier.desc())
    return all_classifier_stats
def test(feature_selector, classifier, dataset, write_filename=None):
    """Fit on each training split and evaluate on its paired test split.

    Iterates every classifier parameter set (``config['test_classifier_params']``)
    and feature-selector parameter set (``config['test_selector_params']``),
    fits ``classifier`` on each training set, predicts on the matching test
    set, and records confusion-matrix-based statistics.

    Parameters
    ----------
    feature_selector : object exposing attach_dataset/select/training_data/
        test_data/desc (project-defined selector interface).
    classifier : object exposing fit/predict/desc (project-defined interface).
    dataset : dataset handed to the feature selector and forwarded to
        ``stats.mystats`` for reporting.
    write_filename : str or None, optional
        Base name for per-run stats files; classifier and selector
        descriptions are appended per run. When None, console output only.

    Returns
    -------
    Stats
        Aggregate containing one classifier-stats entry per classifier param set.
    """
    feature_selector.attach_dataset(dataset)
    base_filename = write_filename
    feature_sel_name = feature_selector.__class__.__name__
    classifier_name = classifier.__class__.__name__

    # Execution mode mirrors model_selection: 'manytomany' crosses all params,
    # 'one2one' draws one selector param per classifier param.
    exec_mode = 'manytomany'
    if '_exec_mode' in config.get('test_classifier_params'):
        exec_mode = config.get('test_classifier_params').get('_exec_mode')

    # Selector params: an iterator under '_iter', a single dict, or one {}.
    selector_cfg = config.get('test_selector_params').get(feature_sel_name)
    if '_iter' in selector_cfg:
        _sel_params = selector_cfg.get('_iter')
    else:
        _sel_params = [selector_cfg] if selector_cfg is not None else [{}]

    classifier_cfg = config.get('test_classifier_params').get(classifier_name)
    cl_params_list = classifier_cfg if classifier_cfg is not None else [{}]

    all_classifier_stats = Stats()
    for cl_param in cl_params_list:
        classifier_stats = Stats()
        if exec_mode == 'one2one':
            # Builtin next() instead of the Python-2-only .next() method.
            sel_params = [next(_sel_params)]
        else:
            sel_params = _sel_params
        for sel_param in sel_params:
            feature_selector.select(sel_param)
            test_data_gen = feature_selector.test_data()
            f_idx = 0
            for data in feature_selector.training_data():
                f_idx += 1
                # Timed fit for its side effect; the fit-timing stats object
                # itself was never used, so it is not kept.
                Stats.run_timed(lambda: classifier.fit(data, cl_param))
                if base_filename is not None:
                    write_filename = base_filename + '_' + classifier.desc() + '_' + feature_selector.desc()
                print('Finished testing with classifer "{}" with feature selector "{}"'.format(classifier.desc(), feature_selector.desc()))
                # Training and test splits advance in lockstep; builtin next()
                # replaces the Python-2-only .next() call.
                test_data = next(test_data_gen)
                stats = classifier.predict(test_data[0])
                stats.set_printheader(stat_header({'FeatureCount': data[0].shape[0], 'Set': f_idx}, classifier.desc()))
                stats.record_confusion_matrix(test_data[1])
                # Optional write filter; an accuracy threshold such as
                #   cond = lambda s: s.conf_based_stats()[0] > .6
                # was previously used here but is disabled.
                cond = None
                # Sample identifiers — presumably the column labels of a
                # pandas frame (axes[1]); verify against the selector output.
                idents = test_data[0].axes[1].values
                stats.mystats(filename=write_filename, cond=cond, dataset=dataset, ids=idents)
                classifier_stats.add_classifier_stat(stats)
        all_classifier_stats.add_classifiers_stat(classifier_stats, classifier.desc())
    return all_classifier_stats