def main(dataset, output, epsilon, capacity, width, kernel_type):
    """Train a multiclass SVM on a LibSVM-format dataset and serialize it.

    Args:
        dataset: Path of the training file, wrapped in ``LibSVMFile``.
        output: Path of the file the trained classifier is serialized to.
        epsilon: Value passed to ``svm.set_epsilon``.
        capacity: First constructor argument of ``MulticlassLibSVM``.
        width: Second argument of the kernel factory (logged as the
            Gaussian width).
        kernel_type: Key into the module-level ``KERNELS`` mapping.

    Raises:
        KeyError: If ``kernel_type`` is not a key of ``KERNELS``. The error
            is logged before being re-raised.
    """
    LOGGER.info("SVM Multiclass classifier")
    LOGGER.info("Epsilon: %s" % epsilon)
    LOGGER.info("Capacity: %s" % capacity)
    LOGGER.info("Gaussian width: %s" % width)

    # Resolve the kernel factory first so an unknown kernel fails before the
    # dataset is read. Only the lookup is guarded: a KeyError raised while
    # constructing the kernel must not be reported as "not available".
    try:
        kernel_factory = KERNELS[kernel_type]
    except KeyError:
        LOGGER.error("Kernel %s not available. try Gaussian or Linear"
                     % kernel_type)
        # Re-raise instead of falling through: continuing would only fail
        # later with an unrelated "kernel referenced before assignment".
        raise

    # Get features
    feats, labels = get_features_and_labels(LibSVMFile(dataset))

    # Create kernel
    kernel = kernel_factory(feats, width)

    # Initialize and train Multiclass SVM
    svm = MulticlassLibSVM(capacity, kernel, labels)
    svm.set_epsilon(epsilon)
    with track_execution():
        svm.train()

    # Serialize to file
    writable_file = SerializableHdf5File(output, 'w')
    with closing(writable_file):
        svm.save_serializable(writable_file)
    LOGGER.info("Serialized classifier saved in: '%s'" % output)
def main(classifier, testset, output):
    """Apply a serialized multiclass SVM to a test set and save predictions.

    Args:
        classifier: Path of the serialized classifier, opened for reading
            through ``SerializableHdf5File``.
        testset: Path of the test file, wrapped in ``LibSVMFile``.
        output: Path of the text file that receives one predicted label
            per line, each written as an integer.
    """
    LOGGER.info("SVM Multiclass evaluation")

    # Restore the trained model from its serialized form.
    model = MulticlassLibSVM()
    with closing(SerializableHdf5File(classifier, 'r')) as model_file:
        model.load_serializable(model_file)

    # Only the features are used for prediction; the labels returned
    # alongside them are discarded.
    features, _ = get_features_and_labels(LibSVMFile(testset))
    predictions = model.apply(features)

    with open(output, 'w') as out:
        out.writelines(
            "%s\n" % int(label) for label in predictions.get_labels())

    LOGGER.info("Predicted labels saved in: '%s'" % output)
def main(actual, predicted):
    """Score a file of predicted labels against a labelled dataset.

    Logs the accuracy, then logs a header and prints the confusion matrix
    to standard output.

    Args:
        actual: Path of the labelled dataset, wrapped in ``LibSVMFile``;
            only its labels are used.
        predicted: Path of a text file with one numeric label per line.
            Blank lines are ignored.

    Raises:
        ValueError: If a non-blank line of ``predicted`` cannot be parsed
            as a float.
    """
    LOGGER.info("SVM Multiclass evaluator")

    # Load SVMLight dataset; the features are not needed for scoring.
    _, labels = get_features_and_labels(LibSVMFile(actual))

    # Load predicted labels. Whitespace-only lines (e.g. a trailing blank
    # line) are skipped instead of crashing float().
    with open(predicted, 'r') as f:
        predicted_labels_arr = np.array(
            [float(line) for line in f if line.strip()])
    predicted_labels = MulticlassLabels(predicted_labels_arr)

    # Evaluate accuracy
    multiclass_measures = MulticlassAccuracy()
    LOGGER.info("Accuracy = %s" % multiclass_measures.evaluate(
        labels, predicted_labels))

    LOGGER.info("Confusion matrix:")
    res = multiclass_measures.get_confusion_matrix(labels, predicted_labels)
    # Call form works on Python 2 and 3; the bare `print res` statement is a
    # SyntaxError on Python 3.
    print(res)
def train_models(num_splits, num_iter, metric):
    """Cross-validate every algorithm on every training file, then tune.

    The function runs in three phases:

    1. For each ``data/train/*.pkl`` file and each algorithm, run
       ``cross_val_score`` with a shuffled ``KFold``.
    2. Save the per-(algorithm, file) mean and standard deviation of the
       scores, rounded to two decimals, to a timestamped CSV under
       ``experiments/``.
    3. For every algorithm except ``'NB'``, pick the training file with the
       best mean score, call ``do_train_model`` on it and plot learning
       curves for the resulting model.

    Args:
        num_splits: Number of folds for ``KFold``; also forwarded to
            ``do_train_model``.
        num_iter: Forwarded unchanged to ``do_train_model``.
        metric: Scoring name passed to ``cross_val_score``; also used to
            name the ``mean_<metric>`` / ``std_<metric>`` result columns.

    Raises:
        FileNotFoundError: If no file matches ``data/train/*.pkl``.
    """
    random_state = np.random.RandomState(1234)
    data_dir = os.path.join('data', 'train')
    train_files = os.path.join(data_dir, '*.pkl')
    algorithms = [{
        'name': 'naive-bayes',
        'acronym': 'NB'
    }, {
        'name': 'support-vector-machine',
        'acronym': 'SVM'
    }, {
        'name': 'logistic-regression',
        'acronym': 'LR'
    }, {
        'name': 'random-forest',
        'acronym': 'RF'
    }, {
        'name': 'gradient-boosting',
        'acronym': 'GB'
    }]
    mean_col = f'mean_{metric}'
    std_col = f'std_{metric}'

    # glob returns files in arbitrary order; sort so the shared random state
    # is consumed in the same order on every run.
    files = sorted(glob.glob(train_files))
    if not files:
        # Fail before any work is done rather than later on an empty frame.
        raise FileNotFoundError(f'No training files match {train_files}')

    # Train models
    print('Training models')
    kfold = KFold(n_splits=num_splits, shuffle=True, random_state=random_state)
    rows = []
    # One progress tick per (file, algorithm) pair actually processed.
    num_loops = len(files) * len(algorithms)
    with tqdm(total=num_loops, file=sys.stdout) as pbar:
        for train_file in files:
            # NOTE: joblib.load unpickles its input and can execute
            # arbitrary code; only load training files from a trusted source.
            with open(train_file, 'rb') as f:
                data = joblib.load(f)
            train_data = data['data']
            y_train = train_data['label'].values
            # Column 0 holds one feature vector per row; columns 2 onwards
            # are extra features appended to it column-wise.
            text_features = list(train_data.iloc[:, 0].values)
            extra_features = np.array(train_data.iloc[:, 2:].values)
            X_train = np.concatenate((text_features, extra_features), axis=1)
            for algorithm in algorithms:
                classifier = get_classifier(algorithm['acronym'],
                                            random_state)
                scores = cross_val_score(classifier,
                                         X_train,
                                         y=y_train,
                                         scoring=metric,
                                         cv=kfold,
                                         n_jobs=-1)
                rows.append({
                    'algorithm': algorithm['acronym'],
                    'train_data_file': train_file,
                    mean_col: round(scores.mean(), 2),
                    std_col: round(scores.std(), 2),
                })
                pbar.update(1)

    # Load results in a dataframe. Built in one go: DataFrame.append copied
    # the frame on every call and no longer exists in pandas >= 2.0.
    print('Loading experiment results in a dataframe')
    output_df = pd.DataFrame(
        rows, columns=['algorithm', 'train_data_file', mean_col, std_col])

    # Save dataframe
    experiment_dir = 'experiments'
    os.makedirs(experiment_dir, exist_ok=True)
    experiment_filename = 'e_{}.csv'.format(
        datetime.now().strftime('%d%m%Y_%H%M%S'))
    experiment_file_path = os.path.join(experiment_dir, experiment_filename)
    print(f'Saving experiment results in {experiment_file_path}')
    output_df.to_csv(experiment_file_path, index=False)

    # Train algorithms on data transformation that work best for each of them
    print('Doing hyperparametrization')
    with tqdm(total=len(algorithms), file=sys.stdout) as pbar:
        for algorithm in algorithms:
            acronym = algorithm['acronym']
            # NB is excluded from this phase (presumably nothing to tune;
            # confirm with the author).
            if acronym != 'NB':
                candidates = output_df[output_df['algorithm'] == acronym]
                # Highest mean first; among equal means prefer the lowest
                # standard deviation, i.e. the most stable result.
                best_row = candidates.sort_values(
                    by=[mean_col, std_col], ascending=[False, True]).iloc[0]
                train_data_file = best_row['train_data_file']
                best_model = do_train_model(acronym, train_data_file,
                                            algorithm['name'], num_splits,
                                            num_iter, metric)
                features, labels = get_features_and_labels(train_data_file)
                plot_learning_curve(best_model,
                                    f"{acronym} learning curves",
                                    features,
                                    labels,
                                    metric,
                                    cv=kfold,
                                    shuffle=True,
                                    save_fig=True)
            # Tick for skipped algorithms too so the bar reaches its total.
            pbar.update(1)