import logging
import math

import numpy as np
import six
from flask import jsonify, request

# get_documents, author_stats, get_lang, divide_documents, prepare_features,
# get_features and UniversalModel are project-internal helpers assumed to be
# importable from elsewhere in the code base.

logger = logging.getLogger(__name__)


def visualize_data():
    folder_path = request.args.get('folder_path', None)
    document_type = request.args.get('document_type', 'xml')
    language = get_lang(request)
    authors = author_stats(get_documents(
        folder_path=folder_path, document_type=document_type,
        language=language))
    # dict.iteritems() is Python 2 only; six.iteritems() works on both.
    # Sort (author, document count) pairs by count, most prolific first.
    authors_list = sorted(six.iteritems(authors), key=lambda pair: pair[1],
                          reverse=True)
    return jsonify({'result': authors_list})
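
# A minimal usage sketch: visualize_data() is a Flask view, so it has to be
# registered on an app object. The app, route path and example query below
# are assumptions for illustration, not part of this module.
#
#     from flask import Flask
#
#     app = Flask(__name__)
#     app.add_url_rule('/visualize_data', view_func=visualize_data)
#
#     # GET /visualize_data?folder_path=/data/corpus&document_type=xml
#     # -> {"result": [["author_a", 42], ["author_b", 17]]}   (shape only,
#     #    not real data)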
def main(corpus_path, project_name, features, max_authors=0,
         filter_authors_with_less_than_docs=10, docs_per_author_used=10,
         external_corpus_path=None, max_iterations=100, explain=False):
    """
    :param corpus_path: path to a directory with XML files
    :param project_name: unique project code
    :param features: list of feature names
    :param max_authors: max. number of candidates
    :param filter_authors_with_less_than_docs: ignore authors with fewer
        than X documents
    :param docs_per_author_used: use only X documents per author for both
        training and testing
    :param external_corpus_path: path to another corpus with documents used
        for feature tuning
    :param max_iterations: max. number of test repetitions
    :param explain: number of misclassified test instances to print to
        stdout during the first iteration (False disables the output)
    :return: dictionary with accuracy, delta, task_size (number of
        candidates) and iterations
    """
    total_stats = []
    docs = list(get_documents(corpus_path))
    if external_corpus_path:
        external_documents = list(get_documents(external_corpus_path))
    else:
        external_documents = None
    features_prepared = False
    run_iterations = 0
    # Fallbacks in case no split is ever generated.
    train, test = [], []
    while run_iterations < max_iterations:
        for iteration, (prepare, train, test) in enumerate(
                divide_documents(
                    docs, max_authors=max_authors,
                    external_documents=external_documents,
                    min_per_author_abs=filter_authors_with_less_than_docs,
                    doc_per_author=docs_per_author_used)):
            # Feature extractors are fitted once, on the first split only.
            if not features_prepared:
                prepare_features(features, prepare, project_name)
                features_prepared = True
            ml = UniversalModel(seed=run_iterations, mode="one_layer")
            x_train = get_features(train, project_name, features)[0]
            y_train = [doc["author"] for doc in train]
            x_test = get_features(test, project_name, features)[0]
            y_test = [doc["author"] for doc in test]
            ml.train(x_train, y_train)
            # Count correct (True) and incorrect (False) attributions.
            iteration_stats = {True: 0, False: 0}
            for pos, (y, predictions) in enumerate(
                    six.moves.zip(y_test, ml.test(x_test))):
                instance_result = (predictions.get(y, 0.0)
                                   >= max(predictions.values()))
                iteration_stats[instance_result] += 1
                if explain > 0 and run_iterations == 0:
                    if not instance_result:
                        explain -= 1
                        for key in ('author', 'text', 'morphology'):
                            print('%s: %s' % (key, test[pos][key]))
            total_stats.append(iteration_stats)
            run_iterations += 1
            logger.info('Iteration %i done', run_iterations)
            if run_iterations >= max_iterations:
                break
    task_size = len(author_stats(train + test))
    accuracies = np.array([100.0 * stats[True] / (stats[True] + stats[False])
                           for stats in total_stats])
    avg = np.average(accuracies)
    delta = math.sqrt(np.average(np.square(accuracies - avg)))
    # Second pass: drop accuracies more than two standard deviations above
    # the mean, then recompute the statistics on a NumPy array (a plain list
    # would break the vectorised arithmetic below).
    accuracies = np.array([acc for acc in accuracies
                           if acc <= avg + delta * 2])
    avg = np.average(accuracies)
    delta = math.sqrt(np.average(np.square(accuracies - avg)))
    return {"accuracy": avg, "delta": delta, "task_size": task_size,
            "iterations": len(total_stats)}
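
# Example invocation; the corpus path, project name and feature names are
# hypothetical placeholders, not values defined anywhere in this module.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    result = main(
        corpus_path='data/corpus',                # directory with XML files
        project_name='demo',
        features=['char_ngrams', 'word_ngrams'],  # hypothetical feature names
        max_authors=20,
        max_iterations=50,
        explain=3,                                # print up to 3 misclassified docs
    )
    print('accuracy %.2f%% +/- %.2f over %d iterations, %d candidates'
          % (result['accuracy'], result['delta'], result['iterations'],
             result['task_size']))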