Example no. 1
0
def visualize_data():
    """Flask view: return per-author document counts as JSON.

    Reads ``folder_path`` and ``document_type`` from the query string,
    loads the matching documents, and responds with
    ``{'result': [(author, count), ...]}`` sorted by count, descending.
    """
    folder_path = request.args.get('folder_path', None)
    document_type = request.args.get('document_type', 'xml')
    language = get_lang(request)
    authors = author_stats(get_documents(folder_path=folder_path, document_type=document_type, language=language))
    # BUG FIX: dict.iteritems() is Python-2-only and raises AttributeError on
    # Python 3; dict.items() works on both (this file already targets 2/3
    # compatibility via six.moves elsewhere).
    authors_list = sorted(authors.items(), key=lambda pair: pair[1], reverse=True)
    return jsonify({'result': authors_list})
Example no. 2
0
def main(
        corpus_path, project_name, features,
        max_authors=0, filter_authors_with_less_than_docs=10, docs_per_author_used=10,
        external_corpus_path=None, max_iterations=100, explain=False):
    """Run repeated authorship-attribution experiments and aggregate accuracy.

    Repeatedly splits the corpus into train/test sets, trains a
    ``UniversalModel`` per split, and records per-iteration right/wrong
    counts until ``max_iterations`` splits have been evaluated.

    :param corpus_path: path to a directory with xml files
    :param project_name: unique project code
    :param features: list of feature names
    :param max_authors: max. number of candidates
    :param filter_authors_with_less_than_docs: ignore authors with less than X documents
    :param docs_per_author_used: use only X documents from author for both training and testing
    :param external_corpus_path: path to another corpus with documents used for feature tuning
    :param max_iterations: max. number of a test repetition
    :param explain: int countdown of misclassified test instances to print to
        stdout during the first iteration; ``False``/0 disables (the value is
        decremented in place for each printed instance)

    :return: dict with keys "accuracy" (mean %), "delta" (std. deviation),
        "task_size" (number of candidate authors in the last split) and
        "iterations" (number of splits actually evaluated)
    """
    total_stats = []
    docs = [doc for doc in get_documents(corpus_path)]
    if external_corpus_path:
        external_documents = [doc for doc in get_documents(external_corpus_path)]
    else:
        external_documents = None

    # Features are prepared once, on the first split only.
    features_prepared = False
    run_iterations = 0
    # Defaults so the post-loop author_stats(train + test) call is defined
    # even if no split is ever produced.
    train, test = np.array([]), np.array([])
    while run_iterations < max_iterations:
        # NOTE(review): if divide_documents yields nothing, run_iterations
        # never advances and this while loop would spin forever — presumably
        # the corpus always produces at least one split; verify upstream.
        for iteration, (prepare, train, test) in enumerate(
                divide_documents(
                    docs, max_authors=max_authors, external_documents=external_documents,
                    min_per_author_abs=filter_authors_with_less_than_docs, doc_per_author=docs_per_author_used)):
            if not features_prepared:
                prepare_features(features, prepare, project_name)
                features_prepared = True
            # Fresh model per split; seeded per outer pass for reproducibility.
            ml = UniversalModel(seed=run_iterations, mode="one_layer")
            x_train = get_features(train, project_name, features)[0]
            y_train = [doc["author"] for doc in train]
            x_test = get_features(test, project_name, features)[0]
            y_test = [doc["author"] for doc in test]
            ml.train(x_train, y_train)

            # Count correct (True) vs. incorrect (False) predictions.
            iteration_stats = {True: 0, False: 0}
            for pos, (y, predictions) in enumerate(six.moves.zip(y_test, ml.test(x_test))):
                # Correct iff the true author's score ties or beats every
                # other candidate's score.
                instance_result = predictions.get(y, 0.0) >= max(predictions.values())
                iteration_stats[instance_result] += 1
                # Only explain misclassifications from the very first split,
                # and at most `explain` of them (counter decremented below).
                if explain > 0 and run_iterations == 0:
                    if not instance_result:
                        explain -= 1
                        for key in ('author', 'text', 'morphology'):
                            print('%s: %s' % (key, test[pos][key]))

            total_stats.append(iteration_stats)
            run_iterations += 1
            logger.info('Iteration %i done', run_iterations)
            # Also stop if a single divide_documents pass alone reaches the cap.
            if iteration >= max_iterations:
                break
    # Number of distinct candidate authors, taken from the last split seen.
    task_size = len(author_stats(train + test))
    # Per-iteration accuracy in percent.
    accuracies = np.array([100.0 * stats[True] / (stats[True] + stats[False]) for stats in total_stats])
    avg = np.average(accuracies)
    delta = math.sqrt(np.average(np.square(accuracies - avg)))

    # second round, remove irregular values
    # NOTE(review): only high outliers (> avg + 2*delta) are dropped here,
    # not low ones — confirm this asymmetry is intentional.
    accuracies = [acc for acc in accuracies if acc <= avg + delta * 2]
    avg = np.average(accuracies)
    delta = math.sqrt(np.average(np.square(accuracies - avg)))

    return {"accuracy": avg, "delta": delta, "task_size": task_size, "iterations": len(total_stats)}