Example No. 1
    def handle(self, *args, **options):
        path = options['path']
        # Default to the string 'None' when no source option is supplied.
        source = options['source'] or 'None'
        print(f'Loading data from: {path}')
        try:
            articles = []
            # Keep only the .xml files in the given directory.
            article_files = [join(path, article) for article in listdir(path)
                             if isfile(join(path, article)) and article.endswith('.xml')]
            article_files.sort()
            print('Loading language model...')
            nlp = load_nlp()
            print('Adding articles to the database...')
            for article_path in article_files:
                articles.append(add_article_to_db(article_path, nlp, source))
        except IOError:
            raise CommandError('Article could not be added. IOError.')

        self.stdout.write(self.style.SUCCESS(f'Successfully added {len(articles)} article(s).'))
Example No. 2
    def handle(self, *args, **options):

        try:
            print('\nLoading language model...'.ljust(80), end='\r')
            nlp = load_nlp()

            print('Loading cue verbs...'.ljust(80), end='\r')
            with open('data/cue_verbs.csv', 'r') as f:
                reader = csv.reader(f)
                cue_verbs = set(list(reader)[0])

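            # Run the baseline attribution on the training split (test=False) and on the test split (test=True).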
            train_acc, train_acc_lazy, train_p, train_r, train_p_lazy, train_r_lazy =\
                baseline_quote_attribution(nlp, cue_verbs, test=False)
            test_acc, test_acc_lazy, test_p, test_r, test_p_lazy, test_r_lazy = \
                baseline_quote_attribution(nlp, cue_verbs, test=True)

            print('Author Predictions'.ljust(80))
            print('\n  Baseline:')
            print(f'     Training Set Performance')
            print(f'        Accuracy of predicting true Speaker')
            print(f'            {train_acc}')
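            # F1 below is the harmonic mean of precision and recall: 2 * P * R / (P + R).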
            print(f'        Results in predicting the set of people cited in the article:\n'
                  f'            Precision: {train_p}\n'
                  f'            Recall:    {train_r}\n'
                  f'            F1:        {2 * train_p * train_r / (train_p + train_r)}\n')
            print(f'     Test Set Performance')
            print(f'        Accuracy of predicting true Speaker')
            print(f'            {test_acc}')
            print(f'        Results in predicting the set of people cited in the article:\n'
                  f'            Precision: {test_p}\n'
                  f'            Recall:    {test_r}\n'
                  f'            F1:        {2 * test_p * test_r / (test_p + test_r)}\n')

            print('\n  Lazy Baseline:')
            print(f'     Training Set Performance')
            print(f'        Accuracy of predicting true Speaker')
            print(f'            {train_acc_lazy}')
            print(f'        Results in predicting the set of people cited in the article:\n'
                  f'            Precision: {train_p_lazy}\n'
                  f'            Recall:    {train_r_lazy}\n'
                  f'            F1:        {2 * train_p_lazy * train_r_lazy / (train_p_lazy + train_r_lazy)}\n')
            print(f'     Test Set Performance')
            print(f'        Accuracy of predicting true Speaker')
            print(f'            {test_acc_lazy}')
            print(f'        Results in predicting the set of people cited in the article:\n'
                  f'            Precision: {test_p_lazy}\n'
                  f'            Recall:    {test_r_lazy}\n'
                  f'            F1:        {2 * test_p_lazy * test_r_lazy / (test_p_lazy + test_r_lazy)}\n')

            print('  ML Author Prediction')
            epochs = options['epochs']
            # Note: alpha is read from the 'exp' option, the same key used for exp_degree
            # below; a separate regularization option was probably intended here.
            alpha = options['exp']
            loss = options['loss']
            penalty = options['penalty']
            exp_degree = options['exp']
            _, train_res, test_res, train_authors, test_authors, test_lists =\
                evaluate_author_prediction_test(loss, penalty, alpha, epochs, nlp, cue_verbs, exp_degree)

            print(f'    Training Set Performance')
            print(f'        Binary Classification for NEs\n{train_res.print_average_score()}\n')
            print(f'        Predicting the people cited in an article\n{train_authors.print_average_score()}\n')
            print(f'    Test Set Performance')
            print(f'        Binary Classification for NEs\n{test_res.print_average_score()}\n')
            print(f'        Predicting the people cited in an article\n{test_authors.print_average_score()}\n')
            print(f'        Article by article performance:')
            for i, results in enumerate(test_lists):
                print(f'          Article {i}')
                print(f'            All names: {results["all"]}')
                print(f'            Cited:     {results["cited"]}')
                print(f'            Predicted: {results["predicted"]}')

        except IOError:
            raise CommandError('IO Error.')
Example No. 3
    def handle(self, *args, **options):
        folds = options['folds'] or 5
        max_epochs = options['epochs'] or 500

        losses = ['log', 'hinge']
        if options['loss'] in losses:
            losses = [options['loss']]

        log_penalties = ['l1', 'l2']
        if options['penalty'] in log_penalties:
            log_penalties = [options['penalty']]

        try:
            with open('logs.txt', 'w') as f:
                f.write(f'Evaluation:\n'
                        f'  {folds}-fold cross validation\n'
                        f'  max epochs: {max_epochs}\n\n')

            print('\nLoading language model...'.ljust(80), end='\r')
            nlp = load_nlp()

            print('Loading cue verbs...'.ljust(80), end='\r')
            with open('data/cue_verbs.csv', 'r') as f:
                reader = csv.reader(f)
                cue_verbs = set(list(reader)[0])

            # Regularization strengths to try for each loss/penalty combination.
            alphas = [0.01, 0.1]

            print('Evaluating quote detection...'.ljust(80))
            print('\n  Baseline:')
            results = baseline_quote_detection(nlp)
            print(results.print_average_score())
            with open('logs.txt', 'a') as f:
                f.write(
                    f'  Baseline quote detection:\n{results.print_average_score()}\n'
                )

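            # Grid search: for each loss/penalty pair, try every alpha and keep the best model.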
            for l in losses:
                for p in log_penalties:
                    print(f'  {p} {l}:')
                    accumulator = ResultAccumulator()
                    for alpha in alphas:
                        train_res, test_res = evaluate_quote_detection(
                            l, p, alpha, max_epochs, nlp, cue_verbs, folds)
                        with open('logs.txt', 'a') as f:
                            f.write(
                                f'  Quote detection: {p}-{l} loss, alpha={alpha}\n'
                                f'    Training results:\n{train_res.print_average_score()}\n'
                                f'    Test results:\n{test_res.print_average_score()}\n'
                            )
                        accumulator.add_results(train_res, test_res,
                                                f'alpha={alpha}')
                    train, test, name = accumulator.best_model()
                    print(f'      Best results with {name}'.ljust(80))
                    print(
                        f'        Average Training Results\n{train.print_average_score()}\n'
                        f'        Average Test Results\n{test.print_average_score()}\n\n'
                    )

            print('\n\nEvaluating speaker prediction...')
            print('\n  Baseline:')
            acc, acc_lazy, p, r, p_lazy, r_lazy = baseline_quote_attribution(
                nlp, cue_verbs)
            print(
                f'    Given a quote and a list of speakers, accuracy in predicting the true speaker:\n'
                f'        Baseline model:      {acc}\n'
                f'        Lazy Baseline model: {acc_lazy}\n')
            print(
                f'    Results in predicting the set of people cited in the article:\n'
                f'        Baseline model:\n'
                f'            Precision: {p}\n'
                f'            Recall:    {r}\n'
                f'            F1:        {2 * p * r / (p + r)}\n'
                f'        Lazy Baseline model:\n'
                f'            Precision: {p_lazy}\n'
                f'            Recall:    {r_lazy}\n'
                f'            F1:        {2 * p_lazy * r_lazy / (p_lazy + r_lazy)}\n\n'
            )

            print(
                'Speaker prediction: classifying each named entity in a text as either the author of a quote or not'
                ', but not assigning a single named entity to each quote.')
            for l in losses:
                for p in log_penalties:
                    print(f'   Speaker Prediction Results with {p}-{l} loss'.
                          ljust(80))
                    best_train_res = None
                    best_test_res = None
                    best_train_set = None
                    best_test_set = None
                    best_f1 = 0
                    best_alpha = ''
                    for alpha in alphas:
                        exp_degree = options['exp']
                        train_res, test_res, train_set, test_set = evaluate_author_prediction(
                            l, p, alpha, max_epochs, nlp, cue_verbs,
                            exp_degree, folds)

                        acc, pre, rec, f1 = test_set.average_score()
                        if f1 > best_f1:
                            best_f1 = f1
                            best_alpha = alpha
                            best_train_res = train_res
                            best_test_res = test_res
                            best_train_set = train_set
                            best_test_set = test_set

                    print(f'      Best results with alpha={best_alpha}'.ljust(
                        80))
                    print(f'      Performance in predicting each named entity'.
                          ljust(80))
                    print(
                        f'        Average Training Results\n{best_train_res.print_average_score()}\n'
                        f'        Average Test Results\n{best_test_res.print_average_score()}\n\n'
                    )
                    print(f'      Performance in predicting each author name'.
                          ljust(80))
                    print(
                        f'        Average Training Results\n{best_train_set.print_average_score()}\n'
                        f'        Average Test Results\n{best_test_set.print_average_score()}\n\n'
                    )
            """
            print('\n\nEvaluating quote attribution...')

            best_ml_f1 = 0
            best_ml_parameters = ''
            best_ml_train = None
            best_ml_test = None

            # TODO: Should this be removed?
            for ovo in [True, False]:
                if ovo:
                    print('\n  One vs One')
                    extraction_methods = 3
                else:
                    print('\n  One vs All')
                    extraction_methods = 4
                for ext_method in list(range(2, extraction_methods + 1)):
                    for l, p in list(itertools.product(losses, log_penalties)):
                        print(f'   Results with {p}-{l} loss and feature extraction {ext_method}'.ljust(80))
                        best_train = None
                        best_test = None
                        best_f1 = 0
                        best_alpha = ''
                        for alpha in alphas:
                            train_res, test_res = evaluate_quote_attribution(l, p, alpha, ext_method, max_epochs, nlp,
                                                                             cue_verbs, folds, ovo)

                            with open('logs.txt', 'a') as f:
                                f.write(f'  Quote attribution: one vs one: {ovo}, {ext_method}-feature extraction,'
                                        f' {p}-{l} loss, alpha={alpha}\n'
                                        f'{pretty_print_string(train_res, test_res)}\n')

                            if test_res['f1'] > best_f1:
                                best_f1 = test_res['f1']
                                best_alpha = alpha
                                best_train = train_res
                                best_test = test_res

                            if test_res['f1'] > best_ml_f1:
                                best_ml_f1 = test_res['f1']
                                best_ml_parameters = f'{p}-{l} loss, feature extraction {ext_method}, alpha={alpha}'
                                best_ml_train = train_res
                                best_ml_test = test_res

                        print(f'    Best results with alpha={best_alpha}'.ljust(80))
                        print(pretty_print_string(best_train, best_test))
                        print('\n\n')

            print(f'\n\n    Best results from a machine learning model: {best_ml_parameters}')
            print(pretty_print_string(best_ml_train, best_ml_test))
            

            with open('logs.txt', 'a') as f:
                f.write(f'  Best results for quote attribution: {best_ml_parameters}\n'
                        f'{pretty_print_string(best_ml_train, best_ml_test)}')
            """

        except IOError:
            raise CommandError('IO Error.')
Example No. 4
    :param request: HTTP GET request
        The user request. Must contain a 'key' parameter with the correct value for the user to become an admin.
    :return: JsonResponse
        A JSON response containing the key 'Success', with value True if the user became an admin and False otherwise.
    """
    # Get user secret key
    secret_key = request.GET.get('key')
    if secret_key == ADMIN_SECRET_KEY:
        request.session['admin'] = True
        return JsonResponse({'Success': True})

    return JsonResponse({'Success': False})
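
# A minimal usage sketch for the view above, assuming it is routed at a
# hypothetical '/become_admin/' URL and that ADMIN_SECRET_KEY matches the
# placeholder value passed below:
#
#     from django.test import Client
#
#     client = Client()
#     response = client.get('/become_admin/', {'key': 'the-admin-secret-key'})
#     assert response.json() == {'Success': True}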


nlp = load_nlp()
detector = gender_detector.Detector()


class GetCounts(APIView):
    def post(self, request):
        with open('data/cue_verbs.csv', 'r') as f:
            reader = csv.reader(f)
            cue_verbs = set(list(reader)[0])

        template = """<?xml version='1.0' encoding='utf-8'?>
            <article>
               <titre></titre>
               <p>{}</p>
            </article>"""
Example No. 5
    def handle(self, *args, **options):
        max_epochs = options['epochs']
        qd_loss = options['qd_loss']
        qd_penalty = options['qd_penalty']
        qd_alpha = options['qd_reg']

        ap_loss = options['ap_loss']
        ap_penalty = options['ap_penalty']
        ap_alpha = options['ap_reg']

        try:
            print('\nLoading language model...\n')
            nlp = load_nlp()
            with open('data/cue_verbs.csv', 'r') as f:
                reader = csv.reader(f)
                cue_verbs = set(list(reader)[0])

            print('Training quote detection...')
            qd_ed = quote_detection_poly_degree
            qd_trained_model = train_quote_detection(qd_loss, qd_penalty,
                                                     qd_alpha, max_epochs, nlp,
                                                     cue_verbs, qd_ed)
            save_model(qd_trained_model, path_quote_detection_weights)
            print(f'Saved trained model at {path_quote_detection_weights}\n')

            print("Training author prediction...")
            ap_ed = author_prediction_poly_degree
            ap_trained_model, _, _, _, _, _ =\
                evaluate_author_prediction_test(ap_loss, ap_penalty, ap_alpha, max_epochs, nlp, cue_verbs, ap_ed)
            save_model(ap_trained_model, path_author_attribution_weights)
            print(f'Saved trained model at {path_author_attribution_weights}\n')

            print('Evaluating all unlabeled quotes...')
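            # The quote detection model returns probabilities only for the log loss;
            # with the hinge loss it returns signed distances to the separating hyperplane.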
            proba = qd_loss == 'log'
            articles, sentences, in_quotes = load_unlabeled_sentences(nlp)
            max_hinge_value = 0.00001
            confidences = []
            predictions = []
            for article, article_sentences, article_in_quotes in zip(
                    articles, sentences, in_quotes):
                probabilities = evaluate_unlabeled_sentences(qd_trained_model,
                                                             article_sentences,
                                                             cue_verbs,
                                                             article_in_quotes,
                                                             proba=proba)
                if proba:
                    # Map the probability that a sentence is a quote to a confidence:
                    #   * probability is 0.5: model has no clue, confidence 0
                    #   * probability is 0 or 1: model knows, confidence 1
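                    #   e.g. probability 0.9 -> confidence 2 * |0.5 - 0.9| = 0.8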
                    confidence = [
                        2 * abs(0.5 - prob) for prob in probabilities
                    ]
                    confidences.append(confidence)
                    prediction = [round(prob) for prob in probabilities]
                    predictions.append(prediction)
                else:
                    # When using hinge loss, the confidence is the distance to the separating hyperplane
                    # Take the log to reduce the effect of very large values
                    confidence = [np.log(abs(prob)) for prob in probabilities]
                    confidences.append(confidence)
                    prediction = [int(prob > 0) for prob in probabilities]
                    predictions.append(prediction)
                    max_hinge_value = max(max_hinge_value, max(confidence))

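            # Second pass: normalize the hinge-loss confidences by the largest observed
            # value, then store each article's confidences and predictions.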
            for article, confidence, prediction in zip(articles, confidences,
                                                       predictions):
                if not proba:
                    confidence = [
                        conf / max_hinge_value for conf in confidence
                    ]
                # For sentences in the article that are fully labeled, the confidence is 1
                new_confidences = [
                    max(label, conf) for label, conf in zip(
                        article.labeled['labeled'], confidence)
                ]
                change_confidence(article.id, new_confidences, prediction)

            print('Done\n')

        except IOError:
            raise CommandError('IO Error.')