Code Example #1
def parse_article(article, sentences, cue_verbs, poly=None):
    """
    Creates feature vectors for each sentence in the article from the raw data.

    :param article: models.Article
        A fully labeled article for which to create features.
    :param sentences: list(spaCy.Doc)
        The spaCy.Doc for each sentence in the article.
    :param cue_verbs: list(string)
        The list of all "cue verbs", which are verbs that often introduce reported speech.
    :param poly: sklearn.preprocessing.PolynomialFeatures
        If defined, used for feature expansion.
    """
    article_features = []
    article_labels = []
    sentence_start = 0
    for sentence_index, end in enumerate(article.sentences['sentences']):
        # Compute sentence features
        sentence = sentences[sentence_index]
        in_quotes = article.in_quotes['in_quotes'][sentence_start:end + 1]
        features = feature_extraction(sentence, cue_verbs, in_quotes)
        if poly:
            features = poly.fit_transform(features.reshape((-1, 1))).reshape(
                (-1, ))
        # Compute sentence label
        sentence_labels, sentence_authors, _ = aggregate_label(
            article, sentence_index)
        label = int(sum(sentence_labels) > 0)
        # Add the sentence's feature vector and label to the dataset
        article_features.append(features)
        article_labels.append(label)
        sentence_start = end + 1

    return article_features, article_labels
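A minimal usage sketch for parse_article (hypothetical: the spaCy model name and the article and cue_verbs objects stand in for whatever the surrounding project supplies):

# Hypothetical usage sketch; `article` and `cue_verbs` are assumed to come
# from the surrounding project (a models.Article instance and the cue-verb list).
import spacy
from sklearn.preprocessing import PolynomialFeatures

nlp = spacy.load('fr_core_news_md')  # illustrative model name; the project may use another
sentence_docs = extract_sentence_spans(article.text, nlp)
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features, labels = parse_article(article, sentence_docs, cue_verbs, poly=poly)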
Code Example #2
def load_sentence_labels(nlp):
    """
    Finds all fully labeled articles. Extracts all sentences from each article, as well as a label for each sentence: 0
    if it doesn't contain reported speech, and 1 if it does. Assigns each newly labeled article to either the training
    or the test set: an article, and therefore all of its sentences, goes to the test set with 10% probability.

    :param nlp: spaCy.Language
        The language model used to tokenize the text.
    :return: list(spaCy.Doc), list(int), list(list(int)), list(spaCy.Doc), list(int), list(list(int))
        * the list of all training sentences
        * the list of all training labels
        * the list of in_quote values for each training sentence
        * the list of all testing sentences
        * the list of all testing labels
        * the list of in_quote values for each test sentence
    """
    articles = Article.objects.filter(labeled__fully_labeled=1)
    train_sentences = []
    train_labels = []
    train_in_quotes = []
    test_sentences = []
    test_labels = []
    test_in_quotes = []
    for article in articles:
        start = 0
        # Check if the article already has its sentences assigned to the test or training set.
        if 'test_set' not in article.labeled:
            article.labeled['test_set'] = int(np.random.random() > 0.9)
            article.save()
        # The spaCy.Doc object for each sentence in the article.
        article_sentence_docs = extract_sentence_spans(article.text, nlp)
        # The in_quotes list for each sentence in the article
        article_in_quotes = []
        # The label for each sentence in the article
        article_labels = []
        for sentence_index, end in enumerate(article.sentences['sentences']):
            # Extract the in_quote list for the sentence
            in_quotes = article.in_quotes['in_quotes'][start:end + 1]
            article_in_quotes.append(in_quotes)
            # Compute consensus labels
            sentence_labels, sentence_authors, _ = aggregate_label(
                article, sentence_index)
            article_labels.append(int(sum(sentence_labels) > 0))
            start = end + 1

        # Adds the data to the correct list
        if article.labeled['test_set'] == 0:
            train_sentences += article_sentence_docs
            train_in_quotes += article_in_quotes
            train_labels += article_labels
        else:
            test_sentences += article_sentence_docs
            test_in_quotes += article_in_quotes
            test_labels += article_labels

    return train_sentences, train_labels, train_in_quotes, test_sentences, test_labels, test_in_quotes
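A sketch of how the function above might be called (the model name is illustrative; a French model is assumed since the articles come from French-language sources):

# Hypothetical usage sketch for load_sentence_labels.
import spacy

nlp = spacy.load('fr_core_news_md')  # illustrative model name
(train_sentences, train_labels, train_in_quotes,
 test_sentences, test_labels, test_in_quotes) = load_sentence_labels(nlp)
print(f'train: {len(train_sentences)} sentences, test: {len(test_sentences)} sentences')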
Code Example #3
def load_quote_authors(nlp):
    """
    Finds all sentences containing quotes, and the author of each quote.

    :param nlp: spaCy.Language
        The language model used to tokenize the text.
    :return: list(dict), list(dict)
        Lists of dicts containing training and test quotes, respectively. Keys:
            * 'article': models.Article, the article containing the quote
            * 'sentences': list(spaCy.Doc), the spaCy.Doc for each sentence in the article.
            * 'quotes': list(int), the indices of sentences that contain quotes in the article.
            * 'authors': list(list(int)), for each quote, the indices of the tokens of its author.
    """
    articles = Article.objects.filter(labeled__fully_labeled=1)
    # list of training articles
    train_articles = []
    # list of test articles
    test_articles = []
    for article in articles:
        # Check if the article already has its sentences assigned to the test or training set.
        if 'test_set' not in article.labeled:
            article.labeled['test_set'] = int(np.random.random() > 0.9)
            article.save()
        # The spaCy.Doc object for each sentence in the article.
        article_sentence_docs = extract_sentence_spans(article.text, nlp)
        quotes = []
        authors = []
        for sentence_index, end in enumerate(article.sentences['sentences']):
            # Compute consensus labels
            sentence_labels, sentence_authors, _ = aggregate_label(
                article, sentence_index)
            # Check if the sentence contains reported speech. If it does, add to the training or test set.
            if int(sum(sentence_labels) > 0):
                quotes.append(sentence_index)
                authors.append(sentence_authors)

        if len(quotes) > 0:
            article_quotes = {
                'article': article,
                'sentences': article_sentence_docs,
                'quotes': quotes,
                'authors': authors,
            }
            if article.labeled['test_set'] == 0:
                train_articles.append(article_quotes)
            else:
                test_articles.append(article_quotes)

    return train_articles, test_articles
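A sketch of how the returned dictionaries might be consumed (illustrative only; it assumes the entries carry the keys listed in the docstring above):

# Hypothetical usage sketch for load_quote_authors.
# `nlp` is the spaCy language model, loaded as in the sketches above.
train_articles, test_articles = load_quote_authors(nlp)
for entry in train_articles:
    for quote_index, author_tokens in zip(entry['quotes'], entry['authors']):
        sentence = entry['sentences'][quote_index]
        # author_tokens holds the token indices of the quote's author in the article
        print(entry['article'].id, sentence.text, author_tokens)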
Code Example #4
    def handle(self, *args, **options):
        path = options['path']
        try:
            articles = Article.objects.all()
            for a in articles:
                # Only export articles that have been fully labeled
                if a.labeled['fully_labeled'] == 1:
                    labels = []
                    authors = []
                    # Compute the consensus labels and authors for each sentence
                    for s_id in range(len(a.sentences['sentences'])):
                        sent_label, sent_authors, consensus = aggregate_label(
                            a, s_id)
                        labels.append(sent_label)
                        authors.append(sent_authors)
                    # Convert the article and its labels to XML and write it to disk
                    output_xml = database_to_xml(a, labels, authors)
                    with open(f'{path}/article_{a.id}.xml', 'w') as f:
                        f.write(output_xml)

        except IOError:
            raise CommandError('Articles could not be extracted. IO Error.')
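For options['path'] to be populated in handle, the management command presumably declares the argument in add_arguments; a minimal sketch (the help text is illustrative):

    def add_arguments(self, parser):
        # 'path' must be declared for options['path'] to be available in handle()
        parser.add_argument('path', help='Directory in which one XML file per article is written.')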
Code Example #5
def baseline_quote_detection(nlp):
    """
    Evaluates the baseline model for quote detection.

    :param nlp: spaCy.Language
        The language model used to tokenize the text.
    :return: Results
        The accuracy, precision, recall and F1 score of the baseline model.
    """
    y = []
    y_pred = []
    train_articles, train_sentences, _, _ = load_labeled_articles(nlp)
    for index, article in enumerate(train_articles):
        article_sentences = train_sentences[index]
        sentence_start = 0
        for sentence_index, end in enumerate(article.sentences['sentences']):
            sentence_labels, sentence_authors, _ = aggregate_label(
                article, sentence_index)
            true_value = int(sum(sentence_labels) > 0)
            y.append(true_value)

            in_quotes = article.in_quotes['in_quotes'][sentence_start:end + 1]
            prediction = predict_sentence(article_sentences[sentence_index],
                                          in_quotes)
            y_pred.append(prediction)

            sentence_start = end + 1

    accuracy = np.sum(np.equal(y, y_pred)) / len(y)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y, y_pred, zero_division=0, average='binary')
    scores = Results()
    scores.add_scores({
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    })
    return scores
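A sketch of how the baseline evaluation might be run (the model name is illustrative, and the print assumes the project's Results class has a readable string representation):

# Hypothetical usage sketch for baseline_quote_detection.
import spacy

nlp = spacy.load('fr_core_news_md')  # illustrative model name
scores = baseline_quote_detection(nlp)
print(scores)  # accuracy, precision, recall and F1 of the baseline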
Code Example #6
    def handle(self, *args, **options):
        # Number of total articles for each source
        articles_count = {'Heidi.News': 0, 'Parisien': 0, 'Republique': 0}
        # Number of labeled articles for each source
        labeled_articles = {'Heidi.News': 0, 'Parisien': 0, 'Republique': 0}
        # Number of labeled articles in the training set for each source
        train_articles = {'Heidi.News': 0, 'Parisien': 0, 'Republique': 0}
        # Number of labeled articles in the test set for each source
        test_articles = {'Heidi.News': 0, 'Parisien': 0, 'Republique': 0}
        # Number of labeled sentences for each source
        labeled_sentences = {'Heidi.News': 0, 'Parisien': 0, 'Republique': 0}
        # Number of total sentences for each source
        sentences_count = {'Heidi.News': 0, 'Parisien': 0, 'Republique': 0}
        # Number of labeled sentences that are quotes for each source
        labeled_quotes = {'Heidi.News': 0, 'Parisien': 0, 'Republique': 0}

        articles = Article.objects.all()
        for a in articles:
            articles_count[a.source] += 1
            sentences_count[a.source] += len(a.sentences['sentences'])
            if a.labeled['fully_labeled'] == 1:
                labeled_articles[a.source] += 1

                # Check if the article is in the training or test set.
                if 'test_set' not in a.labeled:
                    a.labeled['test_set'] = int(random() > 0.9)
                    a.save()
                if a.labeled['test_set'] == 0:
                    train_articles[a.source] += 1
                else:
                    test_articles[a.source] += 1

                # Check if each sentence is a quote or not.
                for sentence_index, end in enumerate(a.sentences['sentences']):
                    labeled_sentences[a.source] += 1
                    # Compute consensus labels
                    sentence_labels, sentence_authors, _ = aggregate_label(
                        a, sentence_index)
                    # If the sentence is a quote, add the number of quotes to the source
                    if sum(sentence_labels) > 0:
                        labeled_quotes[a.source] += 1

        def form_string(base_string, article_source):
            return base_string.format(
                article_source,
                f'{labeled_articles[article_source]}/{articles_count[article_source]}',
                f'{train_articles[article_source]}/{labeled_articles[article_source]}',
                f'{test_articles[article_source]}/{labeled_articles[article_source]}',
                f'{labeled_sentences[article_source]}/{sentences_count[article_source]}',
                f'{labeled_quotes[article_source]}/?')

        base = '{:^15} | {:^15} | {:^15} | {:^15} | {:^15} | {:^15}'

        print('\n')
        print(
            base.format('Source', 'Articles', 'Train', 'Test', 'Sentences',
                        'Quotes'))
        print(f'{110 * "-"}')
        for source in labeled_articles.keys():
            print(form_string(base, source))

        all_articles = reduce(lambda x, y: x + y, articles_count.values())
        labeled_articles = reduce(lambda x, y: x + y,
                                  labeled_articles.values())
        training_articles = reduce(lambda x, y: x + y, train_articles.values())
        testing_articles = reduce(lambda x, y: x + y, test_articles.values())
        all_sentences = reduce(lambda x, y: x + y, sentences_count.values())
        labeled_sentences = reduce(lambda x, y: x + y,
                                   labeled_sentences.values())
        all_quotes = reduce(lambda x, y: x + y, labeled_quotes.values())
        print(base.format('', '', '', '', '', ''))
        print(
            base.format('Total', f'{labeled_articles}/{all_articles}',
                        f'{training_articles}/{labeled_articles}',
                        f'{testing_articles}/{labeled_articles}',
                        f'{labeled_sentences}/{all_sentences}',
                        f'{all_quotes}/?'))
        print('\n')
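A side note on the aggregation step above: reduce(lambda x, y: x + y, ...) simply sums the per-source counts, so the built-in sum is an equivalent, slightly shorter spelling, for example:

# Equivalent to reduce(lambda x, y: x + y, articles_count.values())
all_articles = sum(articles_count.values())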