def parse_article(article, sentences, cue_verbs, poly=None):
    """
    Creates feature vectors for each sentence in the article from the raw data.

    :param article: models.Article
        A fully labeled article for which to create features.
    :param sentences: list(spaCy.Doc)
        The spaCy.Doc for each sentence in the article.
    :param cue_verbs: list(string)
        The list of all "cue verbs", which are verbs that often introduce reported speech.
    :param poly: sklearn.preprocessing.PolynomialFeatures
        If defined, used for feature expansion.
    """
    article_features = []
    article_labels = []

    first_token = 0
    for s_index, last_token in enumerate(article.sentences['sentences']):
        # Feature vector for this sentence, built from its tokens and
        # the in-quote flags of the tokens it spans.
        in_quotes = article.in_quotes['in_quotes'][first_token:last_token + 1]
        features = feature_extraction(sentences[s_index], cue_verbs, in_quotes)
        if poly:
            # Expand the feature vector with polynomial feature combinations,
            # then flatten it back to a 1-D vector.
            expanded = poly.fit_transform(features.reshape((-1, 1)))
            features = expanded.reshape((-1, ))

        # Consensus label: 1 if any annotator marked the sentence as a quote.
        consensus_labels, consensus_authors, _ = aggregate_label(article, s_index)

        article_features.append(features)
        article_labels.append(int(sum(consensus_labels) > 0))
        first_token = last_token + 1

    return article_features, article_labels
def load_sentence_labels(nlp):
    """
    Finds all fully labeled articles. Extracts all sentences from each article, as well as a label for
    each sentence: 0 if it doesn't contain reported speech, and 1 if it does. Assigns sentences in
    newly labeled articles to either the training or test set. Each sentence is added to the test set
    with 10% probability.

    :param nlp: spaCy.Language
        The language model used to tokenize the text.
    :return: list(string), list(int), list(list(int)), list(string), list(int), list(list(int))
        * the list of all training sentences
        * the list of all training labels
        * the list of in_quote values for each training sentence
        * the list of all testing sentences
        * the list of all testing labels
        * the list of in_quote values for each test sentence
    """
    train_sentences, train_labels, train_in_quotes = [], [], []
    test_sentences, test_labels, test_in_quotes = [], [], []

    for article in Article.objects.filter(labeled__fully_labeled=1):
        # Assign a newly labeled article to the test set with 10% probability,
        # and remember the assignment so it stays stable across runs.
        if 'test_set' not in article.labeled:
            article.labeled['test_set'] = int(np.random.random() > 0.9)
            article.save()

        # One spaCy.Doc per sentence in the article.
        sentence_docs = extract_sentence_spans(article.text, nlp)

        in_quotes_per_sentence = []
        labels_per_sentence = []
        token_start = 0
        for s_index, token_end in enumerate(article.sentences['sentences']):
            # in_quote flags for the tokens spanned by this sentence.
            in_quotes_per_sentence.append(
                article.in_quotes['in_quotes'][token_start:token_end + 1])
            # Consensus label: 1 if any annotator marked the sentence as a quote.
            consensus_labels, consensus_authors, _ = aggregate_label(article, s_index)
            labels_per_sentence.append(int(sum(consensus_labels) > 0))
            token_start = token_end + 1

        # Route the whole article's sentences to the train or test split.
        if article.labeled['test_set'] == 0:
            train_sentences.extend(sentence_docs)
            train_in_quotes.extend(in_quotes_per_sentence)
            train_labels.extend(labels_per_sentence)
        else:
            test_sentences.extend(sentence_docs)
            test_in_quotes.extend(in_quotes_per_sentence)
            test_labels.extend(labels_per_sentence)

    return train_sentences, train_labels, train_in_quotes, test_sentences, test_labels, test_in_quotes
def load_quote_authors(nlp):
    """
    Finds all sentences containing quotes, and the author of each quote.

    :param nlp: spaCy.Language
        The language model used to tokenize the text.
    :return: list(dict), list(dict)
        Lists of dicts containing training and test quotes, respectively. Keys:
            * 'article': models.Article, the article containing the quote
            * 'sentences': list(spaCy.Doc), the spaCy.Doc for each sentence in the article.
            * 'quotes': list(int), the indices of sentences that contain quotes in the article.
            * 'author': list(list(int)), the indices of the tokens of the author of the quote.
    """
    train_articles = []
    test_articles = []

    for article in Article.objects.filter(labeled__fully_labeled=1):
        # Assign a newly labeled article to the test set with 10% probability,
        # and persist the assignment.
        if 'test_set' not in article.labeled:
            article.labeled['test_set'] = int(np.random.random() > 0.9)
            article.save()

        # One spaCy.Doc per sentence in the article.
        sentence_docs = extract_sentence_spans(article.text, nlp)

        quotes, authors = [], []
        for s_index, _ in enumerate(article.sentences['sentences']):
            consensus_labels, consensus_authors, _ = aggregate_label(article, s_index)
            # Keep the sentence if any annotator marked it as reported speech.
            if sum(consensus_labels) > 0:
                quotes.append(s_index)
                authors.append(consensus_authors)

        # Only articles that actually contain quotes are included.
        if quotes:
            entry = {
                'article': article,
                'sentences': sentence_docs,
                'quotes': quotes,
                'authors': authors,
            }
            if article.labeled['test_set'] == 0:
                train_articles.append(entry)
            else:
                test_articles.append(entry)

    return train_articles, test_articles
def handle(self, *args, **options):
    """
    Exports every fully labeled article in the database to an XML file.

    For each fully labeled article, aggregates the consensus label and author
    for every sentence and writes the result to '{path}/article_{id}.xml'.

    :param options: dict
        Command options; 'path' is the directory in which the XML files are written.
    :raise CommandError: If the XML files could not be written.
    """
    path = options['path']
    try:
        for a in Article.objects.all():
            # Only export articles where every sentence has been labeled.
            if a.labeled['fully_labeled'] != 1:
                continue
            labels = []
            authors = []
            for s_id in range(len(a.sentences['sentences'])):
                sent_label, sent_authors, consensus = aggregate_label(a, s_id)
                labels.append(sent_label)
                authors.append(sent_authors)
            output_xml = database_to_xml(a, labels, authors)
            # Explicit encoding so the output does not depend on the platform default.
            with open(f'{path}/article_{a.id}.xml', 'w', encoding='utf-8') as f:
                f.write(output_xml)
    except IOError as err:
        # Chain the original error so the root cause stays visible in the traceback.
        raise CommandError('Articles could not be extracted. IO Error.') from err
def baseline_quote_detection(nlp):
    """
    Evaluates the baseline model for quote detection.

    :param nlp: spaCy.Language
        The language model used to tokenize the text.
    :return: Results
        The accuracy, precision, recall and F1 score of the baseline model
        on the training articles.
    """
    y = []
    y_pred = []
    train_articles, train_sentences, _, _ = load_labeled_articles(nlp)
    for index, article in enumerate(train_articles):
        article_sentences = train_sentences[index]
        sentence_start = 0
        for sentence_index, end in enumerate(article.sentences['sentences']):
            # True label: 1 if any annotator marked the sentence as a quote.
            sentence_labels, sentence_authors, _ = aggregate_label(
                article, sentence_index)
            true_value = int(sum(sentence_labels) > 0)
            y.append(true_value)

            # Baseline prediction from the sentence tokens and the in-quote flags.
            in_quotes = article.in_quotes['in_quotes'][sentence_start:end + 1]
            prediction = predict_sentence(article_sentences[sentence_index],
                                          in_quotes)
            y_pred.append(prediction)
            sentence_start = end + 1

    # Guard against a ZeroDivisionError when there are no labeled sentences.
    accuracy = np.sum(np.equal(y, y_pred)) / len(y) if y else 0.0
    precision, recall, f1, _ = precision_recall_fscore_support(
        y, y_pred, zero_division=0, average='binary')
    scores = Results()
    scores.add_scores({
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    })
    return scores
def handle(self, *args, **options):
    """
    Prints labeling statistics for each news source: how many articles are
    labeled, how they split between training and test sets, and how many
    labeled sentences contain quotes.
    """
    sources = ('Heidi.News', 'Parisien', 'Republique')
    # Per-source counters.
    articles_count = {source: 0 for source in sources}     # total articles
    labeled_articles = {source: 0 for source in sources}   # fully labeled articles
    train_articles = {source: 0 for source in sources}     # labeled articles in the train set
    test_articles = {source: 0 for source in sources}      # labeled articles in the test set
    labeled_sentences = {source: 0 for source in sources}  # sentences in labeled articles
    sentences_count = {source: 0 for source in sources}    # total sentences
    labeled_quotes = {source: 0 for source in sources}     # labeled sentences that are quotes

    for a in Article.objects.all():
        articles_count[a.source] += 1
        sentences_count[a.source] += len(a.sentences['sentences'])
        if a.labeled['fully_labeled'] == 1:
            labeled_articles[a.source] += 1

            # Assign the article to the test set with 10% probability if not yet done.
            if 'test_set' not in a.labeled:
                a.labeled['test_set'] = int(random() > 0.9)
                a.save()
            if a.labeled['test_set'] == 0:
                train_articles[a.source] += 1
            else:
                test_articles[a.source] += 1

            # Count how many labeled sentences are quotes.
            for sentence_index, end in enumerate(a.sentences['sentences']):
                labeled_sentences[a.source] += 1
                sentence_labels, sentence_authors, _ = aggregate_label(
                    a, sentence_index)
                if sum(sentence_labels) > 0:
                    labeled_quotes[a.source] += 1

    def form_string(base_string, article_source):
        # BUG FIX: the first column previously used the free variable `source`
        # captured from the printing loop instead of the `article_source`
        # parameter; it only worked by accident.
        return base_string.format(
            article_source,
            f'{labeled_articles[article_source]}/{articles_count[article_source]}',
            f'{train_articles[article_source]}/{labeled_articles[article_source]}',
            f'{test_articles[article_source]}/{labeled_articles[article_source]}',
            f'{labeled_sentences[article_source]}/{sentences_count[article_source]}',
            f'{labeled_quotes[article_source]}/?')

    base = '{:^15} | {:^15} | {:^15} | {:^15} | {:^15} | {:^15}'
    print('\n')
    print(
        base.format('Source', 'Articles', 'Train', 'Test', 'Sentences',
                    'Quotes'))
    print(f'{110 * "-"}')
    for source in labeled_articles.keys():
        print(form_string(base, source))

    # Totals across all sources. Distinct names avoid shadowing the per-source
    # dicts with ints, and sum() replaces the reduce-with-lambda idiom.
    all_articles = sum(articles_count.values())
    total_labeled = sum(labeled_articles.values())
    training_articles = sum(train_articles.values())
    testing_articles = sum(test_articles.values())
    all_sentences = sum(sentences_count.values())
    total_labeled_sentences = sum(labeled_sentences.values())
    all_quotes = sum(labeled_quotes.values())
    print(base.format('', '', '', '', '', ''))
    print(
        base.format('Total', f'{total_labeled}/{all_articles}',
                    f'{training_articles}/{total_labeled}',
                    f'{testing_articles}/{total_labeled}',
                    f'{total_labeled_sentences}/{all_sentences}',
                    f'{all_quotes}/?'))
    print('\n')