Ejemplo n.º 1
0
    def test_get_df_bible(self):
        """Check get_df_bible() against the four-verse CSV fixture.

        ``pd`` inside ``src.dataloader`` is patched so that its ``read_csv``
        hands back the fixture frame; the result of ``get_df_bible()`` must
        then equal the same fixture without its "Unnamed: 0" column.
        """
        fixture_path = "test/csv_test/test_bible_4_verse.csv"
        with patch("src.dataloader.pd") as mock_pd:
            # The real pandas (imported by this test module) loads the fixture
            # that the patched read_csv will return.
            mock_pd.read_csv.return_value = pd.read_csv(fixture_path)

            # Independent copy of the fixture, minus the column under test.
            expected = pd.read_csv(fixture_path).drop(["Unnamed: 0"], axis=1)

            assert_frame_equal(get_df_bible(), expected)
Ejemplo n.º 2
0
def loadBible(testament):
    """Return the requested part of the bible data frame.

    :type testament: str
    :param testament: "new" or "old" to select one testament; any other
        value prints a notice and returns the complete data frame
    :return: the data frame from dataloader.get_df_bible(), or the matching
        half returned by dataloader.get_old_new_testament()
    """
    df_bible = dataloader.get_df_bible()

    # Guard clause: unknown selectors fall back to the unsplit frame.
    if testament not in ("new", "old"):
        print("testament not recognized, continued with the hole bible")
        return df_bible

    # get_old_new_testament returns the pair (old, new).
    old_part, new_part = dataloader.get_old_new_testament(df_bible)
    return new_part if testament == "new" else old_part
Ejemplo n.º 3
0
def extract_characters(df=None,
                       rule_based_matching=True,
                       use_bible_patterns=True,
                       patterns=None,
                       export_csv=True,
                       csv_name="src/csv/bibleTA_characters.csv"):
    """Extract character names from the text with spaCy's entity recognition.

    Loads the ``en_core_web_sm`` pipeline, optionally extends it with an
    EntityRuler, and passes the data frame and pipeline to
    ``add_character_column``.

    :type df: pandas.core.frame.DataFrame
    :param df: DataFrame containing a text column; when None, the frame from
        dataloader.get_df_bible() is used
    :type rule_based_matching: bool
    :param rule_based_matching: add an EntityRuler to the pipeline. Only has
        an effect together with "use_bible_patterns" or "patterns"
    :type use_bible_patterns: bool
    :param use_bible_patterns: load the ruler patterns from
        "src/patterns.jsonl" (names of bible characters including synonyms).
        Takes precedence over "patterns"; "rule_based_matching" must be True
    :type patterns: list
    :param patterns: custom patterns for the EntityRuler, used only when
        "use_bible_patterns" is False; "rule_based_matching" must be True
    :type export_csv: bool
    :param export_csv: save df as csv on disk
    :type csv_name: string
    :param csv_name: file name for csv output
    :return: df after processing by add_character_column (expected to carry
        a "characters" column)
    :rtype: pandas.core.frame.DataFrame
    """
    if df is None:
        df = dataloader.get_df_bible()
    nlp = en_core_web_sm.load()

    if rule_based_matching:
        if use_bible_patterns:
            # Bundled patterns; these ruler matches overwrite model entities.
            bible_ruler = EntityRuler(nlp, validate=True, overwrite_ents=True)
            nlp.add_pipe(bible_ruler.from_disk("src/patterns.jsonl"))
        elif patterns is not None:
            # Caller-supplied patterns with the ruler's default settings.
            custom_ruler = EntityRuler(nlp)
            custom_ruler.add_patterns(patterns)
            nlp.add_pipe(custom_ruler)

    add_character_column(df, nlp)

    if export_csv:
        df.to_csv(csv_name, encoding="utf-8", index=False)

    return df
Ejemplo n.º 4
0
def get_text_for_chapter(chapter, book_id):
    """Return the text of one chapter as a single string.

    Filters the bible data frame down to the rows whose "chapter" and
    "book_id" columns match the arguments and joins their "text" values with
    single spaces, ready for the keyword extraction algorithm.

    :type chapter: int
    :type book_id: str
    :param chapter: the chapter for which text should be selected, e. g. 1
    :param book_id: the book in which this chapter is contained, e. g. Gen (Genesis)
    :return: the text values of the selected rows as a concatenated string
    :rtype: str
    """
    frame = dataloader.get_df_bible()
    in_chapter = frame["chapter"] == chapter
    in_book = frame["book_id"] == book_id
    verses = frame.loc[in_chapter & in_book, "text"]
    return " ".join(verses.tolist())
Ejemplo n.º 5
0
def main(testament="both"):
    """Assemble the emotion data frame and run the graph evaluation.

    Steps, each reusing a cached CSV under "src/csv" when it already exists:

    1. load or compute the pre-processed frame ("bibleTA_prepro.csv"),
    2. load or compute the character frame ("bibleTA_characters.csv"),
    3. join both via join_df.main() into "bibleTA_emotion.csv",
    4. optionally reduce the result to one testament,
    5. call eval_graph.main() on the result.

    :type testament: str
    :param testament: "new" or "old" restricts the joined frame to that
        testament; any other value (default "both") keeps the whole frame
    """
    os.makedirs("src/csv", exist_ok=True)

    # Reuse the cached pre-processing result if present, otherwise build it
    # with preprocess_emotion.main().
    if not exists("src/csv/bibleTA_prepro.csv"):
        # NOTE(review): this output path lacks the "src/" prefix used for the
        # same file everywhere else in this function - confirm that
        # preprocess_emotion.main() resolves it relative to "src".
        df_bible = preprocess_emotion.main("both", None,
                                           "csv/bibleTA_prepro.csv")
    else:
        df_bible = pd.read_csv("src/csv/bibleTA_prepro.csv")

    if not exists("src/csv/bibleTA_characters.csv"):
        df_raw_bible = dataloader.get_df_bible()
        df_resolved = character_extractor.coreference_resolution(df_raw_bible)
        df_characters = character_extractor.extract_characters(df_resolved)
    else:
        df_characters = pd.read_csv("src/csv/bibleTA_characters.csv")

    # Unite the two data frames, since calculating "bibleTA_prepro.csv" takes
    # a substantial amount of time.
    df_bible = join_df.main(character_csv="src/csv/bibleTA_characters.csv",
                            relation_csv="src/csv/bibleTA_prepro.csv",
                            out_csv="src/csv/bibleTA_emotion.csv",
                            df_bible=df_bible,
                            df_characters=df_characters)

    # If a specific testament is given, reduce the data frame to it.
    if testament == "new":
        _, df_bible = dataloader.get_old_new_testament(df_bible)
    elif testament == "old":
        df_bible, _ = dataloader.get_old_new_testament(df_bible)

    # Determine the graph, create pickle objects and run a clustering of
    # keywords.
    eval_graph.main(threshold_getgraph=5,
                    num_cluster=4,
                    threshold_getcluster=(1 / 6),
                    file="src/csv/bibleTA_emotion.csv",
                    load=True,
                    df_bible=df_bible)
from unittest import TestCase
import pandas as pd
from pandas._testing import assert_frame_equal
import spacy
import os
import en_core_web_sm

import src.character_extractor as character_extractor
import src.dataloader as dataloader

# Functions that are not tested here directly are called by at least one of the tested functions

# Module-level fixtures: loaded once at import time and used by the test class
# below.
df_bible = dataloader.get_df_bible()
# Only the first 42 rows are used as test input (presumably to keep the
# coreference runs short - confirm).
df_test = df_bible.head(42)


class TestUserFunctions(TestCase):
    """Tests for the user-facing functions of character_extractor."""

    def test_coreference_resolution_on_different_parameters(self):
        """coreference_resolution yields the same columns for varying ranges.

        Runs the function on the shared 42-row frame with the default range,
        with a later start line, and with several out-of-range or inverted
        line arguments, then compares the resulting column labels.
        """
        df_resolved = character_extractor.coreference_resolution(df_test)
        df_resolved_start_20 = character_extractor.coreference_resolution(
            df_test, start_line=20)
        df_resolved_invalid_1 = character_extractor.coreference_resolution(
            df_test, start_line=20, end_line=10)
        # These two results are not inspected by the visible assertions; the
        # calls are kept so that an exception on out-of-range arguments still
        # fails the test.
        character_extractor.coreference_resolution(df_test, start_line=50)
        character_extractor.coreference_resolution(df_test, end_line=50)

        # Compare the labels as plain lists: assertEqual on two pandas Index
        # objects evaluates the truth value of an element-wise comparison,
        # which raises ValueError for more than one column.
        expected_columns = list(df_resolved.columns)
        self.assertListEqual(expected_columns,
                             list(df_resolved_start_20.columns))
        self.assertListEqual(expected_columns,
                             list(df_resolved_invalid_1.columns))
Ejemplo n.º 7
0
def get_text_for_character(character_name):
    """Selects text relevant for character

    This function selects rows of a DataFrame (the bible data frame) for specified character by choosing those verses
    in which the character's name appears. For every occurrence of the name it then reduces or expands the relevant
    context to the ten words directly before and the ten words directly after the name.
    Missing context is filled by looking up the previous/following verses by index in the bible data frame.

    Words are counted by splitting on single spaces. The name is matched as a literal, case-sensitive substring.

    :type character_name: str
    :param character_name: the character for which verses should be selected, e. g. "Jesus"
    :return: the context windows of all occurrences as a concatenated string; an empty string if
        character_name is empty or never occurs
    :rtype: str
    """
    # An empty name cannot be used as a separator for str.split().
    if not character_name:
        return ""

    context_words = 10
    bible_df = dataloader.get_df_bible()
    # regex=False keeps the row selection consistent with the literal
    # str.split() below (names containing regex metacharacters would otherwise
    # select different rows); na=False skips rows without text.
    mentions = bible_df["text"].str.contains(character_name,
                                             regex=False,
                                             na=False)
    character_texts = []

    # NOTE(review): the neighbour lookups assume the frame has the default
    # 0..n-1 integer index - confirm against dataloader.get_df_bible().
    verse_count = len(bible_df.index)

    for i, verse in bible_df.loc[mentions, "text"].items():
        # "He was Jesus and Jesus was good" -> ["He was ", " and ", " was good"]
        split_verse = verse.split(character_name)

        # One context window per occurrence of the name inside the verse.
        for index in range(len(split_verse) - 1):
            # Everything in the verse left / right of this occurrence.
            context_before = character_name.join(split_verse[:index + 1])
            context_after = character_name.join(split_verse[index + 1:])

            # Prepend earlier verses (down to and including verse 0) until
            # enough words are available.
            added_verses = 1
            while (len(context_before.split(" ")) < context_words
                   and i - added_verses >= 0):
                context_before = (bible_df.loc[i - added_verses, "text"] +
                                  " " + context_before)
                added_verses += 1

            # Append later verses until enough words are available.
            added_verses = 1
            while (len(context_after.split(" ")) < context_words
                   and i + added_verses < verse_count):
                context_after += " " + bible_df.loc[i + added_verses, "text"]
                added_verses += 1

            # Keep the words closest to the name: the last ones before it and
            # the first ones after it. Shorter contexts pass through unchanged.
            context_before = " ".join(
                context_before.split(" ")[-context_words:])
            context_after = " ".join(context_after.split(" ")[:context_words])

            character_texts.append(context_before + " " + character_name +
                                   " " + context_after)

    return " ".join(character_texts)