def test_get_df_bible(self):
    """Check ``get_df_bible`` against the four-verse fixture CSV.

    ``pd`` inside ``src.dataloader`` is patched so that ``read_csv`` hands
    back the fixture frame; the result must equal the same fixture with its
    "Unnamed: 0" column dropped.
    """
    fixture_csv = "test/csv_test/test_bible_4_verse.csv"
    with patch("src.dataloader.pd") as mock_pd:
        # What the patched loader will receive from read_csv.
        mock_pd.read_csv.return_value = pd.read_csv(fixture_csv)
        # Independent second read, so the expected frame shares no state
        # with the frame handed to the loader.
        expected = pd.read_csv(fixture_csv).drop(["Unnamed: 0"], axis=1)
        assert_frame_equal(get_df_bible(), expected)
def loadBible(testament):
    """Return the part of the bible data frame selected by ``testament``.

    :type testament: str
    :param testament: "new" or "old" picks the matching element of the
        (old, new) pair returned by ``dataloader.get_old_new_testament``;
        any other value prints a notice and keeps the complete frame
    :return: the selected data frame
    """
    complete_bible = dataloader.get_df_bible()

    # Guard clause: anything other than the two known values falls back to
    # the complete frame.
    if testament not in ("new", "old"):
        print("testament not recognized, continued with the hole bible")
        return complete_bible

    old_part, new_part = dataloader.get_old_new_testament(complete_bible)
    return new_part if testament == "new" else old_part
def extract_characters(df=None,
                       rule_based_matching=True,
                       use_bible_patterns=True,
                       patterns=None,
                       export_csv=True,
                       csv_name="src/csv/bibleTA_characters.csv"):
    """Extract character names from a text frame with spaCy's Named Entity Recognition.

    :type df: pandas.core.frame.DataFrame
    :param df: DataFrame containing a text column; when None the frame from
        ``dataloader.get_df_bible()`` is used
    :type rule_based_matching: bool
    :param rule_based_matching: add a spaCy EntityRuler with extra matching
        rules. Only has an effect together with "use_bible_patterns" or
        "patterns"
    :type use_bible_patterns: bool
    :param use_bible_patterns: load the rules from "src/patterns.jsonl"
        (names of most bible characters including synonyms). Requires
        "rule_based_matching"; takes precedence over "patterns"
    :type patterns: list
    :param patterns: custom patterns for the EntityRuler. Requires
        "rule_based_matching" and is only used when "use_bible_patterns"
        is False
    :type export_csv: bool
    :param export_csv: save df as csv on disk
    :type csv_name: string
    :param csv_name: file name for csv output
    :return: df with "characters" column
    :rtype: pandas.core.frame.DataFrame
    """
    if df is None:
        df = dataloader.get_df_bible()

    nlp = en_core_web_sm.load()

    # Decide which ruler (if any) to build, then register it in one place.
    entity_ruler = None
    if rule_based_matching:
        if use_bible_patterns:
            entity_ruler = EntityRuler(
                nlp, validate=True,
                overwrite_ents=True).from_disk("src/patterns.jsonl")
        elif patterns is not None:
            entity_ruler = EntityRuler(nlp)
            entity_ruler.add_patterns(patterns)
    if entity_ruler is not None:
        nlp.add_pipe(entity_ruler)

    add_character_column(df, nlp)

    if export_csv:
        df.to_csv(csv_name, encoding="utf-8", index=False)
    return df
def get_text_for_chapter(chapter, book_id):
    """Collect the text of one chapter as a single string.

    Picks the rows of the bible data frame whose "chapter" and "book_id"
    columns match the arguments and joins their "text" values with single
    spaces, for further use by the keyword extraction algorithm.

    :type chapter: int
    :type book_id: str
    :param chapter: the chapter for which text should be selected, e. g. 1
    :param book_id: the book in which this chapter is contained, e. g. Gen
        (Genesis)
    :return: the text values of the selected rows as a concatenated string
    :rtype: str
    """
    bible_df = dataloader.get_df_bible()
    in_chapter = bible_df["chapter"] == chapter
    in_book = bible_df["book_id"] == book_id
    verses = bible_df.loc[in_chapter & in_book]["text"].tolist()
    return " ".join(verses)
def main(testament="both"):
    """Run the pipeline: prepare the CSVs, join them and evaluate the graph.

    Steps, in order:

    1. Make sure the "src/csv" directory exists.
    2. Load "src/csv/bibleTA_prepro.csv", or create the frame through
       ``preprocess_emotion.main`` when the file is missing.
    3. Load "src/csv/bibleTA_characters.csv", or create the frame through
       coreference resolution and character extraction when the file is
       missing.
    4. Join both frames with ``join_df.main``.
    5. Optionally reduce the joined frame to one testament.
    6. Call ``eval_graph.main`` on the result.

    :type testament: str
    :param testament: "new" or "old" restricts the joined frame to that
        testament; any other value (default "both") keeps the whole frame
    :return: None
    """
    os.makedirs("src/csv", exist_ok=True)

    # Reuse the preprocessed frame when it is already on disk, otherwise set
    # it up through preprocess_emotion.main().
    if not exists("src/csv/bibleTA_prepro.csv"):
        # NOTE(review): the output name passed here is "csv/..." while every
        # other path in this function is "src/csv/..." - presumably
        # preprocess_emotion.main() adds the "src/" prefix itself; confirm.
        df_bible = preprocess_emotion.main("both", None,
                                           "csv/bibleTA_prepro.csv")
    else:
        df_bible = pd.read_csv("src/csv/bibleTA_prepro.csv")

    # Same for the character frame.
    if not exists("src/csv/bibleTA_characters.csv"):
        df_raw_bible = dataloader.get_df_bible()
        df_resolved = character_extractor.coreference_resolution(df_raw_bible)
        df_characters = character_extractor.extract_characters(df_resolved)
    else:
        df_characters = pd.read_csv("src/csv/bibleTA_characters.csv")

    # Unite the two dataframes, since the calculation of "bibleTA_prepro.csv"
    # takes a substantial amount of time.
    df_bible = join_df.main(character_csv="src/csv/bibleTA_characters.csv",
                            relation_csv="src/csv/bibleTA_prepro.csv",
                            out_csv="src/csv/bibleTA_emotion.csv",
                            df_bible=df_bible,
                            df_characters=df_characters)

    # If a specific testament is given, reduce the dataframe.
    if testament == "new":
        _, df_bible = dataloader.get_old_new_testament(df_bible)
    elif testament == "old":
        df_bible, _ = dataloader.get_old_new_testament(df_bible)

    # Determine the graph, create pickle objects and run a clustering of
    # keywords.
    eval_graph.main(threshold_getgraph=5,
                    num_cluster=4,
                    threshold_getcluster=(1 / 6),
                    file="src/csv/bibleTA_emotion.csv",
                    load=True,
                    df_bible=df_bible)
from unittest import TestCase
import pandas as pd
from pandas._testing import assert_frame_equal
import spacy
import os
import en_core_web_sm
import src.character_extractor as character_extractor
import src.dataloader as dataloader

# Functions that are not tested here are called by at least one of the
# tested functions.
df_bible = dataloader.get_df_bible()
df_test = df_bible.head(42)


class TestUserFunctions(TestCase):
    """Tests for the user-facing functions of ``src.character_extractor``."""

    def test_coreference_resolution_on_different_parameters(self):
        """Column layout of the result must not depend on the line range."""
        df_resolved = character_extractor.coreference_resolution(df_test)
        df_resolved_start_20 = character_extractor.coreference_resolution(
            df_test, start_line=20)
        # start_line behind end_line
        df_resolved_invalid_1 = character_extractor.coreference_resolution(
            df_test, start_line=20, end_line=10)

        # Ranges beyond the 42 rows of df_test. No assertion is made on
        # these results; the calls only have to finish without raising.
        character_extractor.coreference_resolution(df_test, start_line=50)
        character_extractor.coreference_resolution(df_test, end_line=50)

        # Comparing two pandas Index objects with == is element-wise, so
        # assertEqual cannot evaluate the result as a single boolean.
        # Compare the column labels as plain lists instead.
        expected_columns = list(df_resolved.columns)
        self.assertListEqual(expected_columns,
                             list(df_resolved_start_20.columns))
        self.assertListEqual(expected_columns,
                             list(df_resolved_invalid_1.columns))
def get_text_for_character(character_name, context_size=10):
    """Selects text relevant for character

    This function scans the "text" column of the bible data frame for
    verses in which the character's name appears. For every occurrence it
    builds a context window of ``context_size`` words directly before and
    ``context_size`` words directly after the name. If the verse itself is
    too short, the window is extended with the previous/following verses
    (by row position); if it is too long, it is cut down to the words
    closest to the name. The window is shorter only at the very beginning
    or end of the data frame.

    The name is matched as a literal substring (not as a regular
    expression), and rows whose text is not a string are treated as empty.

    :type character_name: str
    :param character_name: the character for which verses should be
        selected, e. g. "Jesus"
    :type context_size: int
    :param context_size: number of words kept on each side of the name
        (default 10, i. e. 20 context words in total)
    :return: all context windows (each "<before> name <after>") joined by
        single spaces; "" when the name is empty or never occurs
    :rtype: str
    :raises ValueError: if ``context_size`` is negative
    """
    if context_size < 0:
        raise ValueError("context_size must not be negative")
    if not character_name:
        # str.split("") would raise, and an empty name has no context.
        return ""

    bible_df = dataloader.get_df_bible()
    # A plain list gives positional access to neighbouring verses, which
    # works for any data frame index, and tolerates missing text values.
    verses = [
        verse if isinstance(verse, str) else ""
        for verse in bible_df["text"].tolist()
    ]
    verse_count = len(verses)

    character_texts = []
    for position, verse in enumerate(verses):
        if character_name not in verse:
            continue

        # "He was Jesus and Jesus was good" -> ["He was ", " and ", " was good"]
        parts = verse.split(character_name)
        for occurrence in range(1, len(parts)):
            # Everything left/right of this occurrence, other occurrences
            # of the name included, as lists of words.
            words_before = character_name.join(parts[:occurrence]).split()
            words_after = character_name.join(parts[occurrence:]).split()

            # Too short: prepend earlier verses (the first verse included).
            previous = position - 1
            while len(words_before) < context_size and previous >= 0:
                words_before = verses[previous].split() + words_before
                previous -= 1

            # Too short: append later verses.
            following = position + 1
            while len(words_after) < context_size and following < verse_count:
                words_after = words_after + verses[following].split()
                following += 1

            # Too long: keep the words closest to the name, i.e. the LAST
            # words before it and the FIRST words after it.
            first_kept = max(len(words_before) - context_size, 0)
            window = (words_before[first_kept:] + [character_name] +
                      words_after[:context_size])
            character_texts.append(" ".join(window))

    return " ".join(character_texts)