def subject_pronouns_gender_comparison(corp, subject_gender, pickle_filepath_male=None,
                                       pickle_filepath_female=None):
    """
    Takes in a Corpus of novels and a gender. The gender determines whether the male or the
    female frequency will be returned.

    Returns a dictionary mapping each novel in the Corpus to the portion of the subject
    pronouns in that book that are of the specified gender.

    :param corp: Corpus object
    :param subject_gender: string 'male' or string 'female'
    :param pickle_filepath_male: Location to store the male results; will not write a file
        if None
    :param pickle_filepath_female: Location to store the female results; will not write a file
        if None
    :return: dictionary

    >>> from gender_analysis.corpus import Corpus
    >>> from gender_analysis.common import TEST_DATA_PATH
    >>> filepath = TEST_DATA_PATH / 'test_corpus'
    >>> csvpath = TEST_DATA_PATH / 'test_corpus' / 'test_corpus.csv'
    >>> subject_pronouns_gender_comparison(Corpus(filepath, csv_path=csvpath), 'male')
    {<Document (aanrud_longfrock)>: 0.25724637681159424, <Document (abbott_flatlandromance)>: 0.9051094890510949, <Document (abbott_indiscreetletter)>: 0.5842696629213483, <Document (adams_fighting)>: 0.8206796818510484, <Document (alcott_josboys)>: 0.5742904841402336, <Document (alcott_littlemen)>: 0.6829615567157096, <Document (alcott_littlewomen)>: 0.3974087784241142, <Document (alden_chautauqua)>: 0.2549295774647887, <Document (austen_emma)>: 0.43709109209864117, <Document (austen_persuasion)>: 0.45726495726495725}
    >>> subject_pronouns_gender_comparison(Corpus(filepath, csv_path=csvpath), 'female')
    {<Document (aanrud_longfrock)>: 0.7427536231884058, <Document (abbott_flatlandromance)>: 0.0948905109489051, <Document (abbott_indiscreetletter)>: 0.4157303370786517, <Document (adams_fighting)>: 0.17932031814895155, <Document (alcott_josboys)>: 0.42570951585976624, <Document (alcott_littlemen)>: 0.3170384432842905, <Document (alcott_littlewomen)>: 0.6025912215758857, <Document (alden_chautauqua)>: 0.7450704225352113, <Document (austen_emma)>: 0.5629089079013588, <Document (austen_persuasion)>: 0.5427350427350427}

    """
    if subject_gender not in ('male', 'female'):
        raise ValueError("subject_gender must be 'male' or 'female'")

    # Return precomputed results if pickles are available.
    try:
        relative_freq_male_subject = common.load_pickle(pickle_filepath_male)
        relative_freq_female_subject = common.load_pickle(pickle_filepath_female)
        if subject_gender == 'male':
            return relative_freq_male_subject
        else:
            return relative_freq_female_subject
    except IOError:
        pass

    relative_freq_female_sub = {}
    relative_freq_male_sub = {}

    for book in corp.documents:
        he = book.get_word_freq('he')
        she = book.get_word_freq('she')

        relative_freq_female_sub[book] = she / (he + she)
        relative_freq_male_sub[book] = he / (he + she)

    if pickle_filepath_male and pickle_filepath_female:
        common.store_pickle(relative_freq_female_sub, pickle_filepath_female)
        common.store_pickle(relative_freq_male_sub, pickle_filepath_male)

    if subject_gender == 'male':
        return relative_freq_male_sub
    else:
        return relative_freq_female_sub
def compare_word_association_in_corpus_dunning(word1, word2, corpus, to_pickle=False,
                                               pickle_filename='dunning_vs_associated_words.pgz'):
    """
    Uses Dunning analysis to compare the words associated with word1 vs. those associated with
    word2 in the given corpus.

    :param word1: str or list of str
    :param word2: str or list of str
    :param corpus: Corpus object
    :param to_pickle: boolean; True if you wish to save the results as a pickle file
    :param pickle_filename: str or Path object; only used if the pickle already exists or you
        wish to write a new pickle file
    :return: Dictionary mapping words to Dunning scores
    """
    corpus_name = corpus.name if corpus.name else 'corpus'

    try:
        results = load_pickle(pickle_filename)
    except IOError:
        try:
            pickle_filename = f'dunning_{word2}_vs_{word1}_associated_words_{corpus_name}'
            results = load_pickle(pickle_filename)
        except IOError:
            word1_counter = Counter()
            word2_counter = Counter()

            for doc in corpus.documents:
                if isinstance(word1, str):
                    word1_counter.update(doc.words_associated(word1))
                else:  # word1 is a list of strings
                    for word in word1:
                        word1_counter.update(doc.words_associated(word))

                if isinstance(word2, str):
                    word2_counter.update(doc.words_associated(word2))
                else:  # word2 is a list of strings
                    for word in word2:
                        word2_counter.update(doc.words_associated(word))

            if to_pickle:
                results = dunning_total(word1_counter, word2_counter,
                                        pickle_filepath=pickle_filename)
            else:
                results = dunning_total(word1_counter, word2_counter)

    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results, number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    return results
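# Usage sketch (not part of the original module; the helper name below is hypothetical). It
# builds the bundled test corpus, as in the doctest for subject_pronouns_gender_comparison
# above, and compares the vocabulary that co-occurs with 'he' against the vocabulary that
# co-occurs with 'she'.
def _example_compare_he_she_associations():
    from gender_analysis.corpus import Corpus
    from gender_analysis.common import TEST_DATA_PATH

    filepath = TEST_DATA_PATH / 'test_corpus'
    corpus = Corpus(filepath, csv_path=filepath / 'test_corpus.csv')

    # Returns a dict mapping each shared associated word to its Dunning score; the displayer
    # called inside the function also prints the most distinctive terms per part of speech.
    return compare_word_association_in_corpus_dunning('he', 'she', corpus)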
def store_raw_results(results, pickle_filepath='pronoun_adj_raw_analysis.pgz'):
    """
    Saves the results from run_adj_analysis to a pickle file.

    :param results: dictionary of results from run_adj_analysis
    :param pickle_filepath: filepath to save the output
    :return: None, saves results as a pickled file with the name 'pronoun_adj_raw_analysis'
    """
    try:
        common.load_pickle(pickle_filepath)
        x = input("results already stored. overwrite previous analysis? (y/n)")
        if x == 'y':
            common.store_pickle(results, pickle_filepath)
    except IOError:
        common.store_pickle(results, pickle_filepath)
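# Usage sketch (not part of the original module; the helper name is hypothetical and
# `results` is assumed to be the dictionary produced by run_adj_analysis, as described in the
# docstring above).
def _example_store_adj_results(results):
    # Writes the pickle on first use; on later runs it asks before overwriting the
    # previously stored analysis.
    store_raw_results(results, pickle_filepath='pronoun_adj_raw_analysis.pgz')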
def store_raw_results(results, pickle_filepath='instance_distance_raw_analysis.pgz'):
    """
    Stores results from an analysis as a pickle file.

    :param results: A Python object that can be pickled
    :param pickle_filepath: Destination for the pickle file
    :return: None
    """
    try:
        common.load_pickle(pickle_filepath)
        x = input("results already stored. overwrite previous analysis? (y/n)")
        if x == 'y':
            common.store_pickle(results, pickle_filepath)
    except IOError:
        common.store_pickle(results, pickle_filepath)
def dunning_words_by_author_gender(corpus, display_results=False, to_pickle=False,
                                   pickle_filename='dunning_male_vs_female_authors.pgz'):
    """
    Tests distinctiveness of shared words between male and female authors using Dunning
    analysis.

    If called with display_results=True, prints out the most distinctive terms overall as well
    as grouped by verbs, adjectives, etc. Returns a dict of all terms in the corpus mapped to
    the Dunning data for each term.

    :param corpus: Corpus object
    :param display_results: Boolean; reports a visualization of the results if True
    :param to_pickle: Boolean; saves the results to a pickle file if True
    :param pickle_filename: Path to pickle object; will try to search for results in this
        location or write a pickle file to this path if to_pickle is True.
    :return: dict
    """
    if 'author_gender' not in corpus.metadata_fields:
        raise MissingMetadataError(['author_gender'])

    # By default, try to load precomputed results. Only calculate if no stored results are
    # available.
    try:
        results = load_pickle(pickle_filename)
    except IOError:
        m_corpus = corpus.filter_by_gender('male')
        f_corpus = corpus.filter_by_gender('female')
        wordcounter_male = m_corpus.get_wordcount_counter()
        wordcounter_female = f_corpus.get_wordcount_counter()

        if to_pickle:
            results = dunning_total(wordcounter_female, wordcounter_male,
                                    filename_to_pickle=pickle_filename)
        else:
            results = dunning_total(wordcounter_female, wordcounter_male)

    if display_results:
        for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
            dunning_result_displayer(results, number_of_terms_to_display=20,
                                     corpus1_display_name='Fem Author',
                                     corpus2_display_name='Male Author',
                                     part_of_speech_to_include=group)

    return results
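# Usage sketch (not part of the original module; the helper name is hypothetical). It assumes
# a corpus whose metadata csv contains an 'author_gender' column; without it the function
# raises MissingMetadataError. The paths below reuse the bundled test data and are only
# illustrative.
def _example_dunning_by_author_gender():
    from gender_analysis.corpus import Corpus
    from gender_analysis.common import TEST_DATA_PATH

    filepath = TEST_DATA_PATH / 'test_corpus'
    corpus = Corpus(filepath, csv_path=filepath / 'test_corpus.csv')

    # display_results=True prints the most distinctive terms overall and per part of speech;
    # the returned dict maps every shared term to its Dunning data.
    return dunning_words_by_author_gender(corpus, display_results=True)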
def _load_documents_and_metadata(self, path_to_files, csv_path):
    """
    Loads documents into the corpus with metadata from a csv file given at initialization.
    """
    # load pickle if provided
    if path_to_files.suffix == '.pgz':
        pickle_data = common.load_pickle(path_to_files)
        return pickle_data.documents, pickle_data.metadata_fields

    # load documents without metadata csv
    elif path_to_files.suffix == '' and not csv_path:
        files = os.listdir(path_to_files)
        metadata_fields = ['filename', 'filepath']
        ignored = []
        documents = []
        for filename in files:
            if filename.endswith('.txt'):
                metadata_dict = {
                    'filename': filename,
                    'filepath': path_to_files / filename
                }
                documents.append(Document(metadata_dict))
            elif filename.endswith('.csv'):
                continue  # let's ignore csv files, they're probably metadata
            else:
                ignored.append(filename)

        if len(documents) == 0:  # path led to directory with no .txt files
            raise ValueError(
                'path_to_files must lead to a previously pickled corpus '
                'or directory of .txt files'
            )
        elif ignored:
            print(
                'WARNING: the following files were not loaded because they are not .txt files.\n'
                + str(ignored) + '\n'
                + 'If you would like to analyze the text in these files, convert these files to '
                + '.txt and create a new Corpus.'
            )

        return documents, metadata_fields

    # load documents based on the metadata csv
    elif csv_path and path_to_files.suffix == '':
        documents = []
        metadata = set()

        try:
            csv_list = load_csv_to_list(csv_path)
        except FileNotFoundError:
            err = (
                "Could not find the metadata csv file for the "
                + f"'{self.name}' corpus in the expected location "
                + f"({csv_path})."
            )
            raise FileNotFoundError(err)
        csv_reader = csv.DictReader(csv_list)

        loaded_document_filenames = []
        for document_metadata in csv_reader:
            filename = document_metadata['filename']
            document_metadata['name'] = self.name
            document_metadata['filepath'] = path_to_files / filename
            this_document = Document(document_metadata)
            documents.append(this_document)
            loaded_document_filenames.append(filename)
            metadata.update(list(document_metadata))

        all_txt_files = [f for f in os.listdir(path_to_files) if f.endswith('.txt')]
        num_loaded = len(documents)
        num_txt_files = len(all_txt_files)
        if num_loaded != num_txt_files:
            # some txt files aren't in the metadata, so issue a warning
            # we don't need to handle the inverse case, because that
            # will have broken the document init above
            print(
                'WARNING: The following .txt files were not loaded because they '
                + 'are not in your metadata csv:\n'
                + str(list(set(all_txt_files) - set(loaded_document_filenames)))
                + '\nYou may want to check that your metadata matches your files '
                + 'to avoid incorrect results.'
            )

        return sorted(documents), list(metadata)

    else:
        raise ValueError(
            'path_to_files must lead to a previously pickled corpus or directory of .txt files'
        )
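# Usage sketch (not part of the original class; the helper name and paths are placeholders).
# This private method runs as part of Corpus construction, so the supported inputs (a
# previously pickled corpus, a directory of .txt files with a metadata csv, or a bare
# directory of .txt files) are exercised simply by building a Corpus.
def _example_corpus_loading():
    from pathlib import Path
    from gender_analysis.corpus import Corpus

    # Directory of .txt files described by a metadata csv.
    with_metadata = Corpus(Path('my_texts'), csv_path=Path('my_texts/metadata.csv'))

    # Bare directory of .txt files; only 'filename' and 'filepath' metadata are available.
    without_metadata = Corpus(Path('my_texts'))

    return with_metadata, without_metadata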