Example 1
def subject_pronouns_gender_comparison(corp, subject_gender, pickle_filepath_male=None, pickle_filepath_female=None):
    """
    Takes in a Corpus of novels and a gender.
    The gender determines whether the male frequency or female frequency will be returned.

    Returns a dictionary of each novel in the Corpus mapped to the portion of the subject
    pronouns in the book that are of the specified gender. Books containing neither 'he'
    nor 'she' are mapped to 0 rather than raising ZeroDivisionError.

    :param corp: Corpus object
    :param subject_gender: string 'male' or string 'female'
    :param pickle_filepath_male: Location to store results for male results; will not write a file if None
    :param pickle_filepath_female: Location to store results for female results; will not write a file if None
    :return: dictionary
    :raises ValueError: if subject_gender is neither 'male' nor 'female'

    >>> from gender_analysis.corpus import Corpus
    >>> from gender_analysis.common import TEST_DATA_PATH
    >>> filepath = TEST_DATA_PATH / 'test_corpus'
    >>> csvpath = TEST_DATA_PATH / 'test_corpus' / 'test_corpus.csv'
    >>> subject_pronouns_gender_comparison(Corpus(filepath, csv_path=csvpath), 'male')
    {<Document (aanrud_longfrock)>: 0.25724637681159424, <Document (abbott_flatlandromance)>: 0.9051094890510949, <Document (abbott_indiscreetletter)>: 0.5842696629213483, <Document (adams_fighting)>: 0.8206796818510484, <Document (alcott_josboys)>: 0.5742904841402336, <Document (alcott_littlemen)>: 0.6829615567157096, <Document (alcott_littlewomen)>: 0.3974087784241142, <Document (alden_chautauqua)>: 0.2549295774647887, <Document (austen_emma)>: 0.43709109209864117, <Document (austen_persuasion)>: 0.45726495726495725}
    >>> subject_pronouns_gender_comparison(Corpus(filepath, csv_path=csvpath), 'female')
    {<Document (aanrud_longfrock)>: 0.7427536231884058, <Document (abbott_flatlandromance)>: 0.0948905109489051, <Document (abbott_indiscreetletter)>: 0.4157303370786517, <Document (adams_fighting)>: 0.17932031814895155, <Document (alcott_josboys)>: 0.42570951585976624, <Document (alcott_littlemen)>: 0.3170384432842905, <Document (alcott_littlewomen)>: 0.6025912215758857, <Document (alden_chautauqua)>: 0.7450704225352113, <Document (austen_emma)>: 0.5629089079013588, <Document (austen_persuasion)>: 0.5427350427350427}

    """

    if subject_gender not in ('male', 'female'):
        raise ValueError('subject_gender must be \'male\' or \'female\'')

    # Only attempt to load cached results when both paths were supplied.
    # Previously None was passed straight to load_pickle, which raises a
    # TypeError that the IOError handler did not catch.
    if pickle_filepath_male and pickle_filepath_female:
        try:
            relative_freq_male_sub = common.load_pickle(pickle_filepath_male)
            relative_freq_female_sub = common.load_pickle(pickle_filepath_female)
            if subject_gender == 'male':
                return relative_freq_male_sub
            return relative_freq_female_sub
        except IOError:
            pass  # no cached results yet; fall through and compute them

    relative_freq_female_sub = {}
    relative_freq_male_sub = {}

    for book in corp.documents:
        he = book.get_word_freq('he')
        she = book.get_word_freq('she')
        total = he + she

        if total == 0:
            # Neither pronoun occurs; report 0 instead of dividing by zero.
            relative_freq_female_sub[book] = 0
            relative_freq_male_sub[book] = 0
        else:
            relative_freq_female_sub[book] = she / total
            relative_freq_male_sub[book] = he / total

    if pickle_filepath_male and pickle_filepath_female:
        common.store_pickle(relative_freq_female_sub,
                            pickle_filepath_female)
        common.store_pickle(relative_freq_male_sub, pickle_filepath_male)

    if subject_gender == 'male':
        return relative_freq_male_sub
    return relative_freq_female_sub
Example 2
def compare_word_association_in_corpus_dunning(
        word1,
        word2,
        corpus,
        to_pickle=False,
        pickle_filename='dunning_vs_associated_words.pgz'):
    """
    Uses Dunning analysis to compare the words associated with word1 vs those associated
    with word2 in the given corpus.

    Results are loaded from ``pickle_filename`` (or an auto-derived fallback name) when a
    pickle already exists; otherwise they are computed from the corpus.

    :param word1: str, or a list of strings
    :param word2: str, or a list of strings
    :param corpus: Corpus object
    :param to_pickle: boolean; True if you wish to save the results as a Pickle file
    :param pickle_filename: str or Path object; Only used if the pickle already exists or you wish to write a new pickle file
    :return: Dictionary mapping words to dunning scores

    """
    corpus_name = corpus.name if corpus.name else 'corpus'

    try:
        results = load_pickle(pickle_filename)
    except IOError:
        try:
            pickle_filename = f'dunning_{word2}_vs_{word1}_associated_words_{corpus_name}'
            results = load_pickle(pickle_filename)
        # Was a bare ``except:``, which swallowed every error (even
        # KeyboardInterrupt); only a missing pickle should trigger recompute.
        except IOError:
            word1_counter = Counter()
            word2_counter = Counter()

            # Each word argument may be a single word or a list of words;
            # normalize to a list so both cases share one code path.
            words1 = [word1] if isinstance(word1, str) else word1
            words2 = [word2] if isinstance(word2, str) else word2

            for doc in corpus.documents:
                for word in words1:
                    word1_counter.update(doc.words_associated(word))
                for word in words2:
                    word2_counter.update(doc.words_associated(word))

            if to_pickle:
                results = dunning_total(word1_counter,
                                        word2_counter,
                                        pickle_filepath=pickle_filename)
            else:
                results = dunning_total(word1_counter, word2_counter)

    # Display the top results overall and per part-of-speech group.
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results,
                                 number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    return results
Example 3
def store_raw_results(results, pickle_filepath='pronoun_adj_raw_analysis.pgz'):
    """
    Saves the results from run_adj_analysis to a pickle file.

    If a pickle already exists at ``pickle_filepath``, the user is prompted
    before the previous analysis is overwritten.

    :param results: dictionary of results from run_adj_analysis
    :param pickle_filepath: filepath to save the output
    :return: None, saves results as pickled file with name 'pronoun_adj_raw_analysis'

    """
    try:
        common.load_pickle(pickle_filepath)
    except IOError:
        # Nothing stored yet; write without prompting.
        common.store_pickle(results, pickle_filepath)
        return

    # Previous results exist -- only overwrite on explicit confirmation.
    # (store_pickle now sits outside the try, so a failed store is no longer
    # silently swallowed and re-attempted by the except branch.)
    if input("results already stored. overwrite previous analysis? (y/n)") == 'y':
        common.store_pickle(results, pickle_filepath)
Example 4
def store_raw_results(results,
                      pickle_filepath='instance_distance_raw_analysis.pgz'):
    """
    Stores results from an analysis as a pickle file.

    If a pickle already exists at ``pickle_filepath``, the user is prompted
    before the previous analysis is overwritten.

    :param results: A Python object that can be pickled
    :param pickle_filepath: Destination for pickle file
    :return: None
    """
    try:
        common.load_pickle(pickle_filepath)
    except IOError:
        # Nothing stored yet; write without prompting.
        common.store_pickle(results, pickle_filepath)
        return

    # Previous results exist -- only overwrite on explicit confirmation.
    # (store_pickle now sits outside the try, so a failed store is no longer
    # silently swallowed and re-attempted by the except branch.)
    if input("results already stored. overwrite previous analysis? (y/n)") == 'y':
        common.store_pickle(results, pickle_filepath)
Example 5
def dunning_words_by_author_gender(
        corpus,
        display_results=False,
        to_pickle=False,
        pickle_filename='dunning_male_vs_female_authors.pgz'):
    """
    Tests distinctiveness of shared words between male and female authors using dunning analysis.

    If called with display_results=True, prints out the most distinctive terms overall as well as
    grouped by verbs, adjectives etc.
    Returns a dict of all terms in the corpus mapped to the dunning data for each term

    :param corpus: Corpus object
    :param display_results: Boolean; reports a visualization of the results if True
    :param to_pickle: Boolean; Will save the results to a pickle file if True
    :param pickle_filename: Path to pickle object; will try to search for results in this location or write pickle file to path if to_pickle is true.
    :return: dict

    """

    if 'author_gender' not in corpus.metadata_fields:
        raise MissingMetadataError(['author_gender'])

    try:
        # Prefer precomputed results when a pickle is already stored.
        results = load_pickle(pickle_filename)
    except IOError:
        # No stored results -- split the corpus by author gender and
        # run the Dunning analysis on the two word-count distributions.
        male_corpus = corpus.filter_by_gender('male')
        female_corpus = corpus.filter_by_gender('female')
        male_counts = male_corpus.get_wordcount_counter()
        female_counts = female_corpus.get_wordcount_counter()

        dunning_kwargs = {'filename_to_pickle': pickle_filename} if to_pickle else {}
        results = dunning_total(female_counts, male_counts, **dunning_kwargs)

    if display_results:
        # Show the top terms overall, then per part-of-speech group.
        for pos_group in (None, 'verbs', 'adjectives', 'pronouns', 'adverbs'):
            dunning_result_displayer(results,
                                     number_of_terms_to_display=20,
                                     corpus1_display_name='Fem Author',
                                     corpus2_display_name='Male Author',
                                     part_of_speech_to_include=pos_group)
    return results
Example 6
    def _load_documents_and_metadata(self, path_to_files, csv_path):
        """
        Loads documents into the corpus with metadata from a csv file given at initialization.

        Handles three input modes:

        - ``path_to_files`` ends in ``.pgz``: unpickle a previously stored corpus.
        - directory without ``csv_path``: load every ``.txt`` file with minimal metadata.
        - directory with ``csv_path``: load the files listed in the metadata csv.

        :param path_to_files: Path to a pickled corpus (.pgz) or a directory of .txt files
        :param csv_path: Path to a metadata csv file, or None
        :return: tuple of (list of Document objects, list of metadata field names)
        :raises ValueError: if path_to_files is neither a .pgz file nor a directory of .txt files
        :raises FileNotFoundError: if csv_path is given but the file cannot be found
        """

        # load pickle if provided
        if path_to_files.suffix == '.pgz':
            pickle_data = common.load_pickle(path_to_files)
            return pickle_data.documents, pickle_data.metadata_fields

        # load documents without metadata csv
        elif path_to_files.suffix == '' and not csv_path:
            files = os.listdir(path_to_files)
            metadata_fields = ['filename', 'filepath']
            ignored = []
            documents = []
            for filename in files:
                if filename.endswith('.txt'):
                    metadata_dict = {
                        'filename': filename,
                        'filepath': path_to_files / filename
                    }
                    documents.append(Document(metadata_dict))
                elif filename.endswith('.csv'):
                    continue  # let's ignore csv files, they're probably metadata
                else:
                    ignored.append(filename)

            if not documents:  # path led to directory with no .txt files
                # (removed useless f-prefix: the string has no placeholders)
                raise ValueError(
                    'path_to_files must lead to a previously pickled corpus or directory of .txt files'
                )
            elif ignored:
                print(
                    'WARNING: the following files were not loaded because they are not .txt files.\n'
                    + str(ignored) + '\n' +
                    'If you would like to analyze the text in these files, convert these files to '
                    + '.txt and create a new Corpus.')

            return documents, metadata_fields

        # load documents based on the metadata csv
        elif csv_path and path_to_files.suffix == '':
            documents = []
            metadata = set()

            try:
                csv_list = load_csv_to_list(csv_path)
            except FileNotFoundError:
                err = ("Could not find the metadata csv file for the " +
                       f"'{self.name}' corpus in the expected location " +
                       f"({csv_path}).")
                raise FileNotFoundError(err)
            csv_reader = csv.DictReader(csv_list)

            loaded_document_filenames = []
            for document_metadata in csv_reader:
                filename = document_metadata['filename']
                document_metadata['name'] = self.name
                document_metadata['filepath'] = path_to_files / filename
                this_document = Document(document_metadata)
                documents.append(this_document)
                loaded_document_filenames.append(filename)
                # collect every csv column name seen as a metadata field
                metadata.update(list(document_metadata))

            all_txt_files = [
                f for f in os.listdir(path_to_files) if f.endswith('.txt')
            ]
            num_loaded = len(documents)
            num_txt_files = len(all_txt_files)
            if num_loaded != num_txt_files:
                # some txt files aren't in the metadata, so issue a warning
                # we don't need to handle the inverse case, because that
                # will have broken the document init above
                # (message fix: was "are not your metadata csv")
                print(
                    'WARNING: The following .txt files were not loaded because they '
                    + 'are not in your metadata csv:\n' + str(
                        list(
                            set(all_txt_files) -
                            set(loaded_document_filenames))) +
                    '\nYou may want to check that your metadata matches your files '
                    + 'to avoid incorrect results.')

            return sorted(documents), list(metadata)

        else:
            # (removed useless f-prefix: the string has no placeholders)
            raise ValueError(
                'path_to_files must lead to a previously pickled corpus or directory of .txt files'
            )