Example #1
from collections import Counter

# load_pickle, dunning_total, and dunning_result_displayer are project-local
# helpers assumed to be importable from the surrounding codebase.


def compare_word_association_in_corpus_dunning(
        word1,
        word2,
        corpus,
        to_pickle=False,
        pickle_filename='dunning_vs_associated_words.pgz'):
    """
    Uses Dunning analysis to compare the words associated with word1
    vs those associated with word2 in the given corpus.

    :param word1: str
    :param word2: str
    :param corpus: Corpus object
    :param to_pickle: boolean; True if you wish to save the results as a Pickle file
    :param pickle_filename: str or Path object;
                            Only used if the pickle already exists
                            or you wish to write a new pickle file
    :return: Dictionary mapping words to dunning scores

    """
    corpus_name = corpus.name if corpus.name else 'corpus'

    try:
        # first try the pickle filename supplied by the caller
        results = load_pickle(pickle_filename)
    except IOError:
        try:
            # fall back to a filename derived from the query words and corpus
            pickle_filename = f'dunning_{word2}_vs_{word1}_associated_words_{corpus_name}'
            results = load_pickle(pickle_filename)
        except IOError:
            # no cached results; count the words associated with each target
            word1_counter = Counter()
            word2_counter = Counter()
            for doc in corpus.documents:
                if isinstance(word1, str):
                    word1_counter.update(doc.words_associated(word1))
                else:  # word1 is a list of strings
                    for word in word1:
                        word1_counter.update(doc.words_associated(word))

                if isinstance(word2, str):
                    word2_counter.update(doc.words_associated(word2))
                else:  # word2 is a list of strings
                    for word in word2:
                        word2_counter.update(doc.words_associated(word))

            if to_pickle:
                results = dunning_total(word1_counter,
                                        word2_counter,
                                        pickle_filepath=pickle_filename)
            else:
                results = dunning_total(word1_counter, word2_counter)

    # display the top terms overall and for several parts of speech
    for group in [None, 'verbs', 'adjectives', 'pronouns', 'adverbs']:
        dunning_result_displayer(results,
                                 number_of_terms_to_display=50,
                                 part_of_speech_to_include=group)

    return results
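
A minimal usage sketch; the Corpus import path and constructor arguments below are assumptions, not part of the snippet above:

from corpus_module import Corpus  # hypothetical import path

# build a corpus from a directory of .txt files (assumed constructor),
# then compare the words associated with 'he' vs. those with 'she'
corpus = Corpus('path/to/texts')
results = compare_word_association_in_corpus_dunning('he', 'she', corpus)
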
Example #2

def store_raw_results(results, pickle_filepath='pronoun_adj_raw_analysis.pgz'):
    """
    Saves the results from run_adj_analysis to a pickle file.

    :param results: dictionary of results from run_adj_analysis
    :param pickle_filepath: filepath to save the output
    :return: None, saves results as pickled file with name 'pronoun_adj_raw_analysis'

    """
    try:
        # if a pickle already exists at this path, ask before overwriting it
        load_pickle(pickle_filepath)
        user_inp = input(
            'Results already stored. Overwrite previous analysis? (y/n) ')
        if user_inp == 'y':
            store_pickle(results, pickle_filepath)
    except IOError:
        store_pickle(results, pickle_filepath)
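
A quick usage sketch; run_adj_analysis is referenced in the docstring but not shown here, so the call below is an assumption:

results = run_adj_analysis(corpus)  # assumed to return the results dictionary
store_raw_results(results, 'pronoun_adj_raw_analysis.pgz')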
Example #3
    def _load_documents_and_metadata(self,
                                     path_to_files,
                                     csv_path,
                                     ignore_warnings=False):
        """
        Loads documents into the corpus with metadata from a csv file given at initialization.
        """
        # pylint: disable=too-many-locals

        # load pickle if provided
        if path_to_files.suffix == '.pgz':
            pickle_data = common.load_pickle(path_to_files)
            return pickle_data.documents, pickle_data.metadata_fields

        # load documents without metadata csv
        elif path_to_files.suffix == '' and not csv_path:
            files = os.listdir(path_to_files)
            metadata_fields = ['filename', 'filepath']
            ignored = []
            documents = []
            for filename in files:
                if filename.endswith('.txt'):
                    metadata_dict = {
                        'filename': filename,
                        'filepath': path_to_files / filename
                    }
                    documents.append(Document(metadata_dict))
                elif filename.endswith('.csv'):
                    continue  # let's ignore csv files, they're probably metadata
                else:
                    ignored.append(filename)

            if len(documents) == 0:  # path led to directory with no .txt files
                raise ValueError(
                    'path_to_files must lead to a previously pickled corpus '
                    'or directory of .txt files')

            if ignored:
                print(
                    'WARNING: the following files were not loaded because '
                    'they are not .txt files.\n'
                    + str(ignored) + '\n'
                    + 'If you would like to analyze the text in these files, '
                    'convert them to .txt and create a new Corpus.')

            return documents, metadata_fields

        # load documents based on the metadata csv
        elif csv_path and path_to_files.suffix == '':
            documents = []
            metadata = set()

            try:
                csv_list = load_csv_to_list(csv_path)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    'Could not find the metadata csv file for the ' +
                    f"'{self.name}' corpus in the expected location " +
                    f'({csv_path}).') from err
            csv_reader = csv.DictReader(csv_list)

            loaded_document_filenames = []
            for document_metadata in csv_reader:
                filename = document_metadata['filename']
                document_metadata['name'] = self.name
                document_metadata['filepath'] = path_to_files / filename
                this_document = Document(document_metadata)
                documents.append(this_document)
                loaded_document_filenames.append(filename)
                metadata.update(list(document_metadata))  # collect the metadata field names (dict keys)

            all_txt_files = [
                f for f in os.listdir(path_to_files) if f.endswith('.txt')
            ]
            num_loaded = len(documents)
            num_txt_files = len(all_txt_files)
            if not ignore_warnings and num_loaded != num_txt_files:
                # some txt files aren't in the metadata csv, so issue a
                # warning; the inverse case needs no handling because a
                # metadata row pointing at a missing file would have broken
                # the Document initialization above
                print(
                    'WARNING: The following .txt files were not loaded because '
                    'they are not in your metadata csv:\n'
                    + str(list(set(all_txt_files) - set(loaded_document_filenames)))
                    + '\nYou may want to check that your metadata matches your '
                    'files to avoid incorrect results.')

            return sorted(documents), list(metadata)

        else:
            raise ValueError(
                'path_to_files must lead to a previously pickled corpus or directory of .txt files'
            )
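
The method dispatches on path_to_files.suffix: a '.pgz' path loads a previously pickled corpus, a directory with no csv_path is scanned for .txt files, and a directory plus a csv_path loads the documents listed in the metadata csv. A hedged sketch of calling each branch; the Corpus instance named corpus and the file layout here are assumptions:

from pathlib import Path

# 1. previously pickled corpus
documents, fields = corpus._load_documents_and_metadata(
    Path('my_corpus.pgz'), csv_path=None)

# 2. directory of .txt files with no metadata csv
documents, fields = corpus._load_documents_and_metadata(
    Path('texts'), csv_path=None)

# 3. directory of .txt files described by a metadata csv
documents, fields = corpus._load_documents_and_metadata(
    Path('texts'), Path('texts/metadata.csv'))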