Example #1
0
    def update_metadata(self, new_metadata_path):
        """
        Takes a filepath to a csv with new metadata and updates the metadata in the corpus'
        documents accordingly. The new file does not need to contain every metadata field in
        the documents - only the fields that you wish to update.

        NOTE: The csv file must include at least a filename for the documents that will be altered.

        :param new_metadata_path: Path (or str) to new metadata csv file
        :return: None
        :raises ValueError: if new_metadata_path is neither a str nor a Path, if a csv
            row has no 'filename' column, or if a filename is not found in the corpus
        :raises FileNotFoundError: if no file exists at new_metadata_path
        """
        # Collect the union of existing and newly-seen metadata field names.
        metadata = set()
        metadata.update(self.metadata_fields)

        if isinstance(new_metadata_path, str):
            new_metadata_path = Path(new_metadata_path)
        if not isinstance(new_metadata_path, Path):
            raise ValueError(
                f'new_metadata_path must be str or Path object, not type {type(new_metadata_path)}'
            )

        try:
            csv_list = load_csv_to_list(new_metadata_path)
        except FileNotFoundError as err:
            # Report the path that was actually requested, not self.csv_path
            # (the corpus' original metadata file), which may be a different file.
            raise FileNotFoundError(
                'Could not find the metadata csv file for the '
                f'corpus in the expected location ({new_metadata_path}).'
            ) from err
        csv_reader = csv.DictReader(csv_list)

        for document_metadata in csv_reader:
            document_metadata = dict(document_metadata)
            # Rows must carry a filename so we can locate the document to update.
            if 'filename' not in document_metadata:
                raise ValueError(
                    f'Metadata csv file {new_metadata_path} is missing a filename column'
                )
            metadata.update(list(document_metadata))
            try:
                document = self.get_document('filename',
                                             document_metadata['filename'])
            except ValueError as err:
                raise ValueError(
                    f"Document {document_metadata['filename']} not found in corpus"
                ) from err

            document.update_metadata(document_metadata)

        self.metadata_fields = list(metadata)
Example #2
0
    def _load_documents_and_metadata(self, path_to_files, csv_path):
        """
        Loads documents into the corpus with metadata from a csv file given at initialization.

        :param path_to_files: Path to a previously pickled corpus (.pgz) or a
            directory of .txt files
        :param csv_path: Path to a metadata csv file describing the documents,
            or a falsy value if there is no metadata csv
        :return: 2-tuple of (list of Documents, list of metadata field names)
        :raises ValueError: if path_to_files is neither a .pgz pickle nor a
            directory containing .txt files
        :raises FileNotFoundError: if csv_path is given but does not exist
        """

        # load pickle if provided
        if path_to_files.suffix == '.pgz':
            pickle_data = common.load_pickle(path_to_files)
            return pickle_data.documents, pickle_data.metadata_fields

        # load documents without metadata csv
        elif path_to_files.suffix == '' and not csv_path:
            files = os.listdir(path_to_files)
            metadata_fields = ['filename', 'filepath']
            ignored = []
            documents = []
            for filename in files:
                if filename.endswith('.txt'):
                    metadata_dict = {
                        'filename': filename,
                        'filepath': path_to_files / filename
                    }
                    documents.append(Document(metadata_dict))
                elif filename.endswith('.csv'):
                    continue  # let's ignore csv files, they're probably metadata
                else:
                    ignored.append(filename)

            if len(documents) == 0:  # path led to directory with no .txt files
                raise ValueError(
                    'path_to_files must lead to a previously pickled corpus or directory of .txt files'
                )
            elif ignored:
                print(
                    'WARNING: the following files were not loaded because they are not .txt files.\n'
                    + str(ignored) + '\n' +
                    'If you would like to analyze the text in these files, convert these files to '
                    + '.txt and create a new Corpus.')

            return documents, metadata_fields

        # load documents based on the metadata csv
        elif csv_path and path_to_files.suffix == '':
            documents = []
            metadata = set()

            try:
                csv_list = load_csv_to_list(csv_path)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    'Could not find the metadata csv file for the '
                    f"'{self.name}' corpus in the expected location "
                    f'({csv_path}).'
                ) from err
            csv_reader = csv.DictReader(csv_list)

            loaded_document_filenames = []
            for document_metadata in csv_reader:
                filename = document_metadata['filename']
                document_metadata['name'] = self.name
                document_metadata['filepath'] = path_to_files / filename
                this_document = Document(document_metadata)
                documents.append(this_document)
                loaded_document_filenames.append(filename)
                metadata.update(list(document_metadata))

            # Compare the actual sets of filenames rather than their counts:
            # equal counts could still hide mismatched sets. The inverse case
            # (csv rows without a matching file) is not handled here because
            # it would already have broken the Document init above.
            all_txt_files = [
                f for f in os.listdir(path_to_files) if f.endswith('.txt')
            ]
            unloaded_txt_files = set(all_txt_files) - set(
                loaded_document_filenames)
            if unloaded_txt_files:
                print(
                    'WARNING: The following .txt files were not loaded because they '
                    + 'are not in your metadata csv:\n'
                    + str(list(unloaded_txt_files)) +
                    '\nYou may want to check that your metadata matches your files '
                    + 'to avoid incorrect results.')

            return sorted(documents), list(metadata)

        else:
            raise ValueError(
                'path_to_files must lead to a previously pickled corpus or directory of .txt files'
            )