def update_metadata(self, new_metadata_path):
    """
    Takes a filepath to a csv with new metadata and updates the metadata
    in the corpus' documents accordingly.

    The new file does not need to contain every metadata field in the
    documents - only the fields that you wish to update.

    NOTE: The csv file must include at least a filename for the documents
    that will be altered.

    :param new_metadata_path: Path to new metadata csv file (str or Path)
    :return: None
    :raises ValueError: if new_metadata_path is neither str nor Path, or
        if a filename in the csv does not match any document in the corpus
    :raises FileNotFoundError: if the csv file cannot be found
    """
    # Start from the corpus' current fields so fields absent from the new
    # csv are preserved.
    metadata = set()
    metadata.update(self.metadata_fields)

    if isinstance(new_metadata_path, str):
        new_metadata_path = Path(new_metadata_path)
    if not isinstance(new_metadata_path, Path):
        raise ValueError(
            f'new_metadata_path must be str or Path object, not type {type(new_metadata_path)}'
        )

    try:
        csv_list = load_csv_to_list(new_metadata_path)
    except FileNotFoundError as err:
        # BUG FIX: the original message reported self.csv_path (the
        # corpus' original metadata file) even though the file actually
        # being loaded here is new_metadata_path.
        error_message = (
            "Could not find the metadata csv file for the "
            f"corpus in the expected location ({new_metadata_path})."
        )
        raise FileNotFoundError(error_message) from err

    csv_reader = csv.DictReader(csv_list)
    for document_metadata in csv_reader:
        document_metadata = dict(document_metadata)
        # Record any new metadata fields introduced by this row.
        metadata.update(list(document_metadata))
        try:
            document = self.get_document('filename', document_metadata['filename'])
        except ValueError as err:
            raise ValueError(
                f"Document {document_metadata['filename']} not found in corpus"
            ) from err
        document.update_metadata(document_metadata)

    self.metadata_fields = list(metadata)
def _load_documents_and_metadata(self, path_to_files, csv_path):
    """
    Loads documents into the corpus with metadata from a csv file given
    at initialization.

    Three load modes, chosen from the arguments:
      1. ``path_to_files`` is a ``.pgz`` pickle: restore documents and
         metadata fields from the pickle.
      2. ``path_to_files`` is a directory and no ``csv_path``: load every
         ``.txt`` file with minimal (filename/filepath) metadata.
      3. ``path_to_files`` is a directory and ``csv_path`` is given: load
         exactly the files listed in the metadata csv.

    :param path_to_files: Path to a pickled corpus (.pgz) or a directory
        of .txt files
    :param csv_path: Path to a metadata csv file, or falsy for none
    :return: 2-tuple of (documents, metadata_fields)
    :raises ValueError: if path_to_files is neither a .pgz file nor a
        directory containing .txt files
    :raises FileNotFoundError: if csv_path is given but the file is missing
    """
    # Mode 1: load a previously pickled corpus.
    if path_to_files.suffix == '.pgz':
        pickle_data = common.load_pickle(path_to_files)
        return pickle_data.documents, pickle_data.metadata_fields

    # Mode 2: load documents without a metadata csv.
    elif path_to_files.suffix == '' and not csv_path:
        files = os.listdir(path_to_files)
        metadata_fields = ['filename', 'filepath']
        ignored = []
        documents = []
        for filename in files:
            if filename.endswith('.txt'):
                metadata_dict = {
                    'filename': filename,
                    'filepath': path_to_files / filename
                }
                documents.append(Document(metadata_dict))
            elif filename.endswith('.csv'):
                # Ignore csv files; they are probably metadata.
                continue
            else:
                ignored.append(filename)

        if len(documents) == 0:
            # Path led to a directory with no .txt files.
            # (f-prefix removed from the original message: no placeholders.)
            raise ValueError(
                'path_to_files must lead to a previously pickled corpus or directory of .txt files'
            )
        elif ignored:
            print(
                'WARNING: the following files were not loaded because they are not .txt files.\n'
                + str(ignored) + '\n'
                + 'If you would like to analyze the text in these files, convert these files to '
                + '.txt and create a new Corpus.')

        return documents, metadata_fields

    # Mode 3: load documents based on the metadata csv.
    elif csv_path and path_to_files.suffix == '':
        documents = []
        metadata = set()

        try:
            csv_list = load_csv_to_list(csv_path)
        except FileNotFoundError as err:
            error_message = ("Could not find the metadata csv file for the "
                             + f"'{self.name}' corpus in the expected location "
                             + f"({csv_path}).")
            raise FileNotFoundError(error_message) from err
        csv_reader = csv.DictReader(csv_list)

        loaded_document_filenames = []
        for document_metadata in csv_reader:
            filename = document_metadata['filename']
            document_metadata['name'] = self.name
            document_metadata['filepath'] = path_to_files / filename
            this_document = Document(document_metadata)
            documents.append(this_document)
            loaded_document_filenames.append(filename)
            metadata.update(list(document_metadata))

        all_txt_files = [
            f for f in os.listdir(path_to_files) if f.endswith('.txt')
        ]
        num_loaded = len(documents)
        num_txt_files = len(all_txt_files)
        if num_loaded != num_txt_files:
            # Some txt files aren't in the metadata, so issue a warning.
            # We don't need to handle the inverse case, because that
            # will have broken the document init above.
            # BUG FIX: message was missing the word 'in' ("are not your
            # metadata csv").
            print(
                'WARNING: The following .txt files were not loaded because they '
                + 'are not in your metadata csv:\n'
                + str(
                    list(
                        set(all_txt_files) - set(loaded_document_filenames)))
                + '\nYou may want to check that your metadata matches your files '
                + 'to avoid incorrect results.')

        return sorted(documents), list(metadata)

    else:
        raise ValueError(
            'path_to_files must lead to a previously pickled corpus or directory of .txt files'
        )