def filename(self, filepath):
    """Return the sanitized file name the wrapped importer proposes.

    Args:
      filepath: Filesystem path to the document.

    Returns:
      The importer-provided name passed through misc_utils.idify(), or
      None when the importer returns nothing (or an empty name).
    """
    document = cache.get_file(filepath)
    proposed = self.importer.file_name(document)
    if not proposed:
        return None
    # The original note on this method states that the name is handled
    # this way to preserve backward compatibility with the old
    # implementation of the archive command.
    # NOTE(review): the archive command is not visible here - confirm.
    return misc_utils.idify(proposed)
def filepath(importer, filepath: str) -> str:
    """Build the filing-tree location for a document.

    The result is a relative path whose directories follow the components
    of the account the importer associates with the document, and whose
    file name is the document date followed by the document name.

    Args:
      importer: The importer instance to handle the document.
      filepath: Filesystem path to the document.

    Returns:
      Filing tree location where to store the document.

    Raises:
      beangulp.exceptions.Error: The file name (as returned by the
        importer, or the basename used as fallback) contains a path
        separator character or starts with what looks like a date.
    """
    document = cache.get_file(filepath)

    # Ask the importer for the account, the name and the date, in this
    # order. A missing name falls back to the basename of the document
    # and a missing date to utils.getmdate() of the document.
    target_account = importer.file_account(document)
    name = importer.file_name(document)
    if not name:
        name = os.path.basename(document.name)
    when = importer.file_date(document)
    if not when:
        when = utils.getmdate(document.name)

    # The name must be a bare file name: no path separators allowed.
    if os.sep in name:
        raise Error("The filename contains path separator character.")
    # The date prefix is added below, so reject names that already
    # begin with a YYYY-MM-DD looking string.
    if re.match(r'\d\d\d\d-\d\d-\d\d', name):
        raise Error("The filename contains what looks like a date.")

    # Remove whitespace and other funny characters from the filename.
    # TODO(dnicolodi): This should probably be importer responsibility.
    name = misc_utils.idify(name)

    # One directory per account component, then "<date>.<name>".
    directory = target_account.replace(':', os.sep)
    return os.path.join(directory, '{:%Y-%m-%d}.{}'.format(when, name))
def file_one_file(filename, importers, destination, idify=False, logfile=None):
    """Compute the filing destination of a single file using its matched importers.

    The destination has the shape
    <destination>/<account as directories>/<YYYY-MM-DD>.<name>. This
    function only computes and logs that location; it does not move or
    copy the file itself.
    NOTE(review): the actual move is presumably done by the caller - confirm.

    Args:
      filename: A string, the name of the downloaded file to be processed.
      importers: A list of importer instances that handle this file.
      destination: A string, the root destination directory where the files are
        to be filed. The files are organized there under a hierarchy mirroring
        that of the chart of accounts.
      idify: A flag, if true, remove whitespace and funky characters in the
        destination filename.
      logfile: A file object to write log entries to, or None, in which case no
        log is written out.
    Returns:
      The full new destination filename on success, and None if none of the
      importers provided an account for the file.
    """
    # Create an object to cache all the conversions between the importers
    # and phases and what-not.
    file = cache.get_file(filename)

    # Collect the account proposed by each importer. A failing importer is
    # logged and skipped so that the remaining ones still get a chance.
    file_accounts = []
    for importer in importers:
        try:
            account_ = importer.file_account(file)
        except Exception as exc:
            account_ = None
            logging.exception("Importer %s.file_account() raised an unexpected error: %s",
                              importer.name(), exc)
        if account_ is not None:
            file_accounts.append(account_)

    file_accounts_set = set(file_accounts)
    if not file_accounts_set:
        logging.error("No account provided by importers: %s",
                      ", ".join(imp.name() for imp in importers))
        return None

    if len(file_accounts_set) > 1:
        # Sorted so that the log message is stable from run to run.
        logging.warning("Ambiguous accounts from many importers: %s",
                        ', '.join(sorted(file_accounts_set)))
        # Note: Don't exit; select the first matching importer's account.

    file_account = file_accounts[0]

    # Given multiple importers, select the first one that was yielded to
    # obtain the date and process the filename. Note that this is always the
    # first importer in the list, even when the account above came from a
    # later one.
    importer = importers[0]

    # Compute the date from the last modified time.
    mtime = path.getmtime(filename)
    mtime_date = datetime.datetime.fromtimestamp(mtime).date()

    # Try to get the file's date by calling a module support function. The
    # module may be able to extract the date from the filename, from the
    # contents of the file itself (e.g. scraping some text from the PDF
    # contents, or grabbing the last line of a CSV file).
    try:
        date = importer.file_date(file)
    except Exception as exc:
        logging.exception("Importer %s.file_date() raised an unexpected error: %s",
                          importer.name(), exc)
        date = None
    if date is None:
        # Fallback on the last modified time of the file.
        date = mtime_date
        date_source = 'mtime'
    else:
        date_source = 'contents'

    # Apply filename renaming, if implemented.
    # Otherwise clean up the filename.
    try:
        clean_filename = importer.file_name(file)

        # Warn the importer implementor if a name is returned and it's an
        # absolute filename. Only the basename is used further below.
        if clean_filename and (path.isabs(clean_filename) or os.sep in clean_filename):
            logging.error(("The importer '%s' file_name() method should return a relative "
                           "filename; the filename '%s' is absolute or contains path "
                           "separators"),
                          importer.name(), clean_filename)
    except Exception as exc:
        logging.exception("Importer %s.file_name() raised an unexpected error: %s",
                          importer.name(), exc)
        clean_filename = None
    if clean_filename is None:
        # If no filename has been provided, use the basename.
        clean_filename = path.basename(file.name)
    elif re.match(r'\d\d\d\d-\d\d-\d\d', clean_filename):
        # The importer name must be passed as an argument, otherwise the
        # '%s' placeholder is emitted verbatim in the log.
        logging.error("The importer '%s' file_name() method should not date the "
                      "returned filename. Implement file_date() instead.",
                      importer.name())

    # We need a simple filename; remove the directory part if there is one.
    clean_basename = path.basename(clean_filename)

    # Remove whitespace if requested.
    if idify:
        clean_basename = misc_utils.idify(clean_basename)

    # Prepend the date prefix.
    new_filename = '{0:%Y-%m-%d}.{1}'.format(date, clean_basename)

    # Prepend destination directory, with one sub-directory per account
    # component.
    new_fullname = path.normpath(path.join(destination,
                                           file_account.replace(account.sep, os.sep),
                                           new_filename))

    # Print the filename and which modules matched.
    if logfile is not None:
        logfile.write('Importer:    {}\n'.format(importer.name() if importer else '-'))
        logfile.write('Account:     {}\n'.format(file_account))
        logfile.write('Date:        {} (from {})\n'.format(date, date_source))
        logfile.write('Destination: {}\n'.format(new_fullname))
        logfile.write('\n')

    return new_fullname
def test_idify(self):
    # Pairs of (raw input, expected idified output). The first case
    # covers surrounding spaces and parentheses, the second underscores
    # around the extension dot.
    cases = (
        (' A great movie (for us) .mp4 ', 'A_great_movie_for_us.mp4'),
        ('A____B_._pdf', 'A____B.pdf'),
    )
    for raw, expected in cases:
        self.assertEqual(expected, misc_utils.idify(raw))
def filename(self, filepath):
    """Return the basename of the idified document path.

    Args:
      filepath: Filesystem path to the document.

    Returns:
      The last path component of misc_utils.idify(filepath).
    """
    # The whole path is sanitized first; the directory part is dropped
    # afterwards.
    sanitized = misc_utils.idify(filepath)
    return path.basename(sanitized)