Beispiel #1
0
 def filename(self, filepath):
     filename = self.importer.file_name(cache.get_file(filepath))
     # The current implementation of the archive command does not
     # modify the filename returned by the importer. This preserves
     # backward compatibility with the old implementation of the
     # command.
     return misc_utils.idify(filename) if filename else None
Beispiel #2
0
def filepath(importer, filepath: str) -> str:
    """Compute filing path for a document.

    The path mirrors the structure of the accounts associated to the
    documents and with a file name composed by the document date and
    document name returned by the importer.

    Args:
      importer: The importer instance to handle the document.
      filepath: Filesystem path to the document.

    Returns:
      Filing tree location where to store the document.

    Raises:
      beangulp.exceptions.Error: The canonical file name returned by
      the importer contains a path separator charachter or seems to
      contain a date.

    """
    file = cache.get_file(filepath)

    # Get the account corresponding to the file.
    account = importer.file_account(file)
    filename = importer.file_name(file) or os.path.basename(file.name)
    date = importer.file_date(file) or utils.getmdate(file.name)

    # The returned filename cannot contain the file path separator character.
    if os.sep in filename:
        raise Error("The filename contains path separator character.")

    if re.match(r'\d\d\d\d-\d\d-\d\d', filename):
        raise Error("The filename contains what looks like a date.")

    # Remove whitespace and other funny characters from the filename.
    # TODO(dnicolodi): This should probably be importer responsibility.
    filename = misc_utils.idify(filename)

    # Prepend account directory and date prefix.
    filename = os.path.join(account.replace(':', os.sep),
                            f'{date:%Y-%m-%d}.{filename:}')

    return filename
Beispiel #3
0
def file_one_file(filename, importers, destination, idify=False, logfile=None):
    """Move a single filename using its matched importers.

    Args:
      filename: A string, the name of the downloaded file to be processed.
      importers: A list of importer instances that handle this file.
      destination: A string, the root destination directory where the files are
        to be filed. The files are organized there under a hierarchy mirroring
        that of the chart of accounts.
      idify: A flag, if true, remove whitespace and funky characters in the destination
        filename.
      logfile: A file object to write log entries to, or None, in which case no log is
        written out.
    Returns:
      The full new destination filename on success, and None if there was an error.
    """
    # Create an object to cache all the conversions between the importers
    # and phases and what-not.
    file = cache.get_file(filename)

    # Get the account corresponding to the file.
    file_accounts = []
    for index, importer in enumerate(importers):
        try:
            account_ = importer.file_account(file)
        except Exception as exc:
            account_ = None
            logging.exception("Importer %s.file_account() raised an unexpected error: %s",
                              importer.name(), exc)
        if account_ is not None:
            file_accounts.append(account_)

    file_accounts_set = set(file_accounts)
    if not file_accounts_set:
        logging.error("No account provided by importers: {}".format(
            ", ".join(imp.name() for imp in importers)))
        return None

    if len(file_accounts_set) > 1:
        logging.warning("Ambiguous accounts from many importers: {}".format(
            ', '.join(file_accounts_set)))
        # Note: Don't exit; select the first matching importer's account.

    file_account = file_accounts.pop(0)

    # Given multiple importers, select the first one that was yielded to
    # obtain the date and process the filename.
    importer = importers[0]

    # Compute the date from the last modified time.
    mtime = path.getmtime(filename)
    mtime_date = datetime.datetime.fromtimestamp(mtime).date()

    # Try to get the file's date by calling a module support function. The
    # module may be able to extract the date from the filename, from the
    # contents of the file itself (e.g. scraping some text from the PDF
    # contents, or grabbing the last line of a CSV file).
    try:
        date = importer.file_date(file)
    except Exception as exc:
        logging.exception("Importer %s.file_date() raised an unexpected error: %s",
                          importer.name(), exc)
        date = None
    if date is None:
        # Fallback on the last modified time of the file.
        date = mtime_date
        date_source = 'mtime'
    else:
        date_source = 'contents'

    # Apply filename renaming, if implemented.
    # Otherwise clean up the filename.
    try:
        clean_filename = importer.file_name(file)

        # Warn the importer implementor if a name is returned and it's an
        # absolute filename.
        if clean_filename and (path.isabs(clean_filename) or os.sep in clean_filename):
            logging.error(("The importer '%s' file_name() method should return a relative "
                           "filename; the filename '%s' is absolute or contains path "
                           "separators"),
                          importer.name(), clean_filename)
    except Exception as exc:
        logging.exception("Importer %s.file_name() raised an unexpected error: %s",
                          importer.name(), exc)
        clean_filename = None
    if clean_filename is None:
        # If no filename has been provided, use the basename.
        clean_filename = path.basename(file.name)
    elif re.match(r'\d\d\d\d-\d\d-\d\d', clean_filename):
        logging.error("The importer '%s' file_name() method should not date the "
                      "returned filename. Implement file_date() instead.")

    # We need a simple filename; remove the directory part if there is one.
    clean_basename = path.basename(clean_filename)

    # Remove whitespace if requested.
    if idify:
        clean_basename = misc_utils.idify(clean_basename)

    # Prepend the date prefix.
    new_filename = '{0:%Y-%m-%d}.{1}'.format(date, clean_basename)

    # Prepend destination directory.
    new_fullname = path.normpath(path.join(destination,
                                           file_account.replace(account.sep, os.sep),
                                           new_filename))

    # Print the filename and which modules matched.
    if logfile is not None:
        logfile.write('Importer:    {}\n'.format(importer.name() if importer else '-'))
        logfile.write('Account:     {}\n'.format(file_account))
        logfile.write('Date:        {} (from {})\n'.format(date, date_source))
        logfile.write('Destination: {}\n'.format(new_fullname))
        logfile.write('\n')

    return new_fullname
Beispiel #4
0
 def test_idify(self):
     self.assertEqual('A_great_movie_for_us.mp4',
                      misc_utils.idify(' A great movie (for us) .mp4 '))
     self.assertEqual('A____B.pdf', misc_utils.idify('A____B_._pdf'))
Beispiel #5
0
 def filename(self, filepath):
     return path.basename(misc_utils.idify(filepath))