Example #1
0
    def __init__(self, source_file, test_parse=False, source_directory=None, working_directory=None, testing=False):
        """
            Set up the extractor for source_file and parse it straight away.

            source_file       -- file name, relative to the source directory
            test_parse        -- passed through to parse_pdf
            source_directory  -- overrides the configured source directory
            working_directory -- overrides the configured working directory
            testing           -- when True the Config sanity asserts are skipped

            source directory and working directory are generally for test
            purposes.  Whenever one of them is left as None it is still read
            from Config.config, even if testing is True.

            Raises ValueError when the detected file type is not a PDF.
        """

        #   Outside of tests the global configuration must already be loaded
        if not testing:
            assert Config.logger
            assert Config.config

        dtpo_log("debug", "TextExtractor -> %s", source_file)

        #   Fall back to the configured directories for anything not supplied
        if source_directory is None:
            source_directory = Config.config.get_source_directory()
        if working_directory is None:
            working_directory = Config.config.get_working_directory()

        self.source_file = "/".join((source_directory, source_file))
        self.text_file = "/".join((working_directory, source_file)) + ".txt"
        self.file_array = []
        self.status = False

        detected_type, detected_mime = get_file_type(self.source_file)
        self.file_type = detected_type
        self.mime_type = detected_mime

        #   Only PDFs can be handled - bail out early for anything else
        if str(self.file_type) != "k.PDF_Document":
            error_message = "TextExtractor - Invalid File Type for {0}".format(self.source_file)
            dtpo_log("error", error_message)
            raise ValueError(error_message)

        self.parse_pdf(test_parse)
Example #2
0
def get_file_type(source_file):
    """
        Report the type of the source file as a (file type, mime type) pair.

        Currently a stub: the file is not inspected and every file is
        reported as a PDF document.
        TODO Implement other types
    """
    dtpo_log("info", "get_file_type for %s - needs fully implementing", source_file)

    file_type = k.PDF_Document
    mime_type = "application/pdf"
    return file_type, mime_type
Example #3
0
    def parse_pdf(self, test_parse=False):
        """
            Convert the PDF at self.source_file to text and load the text
            into self.file_array, one entry per line.

            The text is first written to self.text_file and then read back.
            The final line of that file is dropped before storing.  When
            test_parse is False the intermediate text file is deleted
            afterwards; when True it is left on disk.

            Nothing is returned - the result is left in self.file_array.

            Raises DTPOFileError if either file cannot be opened or if the
            conversion fails.  In that case the text file is not removed.
        """

        dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

        # input options
        pagenos = set()
        maxpages = 0
        # output option
        codec = "utf-8"
        caching = True
        laparams = LAParams()
        laparams.char_margin = 8.0
        laparams.word_margin = 2.0

        rsrcmgr = PDFResourceManager(caching=caching)

        try:
            outfp = open(self.text_file, "w")
        except IOError as io_error:
            raise DTPOFileError(self.text_file, 0, str(io_error))

        #   The with blocks make sure both files are closed on every path,
        #   including when the source cannot be opened or the parse fails
        with outfp:
            try:
                fp = open(self.source_file, "rb")
            except IOError as io_error:
                raise DTPOFileError(self.source_file, 0, str(io_error))

            with fp:
                device = None
                try:
                    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, caching=caching, check_extractable=True)

                except PDFException as pdf_error:
                    message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
                    raise DTPOFileError(self.source_file, 0, message)
                except Exception as exception:
                    message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
                    raise DTPOFileError(self.source_file, 0, message)
                finally:
                    #   device is still None if TextConverter itself failed
                    if device is not None:
                        device.close()

        #   Got the PDF converted = now get it into an array
        with open(self.text_file) as text_fp:
            self.file_array = list(text_fp)

        #   Remove the last entry - it's always '\x0c'
        if self.file_array:
            del self.file_array[-1]

        #   Remove the outfile
        if not test_parse:
            os.remove(self.text_file)
Example #4
0
    def parse_pattern_file(self, config_file) :
        """
        Parse the pattern file line by line, extracting the relevant details.

        Every line is handed to parse_line; any line that yields a key or a
        value is passed on to self.process_pattern_value, which returns the
        file pattern currently being built.  Once the whole file has been
        read the mandatory defaults are checked, the last file pattern is
        validated and appended to self.file_pattern_list.

        Raises DTPOFileError when
            - the file cannot be read (IOError),
            - a line fails to parse (ParseError), or
            - a non-optional default listed in self.pattern_keys is still
              None after the file has been read.
        """
        #
        #	Now read through the file and set the parameters
        #
        current_file_pattern = None

        dtpo_log('debug', "Parsing pattern file -> '%s'", config_file)
        line_number = 0
        try :
            #   'with' closes the file even if a line fails part-way through
            with open(config_file) as pattern_file :
                for line in pattern_file :

                    line_number = line_number + 1
                    key, value = parse_line(line, line_number)

                    # if we found something then process it
                    if (key or value) :
                        current_file_pattern = self.process_pattern_value(
                                current_file_pattern,
                                key,
                                value)

            #   check that the defaults are there - do this first in case the
            #   file is corrupt - that way we fail gracefully
            for key in self.pattern_keys :
                key_spec = self.pattern_keys[key]
                #   'variable' names the attribute on self that holds the
                #   default; look it up directly rather than through eval
                if key_spec['type'] == 'default' and \
                    not key_spec['optional'] and \
                    getattr(self, key_spec['variable']) is None :
                    raise DTPOFileError(config_file, line_number,
                        "Missing default -> '{0}'".format(key))

            #   Validate that the last record is good & then add it to the list
            self.check_file_pattern_complete(current_file_pattern)
            self.file_pattern_list.append(current_file_pattern)


        except ParseError as parse_exception :
            raise DTPOFileError(
                config_file, line_number, parse_exception.message)

        except IOError as io_exception :
            #	Failed to access the config file
            raise DTPOFileError (config_file, 0,
                "Error accessing config file -> '{0}'" \
                .format(str(io_exception)))
Example #5
0
    def __init__(self, config_file) :
        """
            Build the parse spec from the pattern file config_file.

            All attributes are first reset to empty values, then the file
            is parsed and the reference lists are created from the result.
        """
        dtpo_log('debug', 'DTPOParseSpec. Source File -> %s', config_file)

        #   Defaults start out unset
        self.default_database = self.default_group = self.default_tag = None

        #   Containers start out empty
        self.file_pattern_list = list()
        self.string1_search_dict = dict()
        self.string2_search_dict = dict()
        self.date_search_dict = dict()

        self.parse_pattern_file(config_file)
        self.create_reference_lists()
Example #6
0
def get_import_parameters(source_file, pattern_spec, test_parse) :
    """
        Work out the import parameters for source_file using pattern_spec.

        The file is run through TextExtractor, the extracted content is
        handed to parse_source_file, and the extractor's file type and mime
        type are copied on to the result before it is returned.  No import
        is carried out here.
    """

    dtpo_log('debug', "get_import_parameters source_file -> %s", source_file)

    #   Extract the text from the file
    extractor = TextExtractor(source_file, test_parse = test_parse)

    #   Match the extracted text against the spec
    parameters = parse_source_file(extractor, pattern_spec)

    #   Carry the type information across from the extractor
    parameters.file_type = extractor.file_type
    parameters.mime_type = extractor.mime_type

    return parameters
Example #7
0
def parse_line(line, line_number) :
    """
    Split a 'key::value' line into a (key, value) tuple.

    Leading whitespace on the line is ignored and trailing whitespace is
    removed from the value.  Lines that do not match the pattern (for
    instance lines starting with #) give (False, "").
    """

    dtpo_log('debug', "%04d -> '%s'", line_number, line)

    #   Want lines that dont start with #
    matched = re.match('(^[^#]*)::(.[^#]*)', line.lstrip())

    #   Nothing usable on this line
    if not matched :
        return (False, "")

    raw_key, raw_value = matched.groups()
    key = raw_key.lstrip()
    value = raw_value.rstrip()

    dtpo_log('debug', "key -> '%s', value -> '%s'", key, value)

    return (key, value)
Example #8
0
def main() :
    """
    Get the command line arguments, load the configuration and then process
    each source file named on the command line.

    Options
        -d / --debug    sets opts.debug
        --config_file   config file to load (mandatory)
        --test_parse    print the import details instead of importing

    Raises SystemExit if the config file is missing or the config / pattern
    file cannot be parsed.

    Failures on an individual file are logged and alerted and do not stop
    the remaining files from being processed:
        DTPOFileError   the file is left where it is
        ParseError      the file is handed to orphan_file
        anything else   logged and alerted as fatal
    """
    p = optparse.OptionParser()
    p.add_option("-d", action="store_true", dest="debug")
    p.add_option("--debug", action="store_true", dest="debug")
    p.add_option("--config_file", action="store", dest="config_file")
    p.add_option("--test_parse", action="store_true", dest="test_parse")
    p.set_defaults(debug = False)

    opts, source_file_args = p.parse_args()

    try :
        # Config File is mandatory
        if not opts.config_file :
            raise ParseError("No Config file")
        #
        #    Upload the configs
        #
        Config(opts.config_file)
        pattern_spec = DTPOParseSpec(Config.config.get_pattern_file())
    except DTPOFileError as file_error:
        dtpo_alert(log_type = 'fatal', reason = file_error.message)
        raise SystemExit("FATAL ERROR - Failed to parse config file")
    except ParseError as parse_error :
        dtpo_alert('fatal', reason = parse_error.message)
        raise SystemExit("FATAL ERROR - Failed to parse pattern file")

    #
    #    Now iterate through the files
    #
    for source_file in source_file_args:
        dtpo_log('info', "Started processing -> %s", source_file)

        try :

            #  TODO - we're assuming PDF files here
            #  Check that the file name actually ends in
            #  pdf if not rename it as it will save trouble with DTPO later
            suffix = source_file[-3:]
            if suffix.lower() != 'pdf' :
                dtpo_log('debug', "Adding pdf suffix on to '%s'",
                         source_file)
                source_dir = Config.config.get_source_directory() + '/'
                os.rename(source_dir + source_file,
                          source_dir + source_file + '.pdf')
                source_file += '.pdf'
            #
            #    Convert the file to text if we can and then parse it
            #
            import_details = get_import_parameters(source_file, pattern_spec,
                                                   opts.test_parse)
            if opts.test_parse :
                import_details.print_import_details(source_file)
            else :
                execute_import(import_details)
                trash_file(source_file, import_details.get_document_name())
                dtpo_alert('info',
                           file_name = import_details.get_document_name(),
                           group_name = import_details.group)
        except DTPOFileError as file_error :
            #    We failed ... Leave the file be as there is a problem with it
            dtpo_log('error', "Import failed for '%s' - file not touched\n%s",
                basename(source_file), file_error.message)
            dtpo_alert('fatal', reason = file_error.message,
                       file_name = source_file)

        except ParseError as parse_error :
            #    We failed ... Move the file to the Orphan directory
            dtpo_log('error', "Import failed for '%s' - orphaning file\n%s",
                basename(source_file), parse_error.message)
            dtpo_alert('error', reason = parse_error.message,
                       file_name = source_file)
            orphan_file(source_file)
        except Exception as exception :
            #   Something horrible has happend
            dtpo_log('fatal', "System error for '%s'\n%s",
                     basename(source_file), str(exception))
            dtpo_alert('fatal', reason = str(exception),
                       file_name = source_file)
        else :
            #   Only report success when none of the handlers above fired
            dtpo_log('debug', 'Completed Successfully')
Example #9
0
def execute_import(import_parameters) :
    """
        Now run the actual import into DTPO

        Steps, each guarded separately so the error names the failing step:
            1. find the target database among the open ones (matched on
               path), opening it if it is not there
            2. create / locate the destination group in that database
            3. import the source file into the group under the document name
            4. mark the document unread, set its tags, blank its URL and
               raise a 'warn' alert if DTPO reports duplicates

        import_parameters must carry source_file, file_type, mime_type,
        group and tags (asserted), plus database and get_document_name().

        Returns True on success.

        Raises ParseError when a step fails with an AttributeError, and a
        plain Exception wrapping the type and text of anything else.
    """

    assert import_parameters.source_file
    assert import_parameters.file_type
    assert import_parameters.mime_type
    assert import_parameters.group
    assert import_parameters.tags

    source_file = import_parameters.source_file
    database = Config.config.get_database_directory() + '/' + \
        import_parameters.database
    document_name = import_parameters.get_document_name()

    dtpo_log('info', "execute_import source file -> %s", source_file)
    dtpo_log('info', "execute_import database -> %s", database)
    dtpo_log('info', "execute_import group -> %s", import_parameters.group)
    dtpo_log('info', "execute_import tags -> %s", import_parameters.tags)
    dtpo_log('info', "execute_import document name -> %s", document_name)

    try :
        try :
            #   First see if the relevant database is open already
            dtpo_db_id = None
            dt = app(u'DEVONthink Pro')
            for dtpo_db in dt.databases.get() :
                if dtpo_db.path() == database :
                    dtpo_db_id = dtpo_db.id()
                    break
            if dtpo_db_id is None :
                dtpo_db = dt.open_database(database)
                dtpo_db_id = dtpo_db.id()

        except AttributeError as attribute_error :
            message = "Failed to open database {0} -> {1}".format(
                import_parameters.database, str(attribute_error))
            raise ParseError(message)

        try :
            dtpo_group = dt.create_location(
                import_parameters.group,
                in_=app.databases.ID(dtpo_db_id))
            # get the group to check that it's there
            dtpo_group.id()
        except AttributeError as attribute_error :
            message = "Failed access group {0} -> {1}".format(
                import_parameters.group, str(attribute_error))
            raise ParseError(message)

        try :
            doc = dt.import_(
                import_parameters.source_file,
                name = document_name,
                to = dtpo_group)

            docid = doc.id()
        except AttributeError as attribute_error :
            message = "Failed import document {0} -> {1}".format(
                document_name, str(attribute_error))
            raise ParseError(message)

        try :
            #   Build the reference to the imported document once and reuse it
            doc_ref = dt.databases.ID(dtpo_db_id).contents.ID(docid)
            doc_ref.unread.set(True)
            doc_ref.tags.set(import_parameters.tags)
            doc_ref.URL.set('')
            duplicate = doc_ref.number_of_duplicates.get()
            if int(duplicate) > 0 :
                dtpo_alert('warn',
                           reason = '{0} duplicates of '.format(duplicate),
                           file_name = document_name)
        except AttributeError as attribute_error :
            message = "Failed set attributes {0} -> {1}".format(
                import_parameters.get_document_name(), str(attribute_error))
            raise ParseError(message)

    except ParseError :
        #   Already in the form callers expect - pass it on with its
        #   original traceback intact
        raise
    except Exception as exception :
        ex_type = type(exception)
        message = "Unexpected exception {0} -> {1}".format(
            ex_type, str(exception))
        raise Exception(message)

    return True