Ejemplo n.º 1
0
    def command(self):
        '''Run the sub-command named by the first positional argument.

        Calls ``self._load_config()``, stores a ``Tika_Wrapper_Singleton``
        instance on ``self.tika_parser`` and then dispatches to ``show``,
        ``import`` or ``help``.  The remaining positional arguments are
        passed on to the sub-command.  When no sub-command is given, or it
        is not a known one, the help text is printed instead.
        '''
        self._load_config()
        self.tika_parser = Tika_Wrapper_Singleton()

        options = {
            'show': self.showCmd,
            'import': self.importCmd,
            'help': self.helpCmd,
        }

        # Only the lookup is guarded: an IndexError/KeyError raised inside a
        # sub-command is a real failure and must not be mistaken for a bad
        # command line.
        try:
            handler = options[self.args[0]]
        except (IndexError, KeyError):
            self.helpCmd()
            return

        handler(*self.args[1:])
Ejemplo n.º 2
0
class NeatCommand(ckan.lib.cli.CkanCommand):
    '''Command to import NEAT data

    Usage::

            paster --plugin="ckanext-neat" neat show /tmp/neat-files -c <path to config file>
            paster --plugin="ckanext-neat" neat import /tmp/neat-files -c <path to config file>

    '''
    # Short description: the first line of the class docstring above.
    summary = __doc__.split('\n')[0]
    # The complete class docstring is reused as the usage text.
    usage = __doc__

    def command(self):
        '''Run the sub-command named by the first positional argument.

        Calls ``self._load_config()``, stores a ``Tika_Wrapper_Singleton``
        instance on ``self.tika_parser`` and then dispatches to ``show``,
        ``import`` or ``help``.  The remaining positional arguments are
        passed on to the sub-command.  When no sub-command is given, or it
        is not a known one, the help text is printed instead.
        '''
        self._load_config()
        self.tika_parser = Tika_Wrapper_Singleton()

        options = {
            'show': self.showCmd,
            'import': self.importCmd,
            'help': self.helpCmd,
        }

        # Only the lookup is guarded: an IndexError/KeyError raised inside a
        # sub-command is a real failure and must not be mistaken for a bad
        # command line.
        try:
            handler = options[self.args[0]]
        except (IndexError, KeyError):
            self.helpCmd()
            return

        handler(*self.args[1:])

    def helpCmd(self):
        '''Print this object's ``__doc__`` attribute.'''
        usage_text = self.__doc__
        print(usage_text)

    def _ckan_connect(self):
        '''Return a ``MyLocalCKAN`` client built with a fixed user context.'''
        # A remote instance could be used instead, e.g.
        # ckanapi.RemoteCKAN('http://neat.lo', apikey=...); the API key must
        # come from configuration, never from the source file.
        user_context = {'user': '******'}
        return MyLocalCKAN(context=user_context)

    def showCmd(self, path=None):
        '''Print the packages and resources found directly below ``path``.

        Every immediate sub-directory of ``path`` is printed as a package
        name, followed by each entry in it for which ``os.path.isfile()``
        is true (``Thumbs.db`` excluded) as a resource.  When ``path`` is
        ``None`` a message and the help text are printed and the process
        exits with status 1.
        '''
        if path is None:
            print("Argument 'path' must be set")
            self.helpCmd()
            sys.exit(1)

        # Only the first result of os.walk() is used, i.e. the listing of
        # ``path`` itself; nothing is printed when the walk yields nothing.
        top_level = next(os.walk(path), None)
        if top_level is None:
            return

        for package_dir in top_level[1]:
            print("Package Name: %s" % package_dir)
            package_path = os.path.join(path, package_dir)
            for entry in os.listdir(package_path):
                entry_path = os.path.join(package_path, entry)
                if not os.path.isfile(entry_path) or entry == 'Thumbs.db':
                    continue
                print("Ressource: %s" % entry)
    
    def importCmd(self, path=None):
        '''Import the PDF files found in the directories below ``path``.

        Walks ``path`` with ``os.walk()`` and processes every directory
        below it (files located directly in ``path`` are not visited).
        Each ``*.pdf`` file is handed to ``_import_file()``.

        Failures are printed as tracebacks and do not stop the import: a
        file that cannot be imported does not prevent the remaining files
        of the same directory from being processed.

        :param path: root directory to import from.  When ``None``, a
            message and the help text are printed and the process exits
            with status 1.
        '''
        # Validate the arguments first so that a bad command line fails
        # before a CKAN connection is created.
        if path is None:
            print("Argument 'path' must be set")
            self.helpCmd()
            sys.exit(1)

        self.ckan = self._ckan_connect()

        for root, dirs, _files in os.walk(path):
            for dir_name in dirs:
                try:
                    dir_path = os.path.join(root, dir_name)
                    print("dir_path: %s" % dir_path)
                    file_names = os.listdir(dir_path)
                except Exception:
                    # Best effort: report the unreadable directory, go on.
                    traceback.print_exc()
                    continue

                for file_name in file_names:
                    # Failures are isolated per file so that one broken PDF
                    # does not skip its siblings.
                    try:
                        self._import_file(dir_path, file_name)
                    except Exception:
                        traceback.print_exc()

    def _import_file(self, dir_path, file_name):
        '''Import one directory entry; anything but a ``.pdf`` file is skipped.'''
        file_path = os.path.join(dir_path, file_name)
        if not file_path.endswith('.pdf') or not os.path.isfile(file_path):
            return

        # NOTE(review): everything after the first dot is dropped, so
        # 'a.b.pdf' is paired with 'a.xml' -- confirm this is intended.
        base_name = file_name.split('.')[0]
        meta_xml_path = os.path.join(dir_path, base_name + '.xml')

        metadata = self._parse_metadata(meta_xml_path)

        # read fulltext with tika
        metadata['full_text_search'] = self.tika_parser.parse_with_tika(file_path)
        print("FULLTEXT: %s" % metadata['full_text_search'])

        # The key may exist with a None value, in which case the default
        # passed to .get() would not apply.
        source = metadata.get('source') or ''

        # add tags to structure
        tags = [
            source.replace('#', ' ').replace('-', ' '),
            metadata.get('contributor'),
            metadata.get('creator'),
            metadata.get('publisher'),
            metadata.get('pdf_image_color_mode'),
            metadata.get('pdf_image_color_space'),
            metadata.get('pdf_image_format'),
            metadata.get('pdf_image_resolution'),
        ]
        tags = [munge_tag(tag) for tag in tags if tag]
        metadata['tags'] = [{'name': tag} for tag in set(tags)]

        pkg = self._create_or_update_package(base_name, metadata)
        self._attach_file(pkg['id'], file_name, file_name, file_path, metadata, 'PDF')
        self._attach_file(pkg['id'], base_name + '.xml', 'Metadata XML', meta_xml_path, format='XML')