def command(self):
    '''Dispatch the paster invocation to the requested sub-command.

    Calls ``self._load_config()``, creates the Tika wrapper used for
    full-text extraction and then runs the handler selected by the first
    positional argument (``show``, ``import`` or ``help``). Any remaining
    positional arguments are forwarded to that handler. When the
    sub-command is missing or unknown, the help text is printed instead.
    '''
    self._load_config()
    self.tika_parser = Tika_Wrapper_Singleton()
    options = {
        'show': self.showCmd,
        'import': self.importCmd,
        'help': self.helpCmd,
    }
    # Guard only the lookup: an IndexError/KeyError raised *inside* a
    # sub-command must surface as a real error rather than being mistaken
    # for a bad command name and silently replaced by the help text.
    try:
        handler = options[self.args[0]]
    except (IndexError, KeyError):
        self.helpCmd()
        return
    handler(*self.args[1:])
class NeatCommand(ckan.lib.cli.CkanCommand):
    '''Command to import NEAT data

    Usage::

        paster --plugin="ckanext-neat" neat show /tmp/neat-files -c <path to config file>
        paster --plugin="ckanext-neat" neat import /tmp/neat-files -c <path to config file>

    '''
    # The class docstring doubles as runtime data: its first line is the
    # command summary and the full text is the usage/help output.
    summary = __doc__.split('\n')[0]
    usage = __doc__

    def command(self):
        '''Dispatch the paster invocation to the requested sub-command.

        Calls ``self._load_config()``, creates the Tika wrapper used for
        full-text extraction and then runs the handler selected by the
        first positional argument (``show``, ``import`` or ``help``). Any
        remaining positional arguments are forwarded to that handler. When
        the sub-command is missing or unknown, the help text is printed.
        '''
        self._load_config()
        self.tika_parser = Tika_Wrapper_Singleton()
        options = {
            'show': self.showCmd,
            'import': self.importCmd,
            'help': self.helpCmd,
        }
        # Guard only the lookup: an IndexError/KeyError raised *inside* a
        # sub-command must surface as a real error rather than being
        # mistaken for a bad command name and replaced by the help text.
        try:
            handler = options[self.args[0]]
        except (IndexError, KeyError):
            self.helpCmd()
            return
        handler(*self.args[1:])

    def helpCmd(self):
        '''Print the usage text (the class docstring).'''
        print(self.__doc__)

    def _ckan_connect(self):
        '''Return the CKAN API client used by the import.'''
        # NOTE: a ckanapi.RemoteCKAN client could be returned here instead;
        # if so, read the API key from configuration rather than committing
        # it to the source tree.
        return MyLocalCKAN(context={'user': '******'})

    def _require_path(self, path):
        '''Print an error plus the usage text and exit(1) if path is None.'''
        if path is None:
            print("Argument 'path' must be set")
            self.helpCmd()
            sys.exit(1)

    def showCmd(self, path=None):
        '''List the packages and resources found directly below ``path``.

        Every immediate sub-directory of ``path`` is reported as a package
        and every regular file inside it (except ``Thumbs.db``) as a
        resource. Nothing is written to CKAN.

        :param path: directory holding one sub-directory per package
        '''
        self._require_path(path)
        for root, dirs, files in os.walk(path):
            for dir_name in dirs:
                print("Package Name: %s" % dir_name)
                dir_path = os.path.join(root, dir_name)
                for file_name in os.listdir(dir_path):
                    is_resource = (
                        os.path.isfile(os.path.join(dir_path, file_name))
                        and file_name != 'Thumbs.db'
                    )
                    if is_resource:
                        print("Ressource: %s" % file_name)
            # Only the first os.walk() entry (the top level) is inspected.
            break

    def importCmd(self, path=None):
        '''Import every PDF found in the directories below ``path``.

        All directories reported by ``os.walk(path)`` are visited. Each
        ``*.pdf`` file in such a directory is imported together with the
        metadata XML of the same base name. An exception while handling a
        directory is printed as a traceback and the import continues with
        the next directory.

        :param path: root directory of the files to import
        '''
        self.ckan = self._ckan_connect()
        self._require_path(path)
        for root, dirs, files in os.walk(path):
            for dir_name in dirs:
                try:
                    dir_path = os.path.join(root, dir_name)
                    print("dir_path: %s" % dir_path)
                    for file_name in os.listdir(dir_path):
                        file_path = os.path.join(dir_path, file_name)
                        if (file_path.endswith('.pdf')
                                and os.path.isfile(file_path)):
                            self._import_pdf(dir_path, file_name, file_path)
                except Exception:
                    # Best effort: report the failure and carry on with the
                    # next directory instead of aborting the whole import.
                    traceback.print_exc()

    def _import_pdf(self, dir_path, file_name, file_path):
        '''Create/update the package for one PDF and attach PDF and XML.'''
        # The base name is everything before the FIRST dot, so "a.b.pdf"
        # is paired with "a.xml" and imported as package "a".
        base_name = file_name.split('.')[0]
        meta_xml_path = os.path.join(dir_path, base_name + '.xml')
        metadata = self._parse_metadata(meta_xml_path)

        # read fulltext with tika
        metadata['full_text_search'] = self.tika_parser.parse_with_tika(
            file_path
        )
        print("FULLTEXT: %s" % metadata['full_text_search'])

        # add tags to structure
        tag_candidates = [
            metadata.get('source', '').replace('#', ' ').replace('-', ' '),
            metadata.get('contributor'),
            metadata.get('creator'),
            metadata.get('publisher'),
            metadata.get('pdf_image_color_mode'),
            metadata.get('pdf_image_color_space'),
            metadata.get('pdf_image_format'),
            metadata.get('pdf_image_resolution'),
        ]
        # Truthiness already excludes None and empty strings.
        tags = [munge_tag(tag) for tag in tag_candidates if tag]
        # set() drops duplicate tag names.
        metadata['tags'] = [{'name': tag} for tag in set(tags)]

        pkg = self._create_or_update_package(base_name, metadata)
        self._attach_file(
            pkg['id'], file_name, file_name, file_path, metadata, 'PDF'
        )
        self._attach_file(
            pkg['id'], base_name + '.xml', 'Metadata XML', meta_xml_path,
            format='XML'
        )