def export_csv_to_rdf(proj_name, input_file, encoding, json_file, output_file):
    """Upload a CSV to OpenRefine, apply operations from JSON, and export it.

    A project is created through the ``create-project-from-upload`` command;
    the new project's id is read from the ``project`` query parameter of the
    URL the server response ends up at.

    Args:
        proj_name: passed to ``get_options`` together with ``input_file``.
        input_file: passed to ``get_options`` together with ``proj_name``.
        encoding: sent to the server as the ``encoding`` import option.
        json_file: handed to ``apply_operations`` for the created project.
        output_file: handed to ``export_project``; also logged when truthy.

    Raises:
        Exception: if the response URL carries no ``project`` parameter.
    """
    server = refine.RefineServer()
    LOGGER.debug("Connected to OpenRefine")

    options_json = get_options(proj_name, input_file)
    params = {'options': json.dumps({'encoding': encoding})}
    resp = server.urlopen('create-project-from-upload', options_json, params)

    query_string = urlparse.urlparse(resp.geturl()).query
    url_params = urlparse.parse_qs(query_string)
    if 'project' not in url_params:
        raise Exception('Project not created')

    project_id = url_params['project'][0]
    LOGGER.debug("Created project with project id: %s", project_id)
    proj = refine.RefineProject(project_id)
    update_project_file(project_id)

    apply_operations(proj, json_file)
    export_project(proj, output_file)
    if output_file:
        LOGGER.debug("RDF exported to: %s", output_file)
def setUp(self):
    # Every test gets a server handle and a Refine client; a project is
    # only created when the test class names a project file.
    self.server = refine.RefineServer()
    self.refine = refine.Refine(self.server)
    if not self.project_file:
        return
    self.project = self.refine.new_project(
        project_file=self.project_file,
        project_format=self.project_format,
        **self.project_options)
def test_init(self):
    # The port is only appended to the expected URL when it is not '80'.
    port_suffix = '' if refine.REFINE_PORT == '80' else ':' + refine.REFINE_PORT
    expected_url = 'http://' + refine.REFINE_HOST + port_suffix
    self.assertEqual(self.server.server, expected_url)
    self.assertEqual(refine.RefineServer.url(), expected_url)
    # strip trailing /
    with_slash = refine.RefineServer('http://refine.example/')
    self.assertEqual(with_slash.server, 'http://refine.example')
def info(project_id):
    """Print id, name, created and modified for the project with this id.

    Prints nothing when no listed project has the given id.
    """
    listing = refine.Refine(refine.RefineServer()).list_projects().items()
    for candidate_id, metadata in listing:
        if candidate_id != project_id:
            continue
        print('{0}: {1}'.format('id', candidate_id))
        for field in ('name', 'created', 'modified'):
            print('{0}: {1}'.format(field, metadata[field]))
def set_up(self):
    # Connect first; a project is only created when a project file is set.
    # The fixture is imported with '|' as the column separator.
    self.server = refine.RefineServer()
    self.refine = refine.Refine(self.server)
    if not self.project_file:
        return
    self.project = self.refine.new_project(
        project_file=self.project_path(),
        project_format=self.project_format,
        separator='|',
        **self.project_options)
def list_projects():
    """Query the Refine server and list projects as ``ID: name``.

    Projects are printed most recently modified first, one per line, with
    the id right-aligned in a 14 character column.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()

    def date_to_epoch(json_dt):
        """Convert a JSON date time into seconds-since-epoch."""
        return time.mktime(time.strptime(json_dt, '%Y-%m-%dT%H:%M:%SZ'))

    # sorted() accepts any iterable, so this also works where items() is a
    # dict view without a .sort() method.
    newest_first = sorted(
        projects.items(),
        key=lambda item: date_to_epoch(item[1]['modified']),
        reverse=True)
    for project_id, project_info in newest_first:
        print('{0:>14}: {1}'.format(project_id, project_info['name']))
def connect(self):
    """Configure the refine module from settings and return a Refine client.

    Raises:
        CommandError: if GOOGLE_REFINE_HOST or GOOGLE_REFINE_PORT is missing
            from settings, or if contacting the server raises URLError.
    """
    try:
        refine.REFINE_HOST = settings.GOOGLE_REFINE_HOST
        refine.REFINE_PORT = settings.GOOGLE_REFINE_PORT
    except AttributeError:
        raise CommandError(
            'Set GOOGLE_REFINE_HOST and GOOGLE_REFINE_PORT in settings')

    try:
        server = refine.RefineServer()
        client = refine.Refine(server)
        # Asking for the version is the reachability probe.
        server.get_version()
    except URLError:
        raise CommandError('Google Refine server is not reachable.')
    else:
        return client
def ls():
    """Query the server and list projects sorted by mtime."""

    def modified_epoch(entry):
        """Return the 'modified' stamp of an (id, info) pair as epoch seconds."""
        stamp = time.strptime(entry[1]['modified'], '%Y-%m-%dT%H:%M:%SZ')
        return time.mktime(stamp)

    projects = refine.Refine(refine.RefineServer()).list_projects().items()
    projects.sort(key=modified_epoch, reverse=True)
    if not projects:
        print('Error: No projects found')
        return
    for project_id, project_info in projects:
        line = u'{0:>14}: {1}'.format(project_id, project_info['name'])
        print(line.encode('utf-8'))
def info(project_id):
    """Print metadata and column names for one project.

    Output is the id, the project URL, every truthy metadata value from the
    project listing, and the column names numbered from 001. An error message
    is printed instead when the id is not in the listing.
    """
    projects = refine.Refine(refine.RefineServer()).list_projects()
    if project_id not in projects:
        print('Error: No project found with id %s.\n'
              'Check existing projects with command --list' % (project_id))
        return

    project_url = ('http://' + refine.REFINE_HOST + ':' + refine.REFINE_PORT +
                   '/project?project=' + project_id)
    print('{0:>20}: {1}'.format('id', project_id))
    print('{0:>20}: {1}'.format('url', project_url))
    for key, value in projects[project_id].items():
        if value:
            print(u'{0:>20}: {1}'.format(key, value))

    project_model = refine.RefineProject(project_id).get_models()
    column_names = [
        column['name'] for column in project_model['columnModel']['columns']
    ]
    for position, column_name in enumerate(column_names, start=1):
        label = u'column ' + str(position).zfill(3)
        print(u'{0:>20}: {1}'.format(label, column_name))
# @in to:['Waldorf Astoria','Hamburg Amerika Linie','Norddeutscher Lloyd Bremen']
# @in Json_History_id
# @in function:getToValue
# @in function:getFromValue
# @out outputFile @uri file: PartTest.tsv
# @out projectID
# @out projectNoRows

# @begin CreateProject @desc create project from file
# @in csvFile @uri file: partTest.csv
# @in refinePythonFile @uri file: refine.py
# @out projectID
# @out projectNoRows
from google.refine import refine
projectID=refine.Refine(refine.RefineServer()).new_project('partTest.csv','HalfMenuDataset','.csv')[1]
# print(refine.myParser('--list'))
# @end CreateProject

'''insert a function to automatically get 'from' '''
def getFromValue(computeCluster):
    """Return, for each cluster, the list of its members' 'value' entries.

    ``computeCluster`` is iterated as a sequence of clusters, each a sequence
    of mappings that have a 'value' key.
    """
    return [[member['value'] for member in cluster]
            for cluster in computeCluster]
def main():
    """Command line interface.

    Host and port are taken from the docker link environment variables
    first and then overridden by options.host / options.port. When the first
    positional argument is not all digits it is treated as a project name and
    replaced by the id of the project with that name.

    Returns:
        The RefineProject operated on by the apply, export/output or info
        command; None otherwise.

    Raises:
        Exception: if the given project name matches more than one project
            or no project at all.
    """
    # get environment variables in docker network
    docker_host = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_ADDR')
    if docker_host:
        os.environ["OPENREFINE_HOST"] = docker_host
        refine.REFINE_HOST = docker_host
    docker_port = os.environ.get('OPENREFINE_SERVER_PORT_3333_TCP_PORT')
    if docker_port:
        os.environ["OPENREFINE_PORT"] = docker_port
        refine.REFINE_PORT = docker_port

    options, args = PARSER.parse_args()

    # Show usage when none of the command options (group2/group3) was given.
    commands_dict = {
        group2_arg.dest: getattr(options, group2_arg.dest)
        for group2_arg in group2.option_list
    }
    commands_dict.update({
        group3_arg.dest: getattr(options, group3_arg.dest)
        for group3_arg in group3.option_list
    })
    commands_dict = {k: v for k, v in commands_dict.items() if v is not None}
    if not commands_dict:
        PARSER.print_usage()
        return

    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # Resolve a project name to its id.
    if args and not str.isdigit(args[0]):
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(project_id) for project_id, project_info in projects
            if args[0] == project_info['name']
        ]
        if len(idlist) > 1:
            raise Exception(
                'Found at least two projects. Please specify project by id.')
        if not idlist:
            # Previously this fell through to idlist[0] and surfaced as a
            # bare IndexError with no hint about the cause.
            raise Exception('No project found with name %s.' % args[0])
        args[0] = idlist[0]

    if options.list:
        list_projects()
    if options.create:
        create_project(options)
    if options.delete:
        project = refine.RefineProject(args[0])
        project.delete()
    if options.apply:
        project = refine.RefineProject(args[0])
        response = project.apply_operations(options.apply)
        if response != 'ok':
            sys.stderr.write('Failed to apply %s: %s\n'
                             % (options.apply, response))
        return project
    if options.export or options.output:
        project = refine.RefineProject(args[0])
        export_project(project, options)
        return project
    if options.info:
        info(args[0])
        project = refine.RefineProject(args[0])
        return project
def create_project(options):
    """Create a new project from the file named by options.create.

    The input format is guessed from the file extension ('txt' becomes
    'fixed-width' when options.columnWidths is set, otherwise 'line-based')
    unless options.input_format is given. The per-format defaults below are
    then overridden by the user's group4 options, with the strings 'true' and
    'false' converted to booleans, and passed to new_project.

    Raises:
        KeyError: if the resulting format has no entry in the defaults table.
    """
    # general defaults are defined in google_refine/refine/refine.py new_project
    # additional defaults for each file type
    defaults = {
        'xml': {'project_format': 'text/xml', 'recordPath': 'record'},
        'csv': {'project_format': 'text/line-based/*sv', 'separator': ','},
        'tsv': {'project_format': 'text/line-based/*sv', 'separator': '\t'},
        'line-based': {
            'project_format': 'text/line-based',
            'skipDataLines': -1,
        },
        'fixed-width': {
            'project_format': 'text/line-based/fixed-width',
            'headerLines': 0,
        },
        'json': {'project_format': 'text/json', 'recordPath': ('_', '_')},
        'xls': {'project_format': 'binary/text/xml/xls/xlsx', 'sheets': 0},
        'xlsx': {'project_format': 'binary/text/xml/xls/xlsx', 'sheets': 0},
        'ods': {'project_format': 'text/xml/ods', 'sheets': 0},
    }

    # guess format from file extension (or legacy option --format)
    input_format = os.path.splitext(options.create)[1][1:].lower()
    if input_format == 'txt':
        if options.columnWidths:
            input_format = 'fixed-width'
        else:
            input_format = 'line-based'
    if options.input_format:
        input_format = options.input_format

    # defaults for selected format
    input_dict = dict(defaults[input_format])

    # user input
    user_values = {
        group4_arg.dest: getattr(options, group4_arg.dest)
        for group4_arg in group4.option_list
    }
    # Unset options are dropped; 'true' / 'false' strings become booleans.
    overrides = {
        k: v for k, v in user_values.items()
        if v is not None and v not in ['true', 'false']
    }
    overrides.update({k: True for k, v in user_values.items() if v == 'true'})
    overrides.update(
        {k: False for k, v in user_values.items() if v == 'false'})

    # merge defaults with user input
    input_dict.update(overrides)
    input_dict['project_file'] = options.create
    refine.Refine(refine.RefineServer()).new_project(**input_dict)
def main():
    """Command line interface.

    Applies options.host / options.port, resolves options.project_id (an id
    when all digits, otherwise a project name) and dispatches to the first
    matching command; prints help when no command option is set.
    """
    options = parser.parse_args()

    def collect_kwargs(actions):
        """Turn the options behind ``actions`` into keyword arguments.

        Unset (None) values are dropped, the strings 'true' / 'false' become
        booleans, and options.file_format, when set, is added as
        'project_format'.
        """
        arg_dict = {arg.dest: getattr(options, arg.dest) for arg in actions}
        collected = {
            k: v
            for k, v in arg_dict.items()
            if v is not None and v not in ['true', 'false']
        }
        collected.update({k: True for k, v in arg_dict.items() if v == 'true'})
        collected.update(
            {k: False for k, v in arg_dict.items() if v == 'false'})
        if options.file_format:
            collected.update({'project_format': options.file_format})
        return collected

    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id
    # NOTE(review): project_id stays unbound when options.project_id is
    # empty, so the commands below that need it would raise NameError.
    if options.project_id and str.isdigit(options.project_id):
        project_id = options.project_id
    elif options.project_id:
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(listed_id) for listed_id, listed_info in projects
            if options.project_id == listed_info['name']
        ]
        if len(idlist) > 1:
            print('Error: Found {idlist} projects with name {name}.\n'
                  'Please specify project by id.'.format(
                      idlist=len(idlist), name=options.project_id))
            for i in idlist:
                print('')
                cli.info(i)
            return
        if not idlist:
            print('Error: No project found with name {}.\n'
                  'Try command --list'.format(options.project_id))
            return
        project_id = idlist[0]

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        kwargs = collect_kwargs(CreateGroup._group_actions)
        cli.create(options.create, **kwargs)
    # commands with args
    elif options.info:
        cli.info(project_id)
    elif options.delete:
        cli.delete(project_id)
    elif options.apply:
        cli.apply(project_id, options.apply)
    elif options.template:
        # NOTE(review): the keyword arguments are collected but never passed
        # to any templating call, so this branch currently produces no
        # output. Confirm the intended call before relying on --template.
        kwargs = collect_kwargs(TemplateGroup._group_actions)
    elif options.export or options.output:
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        parser.print_help()
def create(project_file,
           project_format=None,
           columnWidths=None,
           encoding=None,
           guessCellValueTypes=False,
           headerLines=None,
           ignoreLines=None,
           includeFileSources=False,
           limit=None,
           linesPerRow=None,
           processQuotes=True,
           projectName=None,
           projectTags=None,
           recordPath=None,
           separator=None,
           sheets=None,
           skipDataLines=None,
           storeBlankCellsAsNulls=True,
           storeBlankRows=True,
           storeEmptyStrings=True,
           trimStrings=False):
    """Create a new project from file.

    When ``project_format`` is not given it is guessed from the extension of
    ``project_file``. The short format name is then mapped to the format
    string sent to the server, and format-specific defaults are filled in
    for options the caller left falsy.

    Returns:
        The project returned by ``new_project``, after printing its id and
        row count.

    Raises:
        Exception: if the created project reports 0 rows.
    """
    # guess format from file extension
    if not project_format:
        project_format = os.path.splitext(project_file)[1][1:].lower()
        if project_format == 'txt':
            # Indexing probes whether column widths were supplied: the
            # default None raises TypeError, which selects 'line-based'.
            try:
                columnWidths[0]
                project_format = 'fixed-width'
            except TypeError:
                project_format = 'line-based'
    # defaults for each file type
    if project_format == 'xml':
        project_format = 'text/xml'
        if not recordPath:
            # Default record path is the tag of the document's root element.
            recordPath = [ElementTree.parse(project_file).getroot().tag]
    elif project_format == 'csv':
        project_format = 'text/line-based/*sv'
    elif project_format == 'tsv':
        project_format = 'text/line-based/*sv'
        if not separator:
            separator = '\t'
    elif project_format == 'line-based':
        project_format = 'text/line-based'
        # NOTE(review): `not skipDataLines` also replaces an explicit 0
        # with -1 — confirm this is intended.
        if not skipDataLines:
            skipDataLines = -1
    elif project_format == 'fixed-width':
        project_format = 'text/line-based/fixed-width'
        if not headerLines:
            headerLines = 0
    elif project_format == 'json':
        project_format = 'text/json'
        if not recordPath:
            recordPath = ['_', '_']
    elif project_format == 'xls':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'xlsx':
        project_format = 'binary/text/xml/xls/xlsx'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    elif project_format == 'ods':
        project_format = 'text/xml/ods'
        if not sheets:
            sheets = [0]
            # TODO: new format for sheets option introduced in OpenRefine 2.8
    # execute
    # vars() snapshots every local that exists at this point (here: the
    # parameters, possibly reassigned above). Any local introduced before
    # this line would also be forwarded to new_project, so do not add any.
    kwargs = {k: v for k, v in vars().items() if v is not None}
    # Some options are passed under snake_case names as well as under their
    # camelCase names via **kwargs — presumably new_project accepts both
    # spellings; verify against the client library.
    project = refine.Refine(refine.RefineServer()).new_project(
        guess_cell_value_types=guessCellValueTypes,
        ignore_lines=ignoreLines,
        header_lines=headerLines,
        skip_data_lines=skipDataLines,
        store_blank_rows=storeBlankRows,
        process_quotes=processQuotes,
        project_name=projectName,
        store_blank_cells_as_nulls=storeBlankCellsAsNulls,
        include_file_sources=includeFileSources,
        **kwargs)
    rows = project.do_json('get-rows')['total']
    if rows > 0:
        print('{0}: {1}'.format('id', project.project_id))
        print('{0}: {1}'.format('rows', rows))
        return project
    else:
        raise Exception(
            'Project contains 0 rows. Please check --help for mandatory '
            'arguments for xml, json, xlsx and ods')
# Collapse the data file (header line skipped) into records of up to 50
# stripped lines each, joined with '#####', one record per output line.
# Joining avoids rstrip('#####'), which strips every trailing '#' character
# rather than a single delimiter and so ate data ending in '#' and trailing
# empty fields.
batch = []
with open(DATA, 'rb') as inp:
    with open(PROCESSED, 'wb') as out:
        next(inp, None)  # skip the header line
        for line in inp:
            if len(batch) == 50:
                out.write('#####'.join(batch) + '\n')
                batch = []
            batch.append(line.strip())
        if batch:
            out.write('#####'.join(batch) + '\n')

# Now create the sample project using sampled data
sampling_ratio = 0.1
no_of_sampled_lines = int(sampling_ratio * no_of_lines)
# The sample file starts as a copy of the header; shuf appends the rows.
subprocess.Popen(['cp', HEADER, SAMPLED]).communicate()
with open(SAMPLED, 'ab') as out:
    subprocess.Popen(['shuf', '-n', str(no_of_sampled_lines), DATA],
                     stdout=out).communicate()

server = refine.RefineServer()
# NOTE: this rebinds the name `refine` from the module to the client
# instance; kept as-is because later code may rely on it.
refine = refine.Refine(server)
project = refine.new_project(project_file=SAMPLED,
                             project_format=_format,
                             project_options=_options)
print("Done")
print("Open: " + project.project_url())
def main():
    """Command line interface.

    Applies options.host / options.port, resolves the first positional
    argument to a project id (looking it up by name when it is not all
    digits) and dispatches to the first matching command; prints usage when
    nothing matches.
    """
    options, args = PARSER.parse_args()

    def typed_options(option_group):
        """Return the group's set options with 'true'/'false' as booleans."""
        raw = {
            opt.dest: getattr(options, opt.dest)
            for opt in option_group.option_list
        }
        typed = {
            k: v
            for k, v in raw.items()
            if v is not None and v not in ['true', 'false']
        }
        typed.update({k: True for k, v in raw.items() if v == 'true'})
        typed.update({k: False for k, v in raw.items() if v == 'false'})
        return typed

    # set environment
    if options.host:
        refine.REFINE_HOST = options.host
    if options.port:
        refine.REFINE_PORT = options.port

    # get project_id
    if args and not str.isdigit(args[0]):
        projects = refine.Refine(refine.RefineServer()).list_projects().items()
        idlist = [
            str(listed_id) for listed_id, listed_info in projects
            if args[0].decode('UTF-8') == listed_info['name']
        ]
        if len(idlist) > 1:
            print('Error: Found %s projects with name %s.\n'
                  'Please specify project by id.' % (len(idlist), args[0]))
            for i in idlist:
                print('')
                cli.info(i)
            return
        if not idlist:
            print('Error: No project found with name %s.\n'
                  'Try command --list' % args[0])
            return
        project_id = idlist[0]
    elif args:
        project_id = args[0]

    # commands without args
    if options.list:
        cli.ls()
    elif options.download:
        cli.download(options.download, output_file=options.output)
    elif options.create:
        kwargs = typed_options(group5)
        if options.file_format:
            kwargs.update({'project_format': options.file_format})
        cli.create(options.create, **kwargs)
    # commands with args
    elif args and options.info:
        cli.info(project_id)
    elif args and options.delete:
        cli.delete(project_id)
    elif args and options.apply:
        cli.apply(project_id, options.apply)
    elif args and options.template:
        kwargs = typed_options(group6)
        cli.templating(project_id,
                       options.template,
                       output_file=options.output,
                       **kwargs)
    elif args and (options.export or options.output):
        cli.export(project_id,
                   output_file=options.output,
                   export_format=options.file_format)
    else:
        PARSER.print_usage()