def import_csv(dataset, url, args):
    """ Import the csv data into the dataset """
    csv_data_url, source_url = url
    source = Source(dataset, shell_account(), csv_data_url)

    # Analyse the csv data and add it to the source
    # If we don't analyse it we'll be left with a weird message
    source.analysis = analyze_csv(csv_data_url)

    # Check to see if the dataset already has this source
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break

    db.session.add(source)
    db.session.commit()

    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))

    # Check if imported from the file system (source and data url differ)
    if csv_data_url != source_url:
        # If we did, then we must update the source url based on the
        # sources in the dataset model (so we need to fetch the source again
        # or else we'll add a new one)
        source = Source.by_id(source.id)
        source.url = source_url
        db.session.commit()
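# Hedged usage sketch for import_csv above (not taken from the source): the
# url argument is a (csv_data_url, source_url) pair and args is an
# argparse-style namespace whose attributes are forwarded to CSVImporter.run().
# The example URLs and the Namespace are assumptions for illustration only.
#
#   from argparse import Namespace
#   args = Namespace(dry_run=True, max_lines=1000, max_errors=1000)
#   import_csv(dataset, ('file:///tmp/data.csv', 'http://example.com/data.csv'), args)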
def test_dimensions_edit_mask_with_data(self):
    cra = Dataset.by_name('cra')
    src = Source(cra, self.user, 'file:///dev/null')
    src.analysis = {'columns': ['amount', 'etc']}
    db.session.add(src)
    db.session.commit()
    response = self.app.get(url(controller='editor',
                                action='dimensions_edit',
                                dataset='cra'),
                            extra_environ={'REMOTE_USER': '******'})
    assert 'cannot edit dimensions' in response.body
    assert '"amount"' not in response.body
    assert 'Update' not in response.body
def model(datasetname):
    # if not sourcename then we are saving the defaults for dataset
    dataset = get_dataset(datasetname)
    if not dataset.source:
        # then create one
        dataset_source = Source.by_source_name(dataset.name)
        if not dataset_source:
            dataset_source = Source(name=dataset.name, dataset=dataset)
            db.session.add(dataset_source)
        else:
            dataset_source.dataset = dataset
        db.session.commit()
    # figure out what they need over there?
    return jsonify(dataset.source)
def create():
    """
    Take a JSON-format POST with label, name and description and create a
    private dataset to put sources in. Errors are returned as a JSON object.
    """
    if not require.dataset.create():
        return jsonify(
            {"errors": ["Can not create new dataset. Permission denied"]})

    try:
        dataset = api_form_data()
        if not dataset.get("dataorg", None):
            return jsonify(
                {"errors": ["You must select the data source organization"]})
        model = {'data': dataset}
        schema = dataset_schema(ValidationState(model))
        data = schema.deserialize(dataset)

        # should have a better place for slugify
        if data.get('name', None):
            tempname = slugify(str(data.get('name')), max_length=50)
        else:
            tempname = slugify(str(data.get('label')), max_length=50)

        if Dataset.by_name(tempname) is not None:
            return jsonify(
                {"errors": ["A dataset with this name already exists "]})

        dataset = Dataset(data=data)
        dataset.managers.append(current_user)
        db.session.add(dataset)

        # creating a new dataset so we have to create a source as well
        dataset_source = Source.by_source_name(dataset.name)
        if not dataset_source:
            dataset_source = Source(dataset=dataset, name=dataset.name)
            db.session.add(dataset_source)
        else:
            dataset_source.dataset = dataset

        db.session.commit()
        return jsonify({"success": True, "dataset": dataset.name})
    except Exception, e:
        ex_type, ex, tb = sys.exc_info()
        traceback.print_tb(tb)
        return jsonify({"errors": ['Unknown Error has occurred: ' + str(e)]})
def create(self):
    """ Adds a new dataset dynamically through a POST request """
    # User must be authenticated so we should have a user object in
    # c.account, if not abort with error message
    if not c.account:
        abort(status_code=400, detail='user not authenticated')

    # Check if the params are there ('metadata', 'csv_file')
    if len(request.params) != 2:
        abort(status_code=400, detail='incorrect number of params')

    metadata = request.params['metadata'] \
        if 'metadata' in request.params \
        else abort(status_code=400, detail='metadata is missing')

    csv_file = request.params['csv_file'] \
        if 'csv_file' in request.params \
        else abort(status_code=400, detail='csv_file is missing')

    # We proceed with the dataset
    try:
        model = json.load(urllib2.urlopen(metadata))
    except:
        abort(status_code=400, detail='JSON model could not be parsed')

    try:
        log.info("Validating model")
        model = validate_model(model)
    except Invalid as i:
        log.error("Errors occurred during model validation:")
        for field, error in i.asdict().items():
            log.error("%s: %s", field, error)
        abort(status_code=400, detail='Model is not well formed')

    dataset = Dataset.by_name(model['dataset']['name'])
    if dataset is None:
        dataset = Dataset(model)
        require.dataset.create()
        dataset.managers.append(c.account)
        dataset.private = True  # Default value
        db.session.add(dataset)
    else:
        require.dataset.update(dataset)

    log.info("Dataset: %s", dataset.name)
    source = Source(dataset=dataset, creator=c.account, url=csv_file)
    log.info(source)

    for source_ in dataset.sources:
        if source_.url == csv_file:
            source = source_
            break

    db.session.add(source)
    db.session.commit()

    # Send loading of source into celery queue
    load_source.delay(source.id)
    return to_jsonp(dataset_apply_links(dataset.as_dict()))
def _get_run(self, dataset, source, id):
    self._get_dataset(dataset)
    require.dataset.update(c.dataset)
    c.source = Source.by_id(source)
    if c.source is None or c.source.dataset != c.dataset:
        abort(404, _("There is no source '%s'") % source)
    c.run = Run.by_id(id)
    if c.run is None or c.run.source != c.source:
        abort(404, _("There is no run '%s'") % id)
def create():
    """
    Take a JSON-format POST with label, name and description and create a
    private dataset to put sources in. Errors are returned as a JSON object.
    """
    if not require.dataset.create():
        return jsonify({"errors":["Can not create new dataset. Permission denied"]})

    try:
        dataset = api_form_data()
        if not dataset.get("dataorg", None):
            return jsonify({"errors":["You must select the data source organization"]})
        model = {'data': dataset}
        schema = dataset_schema(ValidationState(model))
        data = schema.deserialize(dataset)

        # should have a better place for slugify
        if data.get('name', None):
            tempname = slugify(str(data.get('name')), max_length=50)
        else:
            tempname = slugify(str(data.get('label')), max_length=50)

        if Dataset.by_name(tempname) is not None:
            return jsonify({"errors":["A dataset with this name already exists "]})

        dataset = Dataset(data=data)
        dataset.managers.append(current_user)
        db.session.add(dataset)

        # creating a new dataset so we have to create a source as well
        dataset_source = Source.by_source_name(dataset.name)
        if not dataset_source:
            dataset_source = Source(dataset=dataset, name=dataset.name)
            db.session.add(dataset_source)
        else:
            dataset_source.dataset = dataset

        db.session.commit()
        return jsonify({"success":True, "dataset":dataset.name})
    except Exception, e:
        ex_type, ex, tb = sys.exc_info()
        traceback.print_tb(tb)
        return jsonify({"errors":['Unknown Error has occurred: ' + str(e)]})
def test_view_source(self):
    url_ = 'http://banana.com/split.csv'
    source = Source(self.dataset, self.user, url_)
    db.session.add(source)
    db.session.commit()
    response = self.app.get(url(controller='source', action='view',
                                dataset='cra', id=source.id),
                            extra_environ={'REMOTE_USER': '******'})
    assert response.headers['Location'] == url_, response.headers
def analyze_source(source_id):
    from openspending.model import Source, meta as db
    from openspending.importer.analysis import analyze_csv
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return
    log.info("Analyzing: %s", source.url)
    source.analysis = analyze_csv(source.url)
    if 'error' in source.analysis:
        log.error(source.analysis.get('error'))
    else:
        log.info("Columns: %r", source.analysis.get('columns'))
    db.session.commit()
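# analyze_source runs as a background task; elsewhere in this codebase it is
# queued through celery rather than called directly, for example:
#
#   analyze_source.apply_async(args=[source.id], countdown=2)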
def csvimport_fixture(name):
    model_fp = csvimport_fixture_file(name, 'model.json')
    mapping_fp = csvimport_fixture_file(name, 'mapping.json')
    model = json.load(model_fp)
    if mapping_fp:
        model['mapping'] = json.load(mapping_fp)
    dataset = Dataset(model)
    dataset.generate()
    db.session.add(dataset)
    data_path = csvimport_fixture_path(name, 'data.csv')
    user = make_account()
    source = Source(dataset, user, data_path)
    db.session.add(source)
    db.session.commit()
    return source
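# Hedged usage sketch (the fixture name 'example' is hypothetical): a test can
# build a loadable source straight from a fixture directory and feed it to the
# importer.
#
#   source = csvimport_fixture('example')
#   importer = CSVImporter(source)
#   importer.run(dry_run=True)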
def update(archive_dir, dataset=None):
    """ Download all sources into an archive directory. If the dataset
    parameter is provided, only sources for that dataset will be fetched
    (otherwise all sources in the database will be fetched) """

    # Create archive directory if it doesn't exist
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)

    # If a dataset is provided we limit to only its sources (else we take all)
    sources = Source.all() if dataset is None else dataset.sources

    # Update each source
    for source in sources:
        update_source(archive_dir, source)
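# Minimal invocation sketch (the archive path is an assumption, not from the
# source); update_source is expected to fetch each source.url into archive_dir.
#
#   update('/tmp/openspending-archive')           # archive every known source
#   update('/tmp/openspending-archive', dataset)  # only this dataset's sources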
def load_source(source_id, sample=False):
    from openspending.model import Source
    from openspending.importer import CSVImporter
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return
    if not source.loadable:
        log.error("Dataset has no mapping.")
        return
    source.dataset.generate()
    importer = CSVImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
    index_dataset.delay(source.dataset.name)
def load_source(source_id, sample=False):
    from openspending.model import Source
    from openspending.importer import CSVImporter
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return
    if not source.loadable:
        log.error("Dataset has no mapping.")
        return
    source.dataset.generate()
    importer = CSVImporter(source)
    if sample:
        importer.run(max_lines=1000, max_errors=1000)
    else:
        importer.run()
    index_dataset.delay(source.dataset.name)
def create(self, dataset):
    self._get_dataset(dataset)
    require.dataset.update(c.dataset)
    try:
        schema = source_schema()
        data = schema.deserialize(request.params)
        source = Source(c.dataset, c.account, data['url'])
        db.session.add(source)
        db.session.commit()
        analyze_source.apply_async(args=[source.id], countdown=2)
        h.flash_success(_("The source has been created."))
        redirect(h.url_for(controller='editor', action='index',
                           dataset=c.dataset.name))
    except Invalid as i:
        errors = i.asdict()
        errors = [(k[len('source.'):], v) for k, v in errors.items()]
        return self.new(dataset, dict(errors))
def output_json(**args):
    """ Output JSON data """
    outputfile = args.get('outputfile', None)
    if not outputfile or len(outputfile) != 1:
        print "You need to specify one and only one output file"
        return
    outputfile = outputfile[0]

    # need to load in this order for relations:
    #   metadataorg
    #   dataorg
    #   source
    #   sourcefile
    #   wrap up files
    #   dataset
    outputobj = []
    for metadataorg in MetadataOrg.all().all():
        outputobj.append(metadataorg.to_json_dump())

    for dataorg in DataOrg.all().all():
        outputobj.append(dataorg.to_json_dump())

    for source in Source.all().all():
        outputobj.append(source.to_json_dump())

    for sourcefile in SourceFile.all().all():
        outputobj.append(sourcefile.to_json_dump())

    for dataset in Dataset.all().all():
        outputobj.append(dataset.to_json_dump())

    with open(outputfile, 'wb') as f:
        json.dump(outputobj, f)

    print "success"
    print "written to ", outputfile
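# Hedged usage sketch: the length check above implies output_json expects its
# output file as a single-element list (the filename is an assumption).
#
#   output_json(outputfile=['dump.json'])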
if len(request.files) == 1:
    upload_source_path = sourcefiles.save(request.files['sourcefile'])
    sourcefile = SourceFile(rawfile=upload_source_path)
    db.session.add(sourcefile)
    if basesource:
        if basesource.rawfile:
            basesource.rawfile.delete()
        basesource.rawfile = sourcefile
        source = basesource
        source.reload_openrefine()
    else:
        source = Source(dataset=dataset, name=data['name'],
                        url=None, rawfile=sourcefile)
        db.session.add(source)
# handle file
elif data.get('url', None):
    if basesource:
        source = basesource
        source.name = data['name']
        source.url = data['url']
        source.reload_openrefine()
        # maybe reload the OpenRefine?
        # trigger reload
    else:
        source = Source(dataset=dataset, name=data['name'],
        orig_filepath = os.path.join(file_dir, filename)
        with codecs.open(orig_filepath, 'rb') as fh:
            wuezfile = FileStorage(stream=fh)
            # upload_source_path = sourcefiles.save(wuezfile, name=filename, folder=UPLOADED_FILES_DEST)
            upload_source_path = sourcefiles.save(wuezfile, name=filename)
            sourcefile = SourceFile(rawfile=upload_source_path)
            db.session.add(sourcefile)
    except Exception, e:
        print "!!!!!Error failed", e
        return (None, False)

    try:
        print sourcefile
        source = Source(dataset=dataset, name=dataset.name,
                        url=None, rawfile=sourcefile)
    except Exception, e:
        traceback.print_exc(e)
        print "Could not load source rawfile", e
        return (None, False)
else:
    try:
        source = Source(dataset=dataset, name=dataset.name,
                        url=sourcejson['fields']['webservice'],
                        rawfile=None)
    except Exception, e:
        print "Could not load source webservice", e
def get_source(sourcename):
    source = obj_or_404(Source.by_source_name(sourcename))
    # require.dataset.read(dataset)
    return source
    upload_source_path = sourcefiles.save(request.files['sourcefile'])
    sourcefile = SourceFile(rawfile=upload_source_path)
    db.session.add(sourcefile)
    oroperations = None
    if basesource:
        if basesource.rawfile:
            basesource.rawfile.delete()
        basesource.rawfile = sourcefile
        source = basesource
        oroperations = source.getORInstructions()
        source.reload_openrefine()
    else:
        source = Source(dataset=dataset, name=data['name'],
                        url=None, rawfile=sourcefile)
        db.session.add(source)
# handle file
elif data.get('url', None):
    if basesource:
        source = basesource
        source.name = data['name']
        source.url = data['url']
        oroperations = source.getORInstructions()
        source.reload_openrefine()
        # maybe reload the OpenRefine?
        # trigger reload
    else:
        source = Source(dataset=dataset,
def _get_source(self, dataset, id):
    self._get_dataset(dataset)
    c.source = Source.by_id(id)
    if c.source is None or c.source.dataset != c.dataset:
        abort(404, _("There is no source '%s'") % id)
def update(archive_dir):
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)
    for source in Source.all():
        update_source(archive_dir, source)
    upload_source_path = sourcefiles.save(request.files['sourcefile'])
    sourcefile = SourceFile(rawfile=upload_source_path)
    db.session.add(sourcefile)
    oroperations = None
    if basesource:
        if basesource.rawfile:
            basesource.rawfile.delete()
        basesource.rawfile = sourcefile
        source = basesource
        oroperations = source.getORInstructions()
        source.reload_openrefine()
    else:
        source = Source(dataset=dataset, name=data['name'],
                        url=None, rawfile=sourcefile)
        db.session.add(source)
# handle file
elif data.get('url', None):
    if basesource:
        source = basesource
        source.name = data['name']
        source.url = data['url']
        oroperations = source.getORInstructions()
        source.reload_openrefine()
        # maybe reload the OpenRefine?
        # trigger reload
    else:
        source = Source(dataset=dataset, name=data['name'],
                        url=data['url'])
        db.session.add(source)
if len(request.files) == 1:
    upload_source_path = sourcefiles.save(request.files['sourcefile'])
    sourcefile = SourceFile(rawfile=upload_source_path)
    db.session.add(sourcefile)
    if basesource:
        if basesource.rawfile:
            basesource.rawfile.delete()
        basesource.rawfile = sourcefile
        source = basesource
        source.reload_openrefine()
    else:
        source = Source(dataset=dataset, name=data['name'],
                        url=None, rawfile=sourcefile)
        db.session.add(source)
# handle file
elif data.get('url', None):
    if basesource:
        source = basesource
        source.name = data['name']
        source.url = data['url']
        source.reload_openrefine()
        # maybe reload the OpenRefine?
        # trigger reload
    else:
        source = Source(dataset=dataset, name=data['name'],
                        url=data['url'])
        db.session.add(source)
else:
    try:
        log.info("Validating model")
        model = validate_model(model)
    except Invalid, i:
        log.error("Errors occurred during model validation:")
        for field, error in i.asdict().items():
            log.error("%s: %s", field, error)
        return 1

    dataset = Dataset.by_name(model['dataset']['name'])
    if dataset is None:
        dataset = Dataset(model)
        db.session.add(dataset)
    log.info("Dataset: %s", dataset.name)

    source = Source(dataset, shell_account(), csv_data_url)
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break
    db.session.add(source)
    db.session.commit()

    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))
    return 0


def _csvimport(args):
    return csvimport(args.dataset_url, args)