Example #1
def import_csv(dataset, url, args):
    """
    Import the csv data into the dataset
    """

    csv_data_url, source_url = url 
    source = Source(dataset, shell_account(), 
                    csv_data_url)
    # Analyse the csv data and add it to the source
    # If we don't analyse it we'll be left with a weird message
    source.analysis = analyze_csv(csv_data_url)
    # Check to see if the dataset already has this source
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break
    db.session.add(source)
    db.session.commit()
    
    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))

    # Check if imported from the file system (source and data url differ)
    if csv_data_url != source_url:
        # If we did, then we must update the source url based on the
        # sources in the dataset model (so we need to fetch the source again
        # or else we'll add a new one)
        source = Source.by_id(source.id)
        source.url = source_url
        db.session.commit()
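
A minimal sketch of how this helper might be called, assuming the dataset already exists and that args is an argparse-style namespace whose attributes become keyword arguments for CSVImporter.run (the max_lines/max_errors flags mirror the importer.run() calls in the load_source examples further down):

import argparse

# Build an args namespace; max_lines/max_errors are taken from the
# importer.run() calls shown in the load_source examples below.
parser = argparse.ArgumentParser()
parser.add_argument('--max-lines', dest='max_lines', type=int, default=1000)
parser.add_argument('--max-errors', dest='max_errors', type=int, default=1000)
args = parser.parse_args([])

dataset = Dataset.by_name('cra')  # assumed: the dataset already exists
# url is a (csv_data_url, source_url) pair; both point at the same file here.
import_csv(dataset, ('file:///tmp/data.csv', 'file:///tmp/data.csv'), args)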
Example #2
def import_csv(dataset, url, args):
    """
    Import the csv data into the dataset
    """

    csv_data_url, source_url = url
    source = Source(dataset, shell_account(), csv_data_url)
    # Analyse the csv data and add it to the source
    # If we don't analyse it we'll be left with a weird message
    source.analysis = analyze_csv(csv_data_url)
    # Check to see if the dataset already has this source
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break
    db.session.add(source)
    db.session.commit()

    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))

    # Check if imported from the file system (source and data url differ)
    if csv_data_url != source_url:
        # If we did, then we must update the source url based on the
        # sources in the dataset model (so we need to fetch the source again
        # or else we'll add a new one)
        source = Source.by_id(source.id)
        source.url = source_url
        db.session.commit()
Example #3
 def test_dimensions_edit_mask_with_data(self):
     cra = Dataset.by_name('cra')
     src = Source(cra, self.user, 'file:///dev/null')
     src.analysis = {'columns': ['amount', 'etc']}
     db.session.add(src)
     db.session.commit()
     response = self.app.get(url(controller='editor',
                                 action='dimensions_edit', dataset='cra'),
                             extra_environ={'REMOTE_USER': '******'})
     assert 'cannot edit dimensions' in response.body
     assert '"amount"' not in response.body
     assert 'Update' not in response.body
Example #4
 def test_dimensions_edit_mask_with_data(self):
     cra = Dataset.by_name('cra')
     src = Source(cra, self.user, 'file:///dev/null')
     src.analysis = {'columns': ['amount', 'etc']}
     db.session.add(src)
     db.session.commit()
     response = self.app.get(url(controller='editor', 
         action='dimensions_edit', dataset='cra'),
         extra_environ={'REMOTE_USER': '******'})
     assert 'cannot edit dimensions' in response.body
     assert '"amount"' not in response.body
     assert 'Update' not in response.body
Example #5
def model(datasetname):
    # if no sourcename is given we are saving the defaults for the dataset
    
    dataset = get_dataset(datasetname)
    if not dataset.source:
        #then create one
        dataset_source = Source.by_source_name(dataset.name)
        if not dataset_source:
            dataset_source = Source(name=dataset.name, dataset=dataset)
            db.session.add(dataset_source)
        else:
            dataset_source.dataset = dataset
        db.session.commit()

        #figure out what they need over there?
    return jsonify(dataset.source)
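
A hedged sketch of exercising this view through Flask's test client; the app object and route path are assumptions, only the view function itself comes from the example:

# Assumed route registration, e.g.
# app.add_url_rule('/datasets/<datasetname>/model', view_func=model)
client = app.test_client()
resp = client.get('/datasets/cra/model')
print resp.status_code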
Example #6
def model(datasetname):
    # if no sourcename is given we are saving the defaults for the dataset

    dataset = get_dataset(datasetname)
    if not dataset.source:
        #then create one
        dataset_source = Source.by_source_name(dataset.name)
        if not dataset_source:
            dataset_source = Source(name=dataset.name, dataset=dataset)
            db.session.add(dataset_source)
        else:
            dataset_source.dataset = dataset
        db.session.commit()

        #figure out what they need over there?
    return jsonify(dataset.source)
Example #7
def create():
    """
    Take a JSON-format POST with label, name and description and create
    a private dataset to put sources in.
    json_errors returns errors as a JSON object.
    """

    if not require.dataset.create():
        return jsonify(
            {"errors": ["Can not create new dataset.  Permission denied"]})

    try:
        dataset = api_form_data()
        if not dataset.get("dataorg", None):
            return jsonify(
                {"errors": ["You must select the data source organization"]})
        model = {'data': dataset}
        schema = dataset_schema(ValidationState(model))
        data = schema.deserialize(dataset)

        # should have a better place for slugify
        if (data.get('name', None)):
            tempname = slugify(str(data.get('name')), max_length=50)
        else:
            tempname = slugify(str(data.get('label')), max_length=50)

        if Dataset.by_name(tempname) is not None:
            return jsonify(
                {"errors": ["A dataset with this name already exists "]})

        dataset = Dataset(data=data)
        dataset.managers.append(current_user)
        db.session.add(dataset)

        # creating a new dataset so we have to create a source as well
        dataset_source = Source.by_source_name(dataset.name)
        if not dataset_source:
            dataset_source = Source(dataset=dataset, name=dataset.name)
            db.session.add(dataset_source)
        else:
            dataset_source.dataset = dataset
        db.session.commit()
        return jsonify({"success": True, "dataset": dataset.name})
    except Exception as e:
        ex_type, ex, tb = sys.exc_info()
        traceback.print_tb(tb)
        return jsonify({"errors": ['Unknown error has occurred: ' + str(e)]})
Example #8
    def create(self):
        """
        Adds a new dataset dynamically through a POST request
        """

        # User must be authenticated so we should have a user object in
        # c.account, if not abort with error message
        if not c.account:
            abort(status_code=400, detail='user not authenticated')

        # Check if the params are there ('metadata', 'csv_file')
        if len(request.params) != 2:
            abort(status_code=400, detail='incorrect number of params')

        metadata = request.params['metadata'] \
            if 'metadata' in request.params \
            else abort(status_code=400, detail='metadata is missing')

        csv_file = request.params['csv_file'] \
            if 'csv_file' in request.params \
            else abort(status_code=400, detail='csv_file is missing')

        # We proceed with the dataset
        try:
            model = json.load(urllib2.urlopen(metadata))
        except Exception:
            abort(status_code=400, detail='JSON model could not be parsed')
        try:
            log.info("Validating model")
            model = validate_model(model)
        except Invalid as i:
            log.error("Errors occured during model validation:")
            for field, error in i.asdict().items():
                log.error("%s: %s", field, error)
            abort(status_code=400, detail='Model is not well formed')
        dataset = Dataset.by_name(model['dataset']['name'])
        if dataset is None:
            dataset = Dataset(model)
            require.dataset.create()
            dataset.managers.append(c.account)
            dataset.private = True  # Default value
            db.session.add(dataset)
        else:
            require.dataset.update(dataset)

        log.info("Dataset: %s", dataset.name)
        source = Source(dataset=dataset, creator=c.account, url=csv_file)

        log.info(source)
        for source_ in dataset.sources:
            if source_.url == csv_file:
                source = source_
                break
        db.session.add(source)
        db.session.commit()

        # Send loading of source into celery queue
        load_source.delay(source.id)
        return to_jsonp(dataset_apply_links(dataset.as_dict()))
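
For context, a client could drive this endpoint with a two-field POST: a metadata URL pointing at the JSON model and a csv_file URL. The endpoint path below is an assumption since the route itself is not part of the snippet, and authentication is omitted:

import requests

# Hypothetical endpoint path; only the 'metadata' and 'csv_file' params
# come from the controller above.
response = requests.post(
    'http://localhost:5000/api/2/new',
    data={'metadata': 'http://example.org/model.json',
          'csv_file': 'http://example.org/data.csv'})
print response.status_code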
Example #9
 def _get_run(self, dataset, source, id):
     self._get_dataset(dataset)
     require.dataset.update(c.dataset)
     c.source = Source.by_id(source)
     if c.source is None or c.source.dataset != c.dataset:
         abort(404, _("There is no source '%s'") % source)
     c.run = Run.by_id(id)
     if c.run is None or c.run.source != c.source:
         abort(404, _("There is no run '%s'") % id)
Example #10
def create():
    """
    Take a JSON-format POST with label, name and description and create
    a private dataset to put sources in.
    json_errors returns errors as a JSON object.
    """

    if not require.dataset.create():
        return jsonify({"errors":["Can not create new dataset.  Permission denied"]})

    try:
        dataset = api_form_data()
        if not dataset.get("dataorg", None):
            return jsonify({"errors":["You must select the data source organization"]}) 
        model = {'data': dataset}
        schema = dataset_schema(ValidationState(model))
        data = schema.deserialize(dataset)

        # should have a better place for slugify
        if (data.get('name', None)):
            tempname = slugify(str(data.get('name')), max_length=50)
        else:
            tempname = slugify(str(data.get('label')), max_length=50)

        if Dataset.by_name(tempname) is not None:
            return jsonify({"errors":["A dataset with this name already exists "]})

        dataset = Dataset(data=data)
        dataset.managers.append(current_user)
        db.session.add(dataset)
        
        # creating a new dataset so we have to create a source as well
        dataset_source = Source.by_source_name(dataset.name)
        if not dataset_source:
            dataset_source = Source(dataset=dataset, name=dataset.name)
            db.session.add(dataset_source)
        else:
            dataset_source.dataset = dataset
        db.session.commit()
        return jsonify({"success":True, "dataset":dataset.name})
    except Exception as e:
        ex_type, ex, tb = sys.exc_info()
        traceback.print_tb(tb)
        return jsonify({"errors": ['Unknown error has occurred: ' + str(e)]})
Example #11
 def test_view_source(self):
     url_ = 'http://banana.com/split.csv'
     source = Source(self.dataset, self.user, url_)
     db.session.add(source)
     db.session.commit()
     response = self.app.get(url(controller='source',
                                 action='view',
                                 dataset='cra',
                                 id=source.id),
                             extra_environ={'REMOTE_USER': '******'})
     assert response.headers['Location'] == url_, response.headers
Example #12
def analyze_source(source_id):
    from openspending.model import Source, meta as db
    from openspending.importer.analysis import analyze_csv
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return
    log.info("Analyzing: %s", source.url)
    source.analysis = analyze_csv(source.url)
    if 'error' in source.analysis:
        log.error(source.analysis.get('error'))
    else:
        log.info("Columns: %r", source.analysis.get('columns'))
    db.session.commit()
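
One of the later examples dispatches this task through Celery with apply_async instead of calling it inline; a minimal sketch of both call styles:

source = Source.by_id(1)  # assumed: a source with this id exists

# Synchronous call, e.g. from a shell or a test.
analyze_source(source.id)

# Deferred call via the Celery queue, as the source controller example does.
analyze_source.apply_async(args=[source.id], countdown=2)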
Example #14
def csvimport_fixture(name):
    model_fp = csvimport_fixture_file(name, 'model.json')
    mapping_fp = csvimport_fixture_file(name, 'mapping.json')
    model = json.load(model_fp)
    if mapping_fp:
        model['mapping'] = json.load(mapping_fp)
    dataset = Dataset(model)
    dataset.generate()
    db.session.add(dataset)
    data_path = csvimport_fixture_path(name, 'data.csv')
    user = make_account()
    source = Source(dataset, user, data_path)
    db.session.add(source)
    db.session.commit()
    return source
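
A sketch of how this fixture helper might be used in a test; the fixture name 'cra' is an assumption, and the run() keyword arguments mirror the load_source examples:

def test_csvimport_fixture_loads():
    # csvimport_fixture() returns a committed Source whose dataset
    # has already been generated.
    source = csvimport_fixture('cra')
    importer = CSVImporter(source)
    importer.run(dry_run=True, max_lines=100, max_errors=10)
    assert Dataset.by_name(source.dataset.name) is not None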
Example #15
def update(archive_dir, dataset=None):
    """
    Download all sources into an archive directory. If the dataset
    parameter is provided, only sources for that dataset will be fetched
    (otherwise all sources in the database will be fetched).
    """

    # Create archive directory if it doesn't exist
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)

    # If a dataset is provided we limit to only its sources (else we take all)
    sources = Source.all() if dataset is None else dataset.sources

    # Update each source
    for source in sources:
        update_source(archive_dir, source)
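
For example, limiting an archive run to a single dataset could look like this (the archive path is arbitrary):

# Archive every source in the database.
update('/var/lib/openspending/archive')

# Archive only the sources belonging to one dataset.
cra = Dataset.by_name('cra')
update('/var/lib/openspending/archive', dataset=cra)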
Example #17
def load_source(source_id, sample=False):
    from openspending.model import Source
    from openspending.importer import CSVImporter
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return

    if not source.loadable:
        log.error("Dataset has no mapping.")
        return

    source.dataset.generate()
    importer = CSVImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
        index_dataset.delay(source.dataset.name)
Example #18
def load_source(source_id, sample=False):
    from openspending.model import Source
    from openspending.importer import CSVImporter
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return

    if not source.loadable:
        log.error("Dataset has no mapping.")
        return

    source.dataset.generate()
    importer = CSVImporter(source)
    if sample:
        importer.run(max_lines=1000, max_errors=1000)
    else:
        importer.run()
    index_dataset.delay(source.dataset.name)
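
As with analyze_source, the dataset API controller earlier in this listing queues this task with load_source.delay after committing the source; a minimal sketch assuming the task is registered with Celery and that a source with id 1 exists:

# Run a dry-run sample synchronously.
load_source(1, sample=True)

# Or queue the full load, as the create() controller does after
# committing the source.
load_source.delay(1)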
Example #19
 def create(self, dataset):
     self._get_dataset(dataset)
     require.dataset.update(c.dataset)
     try:
         schema = source_schema()
         data = schema.deserialize(request.params)
         source = Source(c.dataset, c.account, data['url'])
         db.session.add(source)
         db.session.commit()
         analyze_source.apply_async(args=[source.id], countdown=2)
         h.flash_success(_("The source has been created."))
         redirect(
             h.url_for(controller='editor',
                       action='index',
                       dataset=c.dataset.name))
     except Invalid as i:
         errors = i.asdict()
         errors = [(k[len('source.'):], v) for k, v in errors.items()]
         return self.new(dataset, dict(errors))
Example #20
    def output_json(**args):
        """ Output JSON data  """
        outputfile = args.get('outputfile', None)

        if not outputfile or len(outputfile) != 1:
            print "You need to specify one and only one output file"
            return

        outputfile = outputfile[0]

        # need to load in this order for relations:
        #   metadataorg
        #   dataorg
        #   source
        #   sourcefile
        #     wrap up files
        #   dataset

        outputobj = []

        for metadataorg in MetadataOrg.all().all():
            outputobj.append(metadataorg.to_json_dump())

        for dataorg in DataOrg.all().all():
            outputobj.append(dataorg.to_json_dump())

        for source in Source.all().all():
            outputobj.append(source.to_json_dump())

        for sourcefile in SourceFile.all().all():
            outputobj.append(sourcefile.to_json_dump())

        for dataset in Dataset.all().all():
            outputobj.append(dataset.to_json_dump())

        with open(outputfile, 'wb') as f:
            json.dump(outputobj, f)

        print "success"
        print "written to ", outputfile
Example #21
    def output_json(**args):
        """ Output JSON data  """
        outputfile = args.get('outputfile', None)

        if not outputfile or len(outputfile) != 1:
            print "You need to specify one and only one output file"
            return

        outputfile = outputfile[0]

        #need to load in this order for relations
        #metadataorg
        #dataorg
        #source
        #sourcefile
        #wrap up files
        #dataset

        outputobj = []

        for metadataorg in MetadataOrg.all().all():
            outputobj.append(metadataorg.to_json_dump())

        for dataorg in DataOrg.all().all():
            outputobj.append(dataorg.to_json_dump())

        for source in Source.all().all():
            outputobj.append(source.to_json_dump())

        for sourcefile in SourceFile.all().all():
            outputobj.append(sourcefile.to_json_dump())

        for dataset in Dataset.all().all():
            outputobj.append(dataset.to_json_dump())

        with open(outputfile, 'wb') as f:
            json.dump(outputobj, f)

        print "success"
        print "written to ", outputfile
Example #22
    if len(request.files) == 1:
        upload_source_path = sourcefiles.save(request.files['sourcefile'])

        sourcefile = SourceFile(rawfile=upload_source_path)
        db.session.add(sourcefile)

        if basesource:
            if basesource.rawfile:
                basesource.rawfile.delete()
            basesource.rawfile = sourcefile
            source = basesource
            source.reload_openrefine()
        else:
            source = Source(dataset=dataset,
                            name=data['name'],
                            url=None,
                            rawfile=sourcefile)
            db.session.add(source)

        #handle file
    elif data.get('url', None):
        if basesource:
            source = basesource
            source.name = data['name']
            source.url = data['url']
            source.reload_openrefine()
            #maybe reload the OpenRefine?
            #trigger reload
        else:
            source = Source(dataset=dataset,
                            name=data['name'],
                            url=data['url'])
            db.session.add(source)
Example #23
            orig_filepath = os.path.join(file_dir, filename)

            with codecs.open(orig_filepath, 'rb') as fh:
                wuezfile = FileStorage(stream=fh)
                #upload_source_path = sourcefiles.save(wuezfile, name=filename, folder=UPLOADED_FILES_DEST)
                upload_source_path = sourcefiles.save(wuezfile, name=filename)
                sourcefile = SourceFile(rawfile=upload_source_path)
                db.session.add(sourcefile)
        except Exception as e:
            print "Error: could not save the uploaded source file", e
            return (None, False)
        try:
            print sourcefile
            source = Source(dataset=dataset,
                            name=dataset.name,
                            url=None,
                            rawfile=sourcefile)
        except Exception as e:
            traceback.print_exc()
            print "Could not load source rawfile", e
            return (None, False)

    else:
        try:
            source = Source(dataset=dataset,
                            name=dataset.name,
                            url=sourcejson['fields']['webservice'],
                            rawfile=None)

        except Exception as e:
            print "Could not load source webservice", e
Example #24
def get_source(sourcename):
    source = obj_or_404(Source.by_source_name(sourcename))
    #require.dataset.read(dataset)
    return source
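
A sketch of how this helper might back a view, mirroring the jsonify pattern from the model() example above; the blueprint and route path are assumptions:

@blueprint.route('/sources/<sourcename>')
def view_source(sourcename):
    # get_source() aborts with 404 when the source does not exist.
    # 'blueprint' and the route path are assumed, not from the snippet.
    source = get_source(sourcename)
    return jsonify(source)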
Example #25
        upload_source_path = sourcefiles.save(request.files['sourcefile'])

        sourcefile = SourceFile(rawfile=upload_source_path)
        db.session.add(sourcefile)
        oroperations = None

        if basesource:
            if basesource.rawfile:
                basesource.rawfile.delete()
            basesource.rawfile = sourcefile
            source = basesource
            oroperations = source.getORInstructions()
            source.reload_openrefine()
        else:
            source = Source(dataset=dataset,
                            name=data['name'],
                            url=None,
                            rawfile=sourcefile)
            db.session.add(source)

        #handle file
    elif data.get('url', None):
        if basesource:
            source = basesource
            source.name = data['name']
            source.url = data['url']
            oroperations = source.getORInstructions()
            source.reload_openrefine()
            #maybe reload the OpenRefine?
            #trigger reload
        else:
            source = Source(dataset=dataset,
                            name=data['name'],
                            url=data['url'])
            db.session.add(source)
Example #26
 def _get_source(self, dataset, id):
     self._get_dataset(dataset)
     c.source = Source.by_id(id)
     if c.source is None or c.source.dataset != c.dataset:
         abort(404, _("There is no source '%s'") % id)
Example #27
def update(archive_dir):
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)
    for source in Source.all():
        update_source(archive_dir, source)
Example #29
        upload_source_path = sourcefiles.save(request.files['sourcefile'])

        sourcefile = SourceFile(rawfile=upload_source_path)
        db.session.add(sourcefile)
        oroperations = None

        if basesource:
            if basesource.rawfile:
                basesource.rawfile.delete()
            basesource.rawfile = sourcefile
            source = basesource
            oroperations = source.getORInstructions()
            source.reload_openrefine()
        else:
            source = Source(dataset=dataset, name=data['name'], url=None, rawfile=sourcefile)
            db.session.add(source)

        #handle file
    elif data.get('url', None):
        if basesource:
            source = basesource
            source.name = data['name']
            source.url = data['url']
            oroperations = source.getORInstructions()
            source.reload_openrefine()
            #maybe reload the OpenRefine?
            #trigger reload
        else:
            source = Source(dataset=dataset, name=data['name'], url=data['url'])
            db.session.add(source)
Example #30
    if len(request.files) == 1:
        upload_source_path = sourcefiles.save(request.files['sourcefile'])

        sourcefile = SourceFile(rawfile=upload_source_path)
        db.session.add(sourcefile)

        if basesource:
            if basesource.rawfile:
                basesource.rawfile.delete()
            basesource.rawfile = sourcefile
            source = basesource
            source.reload_openrefine()
        else:
            source = Source(dataset=dataset, name=data['name'], url=None, rawfile=sourcefile)
            db.session.add(source)

        #handle file
    elif data.get('url', None):
        if basesource:
            source = basesource
            source.name = data['name']
            source.url = data['url']
            source.reload_openrefine()
            #maybe reload the OpenRefine?
            #trigger reload
        else:
            source = Source(dataset=dataset, name=data['name'], url=data['url'])
            db.session.add(source)
    else:
Example #32
    try:
        log.info("Validating model")
        model = validate_model(model)
    except Invalid as i:
        log.error("Errors occurred during model validation:")
        for field, error in i.asdict().items():
            log.error("%s: %s", field, error)
        return 1

    dataset = Dataset.by_name(model['dataset']['name'])
    if dataset is None:
        dataset = Dataset(model)
        db.session.add(dataset)
    log.info("Dataset: %s", dataset.name)

    source = Source(dataset, shell_account(), csv_data_url)
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break
    db.session.add(source)
    db.session.commit()

    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))
    return 0


def _csvimport(args):
    return csvimport(args.dataset_url, args)