Example #1
def import_csv(dataset, url, args):
    """
    Import the csv data into the dataset
    """

    csv_data_url, source_url = url
    source = Source(dataset, shell_account(), csv_data_url)
    # Analyse the csv data and add it to the source
    # If we don't analyse it we'll be left with a weird message
    source.analysis = analyze_csv(csv_data_url)
    # Check to see if the dataset already has this source
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break
    db.session.add(source)
    db.session.commit()

    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))

    # Check if imported from the file system (source and data url differ)
    if csv_data_url != source_url:
        # If we did, then we must update the source url based on the
        # sources in the dataset model (so we need to fetch the source again
        # or else we'll add a new one)
        source = Source.by_id(source.id)
        source.url = source_url
        db.session.commit()
Example #2
def import_csv(dataset, url, args):
    """
    Import the csv data into the dataset
    """

    csv_data_url, source_url = url
    source = Source(dataset, shell_account(),
                    csv_data_url)
    # Analyse the csv data and add it to the source
    # If we don't analyse it we'll be left with a weird message
    source.analysis = analyze_csv(csv_data_url)
    # Check to see if the dataset already has this source
    for source_ in dataset.sources:
        if source_.url == csv_data_url:
            source = source_
            break
    db.session.add(source)
    db.session.commit()

    dataset.generate()
    importer = CSVImporter(source)
    importer.run(**vars(args))

    # Check if imported from the file system (source and data url differ)
    if csv_data_url != source_url:
        # If we did, then we must update the source url based on the
        # sources in the dataset model (so we need to fetch the source again
        # or else we'll add a new one)
        source = Source.by_id(source.id)
        source.url = source_url
        db.session.commit()
Example #3
 def test_dimensions_edit_mask_with_data(self):
     cra = Dataset.by_name('cra')
     src = Source(cra, self.user, 'file:///dev/null')
     src.analysis = {'columns': ['amount', 'etc']}
     db.session.add(src)
     db.session.commit()
     response = self.app.get(url(controller='editor',
                                 action='dimensions_edit', dataset='cra'),
                             extra_environ={'REMOTE_USER': '******'})
     assert 'cannot edit dimensions' in response.body
     assert '"amount"' not in response.body
     assert 'Update' not in response.body
Example #4
 def test_dimensions_edit_mask_with_data(self):
     cra = Dataset.by_name('cra')
     src = Source(cra, self.user, 'file:///dev/null')
     src.analysis = {'columns': ['amount', 'etc']}
     db.session.add(src)
     db.session.commit()
     response = self.app.get(url(controller='editor',
                                 action='dimensions_edit',
                                 dataset='cra'),
                             extra_environ={'REMOTE_USER': '******'})
     assert 'cannot edit dimensions' in response.body
     assert '"amount"' not in response.body
     assert 'Update' not in response.body
Example #5
    def create(self):
        """
        Adds a new dataset dynamically through a POST request
        """

        # User must be authenticated so we should have a user object in
        # c.account, if not abort with error message
        if not c.account:
            abort(status_code=400, detail='user not authenticated')

        # Check if the params are there ('metadata', 'csv_file')
        if len(request.params) != 2:
            abort(status_code=400, detail='incorrect number of params')

        metadata = request.params['metadata'] \
            if 'metadata' in request.params \
            else abort(status_code=400, detail='metadata is missing')

        csv_file = request.params['csv_file'] \
            if 'csv_file' in request.params \
            else abort(status_code=400, detail='csv_file is missing')

        # We proceed with the dataset
        try:
            model = json.load(urllib2.urlopen(metadata))
        except:
            abort(status_code=400, detail='JSON model could not be parsed')
        try:
            log.info("Validating model")
            model = validate_model(model)
        except Invalid as i:
            log.error("Errors occured during model validation:")
            for field, error in i.asdict().items():
                log.error("%s: %s", field, error)
            abort(status_code=400, detail='Model is not well formed')
        dataset = Dataset.by_name(model['dataset']['name'])
        if dataset is None:
            dataset = Dataset(model)
            require.dataset.create()
            dataset.managers.append(c.account)
            dataset.private = True  # Default value
            db.session.add(dataset)
        else:
            require.dataset.update(dataset)

        log.info("Dataset: %s", dataset.name)
        source = Source(dataset=dataset, creator=c.account, url=csv_file)

        log.info(source)
        for source_ in dataset.sources:
            if source_.url == csv_file:
                source = source_
                break
        db.session.add(source)
        db.session.commit()

        # Send loading of source into celery queue
        load_source.delay(source.id)
        return to_jsonp(dataset_apply_links(dataset.as_dict()))
Example #6
def get_run(dataset, source, id):
    dataset = get_dataset(dataset)
    source = obj_or_404(Source.by_id(source))
    if source.dataset != dataset:
        raise BadRequest("There was no source")
    run = obj_or_404(Run.by_id(id))
    if run.source != source:
        raise BadRequest("There is no run %s" % str(id))
    return dataset, source, run
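For context, the obj_or_404 helper used here is assumed to behave roughly like the sketch below (a hypothetical reconstruction, not the project's actual helper): it passes the looked-up object through, or aborts with a 404 when the query came back empty.

from werkzeug.exceptions import NotFound

def obj_or_404(obj):
    # Return the object unchanged, or raise a 404 if the lookup found nothing.
    if obj is None:
        raise NotFound()
    return obj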
Example #7
 def _get_run(self, dataset, source, id):
     self._get_dataset(dataset)
     require.dataset.update(c.dataset)
     c.source = Source.by_id(source)
     if c.source is None or c.source.dataset != c.dataset:
         abort(404, _("There is no source '%s'") % source)
     c.run = Run.by_id(id)
     if c.run is None or c.run.source != c.source:
         abort(404, _("There is no run '%s'") % id)
Example #8
 def _get_run(self, dataset, source, id):
     self._get_dataset(dataset)
     require.dataset.update(c.dataset)
     c.source = Source.by_id(source)
     if c.source is None or c.source.dataset != c.dataset:
         abort(404, _("There is no source '%s'") % source)
     c.run = Run.by_id(id)
     if c.run is None or c.run.source != c.source:
         abort(404, _("There is no run '%s'") % id)
Example #9
def get_run(dataset, source, id):
    dataset = get_dataset(dataset)
    require.dataset.update(dataset)
    source = obj_or_404(Source.by_id(source))
    if source.dataset != dataset:
        raise BadRequest("There was no source")
    run = obj_or_404(Run.by_id(id))
    if run.source != source:
        raise BadRequest("There is no run '" + str(id) + '")
    return dataset, source, run
Example #10
 def test_view_source(self):
     url_ = 'http://banana.com/split.csv'
     source = Source(self.dataset, self.user, url_)
     db.session.add(source)
     db.session.commit()
     response = self.app.get(url(controller='source',
                                 action='view',
                                 dataset='cra',
                                 id=source.id),
                             extra_environ={'REMOTE_USER': '******'})
     assert response.headers['Location'] == url_, response.headers
Example #11
def check_column(source_id, columnkey, columnvalue):
    with flask_app.app_context():
        source = Source.by_id(source_id)
        sourcerefine = source.get_or_create_ORProject()
        #should cache this at some point
        sourcefile_export = sourcerefine.refineproj.export()
        #remove BOM from the source file
        s = sourcefile_export.read()
        u = s.decode("utf-8-sig")
        sourcefile = io.BytesIO()
        sourcefile.write(u.encode("utf-8"))  # re-encode explicitly; str() would fail on non-ASCII text in Python 2
        sourcefile_csv = csv.DictReader(sourcefile, delimiter="\t")

        arrayset = []
        for row in sourcefile_csv:
            print row[columnvalue]
            arrayset.append(row[columnvalue])

        sourcefile.close()

        returnval = {"errors": [], "message": "There was an unexpected error"}

        if columnkey == "country_level0":
            temp_geom_countries = db.session.query("country").from_statement(
                text(
                    "SELECT geometry__country_level0.label as country FROM public.geometry__country_level0 "
                )).all()
            geom_countries = [y for x in temp_geom_countries for y in x]
            temp_geom_countries = None

            returnval['message'] = "The following countries were not found:"

            for country in arrayset:
                #there is probably a better method that takes advantage of a sorted list
                if country not in geom_countries:
                    #log as error
                    returnval['errors'].append(country)

        elif columnkey == "time":
            returnval['message'] = "Could not parse the following dates:"
            for date_col in arrayset:
                try:
                    parse(date_col)
                except Exception, e:
                    returnval['errors'].append(date_col)

        elif columnkey == "indicatorvalue":
            returnval['message'] = "Could not parse the following values: "
            for val_col in arrayset:
                try:
                    float(val_col)
                except:
                    returnval['errors'].append(val_col)
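The BOM handling above can be exercised on its own. A minimal, self-contained sketch (Python 2, with made-up sample data): decode with "utf-8-sig" so the byte-order mark is dropped, re-encode, and hand the bytes to csv.DictReader.

import csv
import io

raw = b"\xef\xbb\xbfcountry_level0\tindicatorvalue\nAlbania\t3.2\n"  # sample TSV with a UTF-8 BOM
text = raw.decode("utf-8-sig")           # the BOM is stripped during decoding
buf = io.BytesIO(text.encode("utf-8"))   # csv.DictReader expects bytes on Python 2
for row in csv.DictReader(buf, delimiter="\t"):
    print row["country_level0"], row["indicatorvalue"]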
Example #12
    def load_with_model_and_csv(self, metadata, csv_file, private):
        """
        Load a dataset using a metadata model file and a csv file
        """

        if metadata is None:
            response.status = 400
            return to_jsonp({'errors': 'metadata is missing'})

        if csv_file is None:
            response.status = 400
            return to_jsonp({'errors': 'csv_file is missing'})

        # We proceed with the dataset
        try:
            model = json.load(urllib2.urlopen(metadata))
        except:
            response.status = 400
            return to_jsonp({'errors': 'JSON model could not be parsed'})
        try:
            log.info("Validating model")
            model = validate_model(model)
        except Invalid as i:
            log.error("Errors occured during model validation:")
            for field, error in i.asdict().items():
                log.error("%s: %s", field, error)
            response.status = 400
            return to_jsonp({'errors': 'Model is not well formed'})
        dataset = Dataset.by_name(model['dataset']['name'])
        if dataset is None:
            dataset = Dataset(model)
            require.dataset.create()
            dataset.managers.append(c.account)
            dataset.private = private
            db.session.add(dataset)
        else:
            require.dataset.update(dataset)

        log.info("Dataset: %s", dataset.name)
        source = Source(dataset=dataset, creator=c.account, url=csv_file)

        log.info(source)
        for source_ in dataset.sources:
            if source_.url == csv_file:
                source = source_
                break
        db.session.add(source)
        db.session.commit()

        # Send loading of source into celery queue
        load_source.delay(source.id)
        return to_jsonp(dataset_apply_links(dataset.as_dict()))
Example #13
def csvimport_fixture(name):
    model_fp = csvimport_fixture_file(name, 'model.json')
    mapping_fp = csvimport_fixture_file(name, 'mapping.json')
    model = json.load(model_fp)
    if mapping_fp:
        model['mapping'] = json.load(mapping_fp)
    dataset = Dataset(model)
    dataset.generate()
    db.session.add(dataset)
    data_path = csvimport_fixture_path(name, 'data.csv')
    user = make_account()
    source = Source(dataset, user, data_path)
    db.session.add(source)
    db.session.commit()
    return source
Example #14
def analyze_source(source_id):
    from openspending.model import meta as db
    from openspending.model.source import Source
    from openspending.importer.analysis import analyze_csv
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return
    log.info("Analyzing: %s", source.url)
    source.analysis = analyze_csv(source.url)
    if 'error' in source.analysis:
        log.error(source.analysis.get('error'))
    else:
        log.info("Columns: %r", source.analysis.get('columns'))
    db.session.commit()
Example #15
def check_column(source_id, columnkey, columnvalue):
    # with flask_app.app_context():
    source = Source.by_id(source_id)
    sourcerefine = source.get_or_create_ORProject()
    # should cache this at some point
    sourcefile_export = sourcerefine.refineproj.export()
    # remove BOM from the source file
    s = sourcefile_export.read()
    u = s.decode("utf-8-sig")
    sourcefile = io.BytesIO()
    sourcefile.write(u.encode("utf-8"))  # re-encode explicitly; str() would fail on non-ASCII text in Python 2
    sourcefile_csv = csv.DictReader(sourcefile, delimiter="\t")

    arrayset = []
    for row in sourcefile_csv:
        print row[columnvalue]
        arrayset.append(row[columnvalue])

    sourcefile.close()

    returnval = {"errors": [], "message": "There was an unexpected error"}

    if columnkey == "country_level0":
        temp_geom_countries = (
            db.session.query("country")
            .from_statement(
                text("SELECT geometry__country_level0.label as country FROM public.geometry__country_level0 ")
            )
            .all()
        )
        geom_countries = [y for x in temp_geom_countries for y in x]
        temp_geom_countries = None

        returnval["message"] = "The following countries were not found:"

        for country in arrayset:
            # there is probably a better method that takes advantage of a sorted list
            if country not in geom_countries:
                # log as error
                returnval["errors"].append(country)

    elif columnkey == "time":
        returnval["message"] = "Could not parse the following dates:"
        for date_col in arrayset:
            try:
                parse(date_col)
            except Exception, e:
                returnval["errors"].append(date_col)
Example #16
def update(archive_dir, dataset=None):
    """
    Download all sources into an archive directory. If the dataset parameter
    is provided, only sources for that dataset will be fetched (otherwise
    all sources in the database will be fetched).
    """

    # Create archive directory if it doesn't exist
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)

    # If a dataset is provided we limit to only its sources (else we take all)
    sources = Source.all() if dataset is None else dataset.sources

    # Update each source
    for source in sources:
        update_source(archive_dir, source)
Example #17
def update(archive_dir, dataset=None):
    """
    Download all sources into an archive directory. If the dataset parameter
    is provided, only sources for that dataset will be fetched (otherwise
    all sources in the database will be fetched).
    """

    # Create archive directory if it doesn't exist
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)

    # If a dataset is provided we limit to only its sources (else we take all)
    sources = Source.all() if dataset is None else dataset.sources

    # Update each source
    for source in sources:
        update_source(archive_dir, source)
Example #18
def load_source(source_id, sample=False):
    from openspending.model.source import Source
    from openspending.importer import CSVImporter
    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return

    if not source.loadable:
        log.error("Dataset has no mapping.")
        return

    source.dataset.generate()
    importer = CSVImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
        index_dataset.delay(source.dataset.name)
Example #19
 def create(self, dataset):
     self._get_dataset(dataset)
     require.dataset.update(c.dataset)
     try:
         schema = source_schema()
         data = schema.deserialize(request.params)
         source = Source(c.dataset, c.account, data['url'])
         db.session.add(source)
         db.session.commit()
         analyze_source.apply_async(args=[source.id], countdown=2)
         h.flash_success(_("The source has been created."))
         redirect(
             h.url_for(controller='editor',
                       action='index',
                       dataset=c.dataset.name))
     except Invalid as i:
         errors = i.asdict()
         errors = [(k[len('source.'):], v) for k, v in errors.items()]
         return self.new(dataset, dict(errors))
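source_schema() and the Invalid / asdict() handling above match the colander validation API, which the project's schemas appear to be built on; a standalone sketch of the same pattern (the schema below is illustrative, not the real source_schema) might look like:

import colander

class SourceSchema(colander.MappingSchema):
    url = colander.SchemaNode(colander.String(), validator=colander.url)

try:
    data = SourceSchema().deserialize({'url': 'not-a-valid-url'})
except colander.Invalid as i:
    print i.asdict()   # roughly {'url': 'Must be a URL'}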
Example #20
def load_source(source_id, sample=False):
    with flask_app.app_context():
        source = Source.by_id(source_id)
        if not source:
            return log.error("No such source: %s", source_id)

        if not source.dataset.mapping:
            return log.error("Dataset has no mapping.")

        # we should drop this first to make sure everything loads correctly
        source.model.drop()

        source.model.generate()

        importer = ORImporter(source)
        if sample:
            importer.run(dry_run=True, max_lines=1000, max_errors=1000)
        else:
            importer.run()
Example #21
def load_source(source_id, sample=False):
    # with flask_app.app_context():
    source = Source.by_id(source_id)
    if not source:
        return log.error("No such source: %s", source_id)

    if not source.dataset.mapping:
        return log.error("Dataset has no mapping.")

    # we should drop this first to make sure everything loads correctly
    source.model.drop()

    source.model.generate()

    importer = ORImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
Example #22
def load_budgetdatapackage(source_id, sample=False):
    """
    Same as the CSV importer except that it uses the BudgetDataPackage
    importer instead of the CSVImporter
    """
    from openspending.model.source import Source
    from openspending.importer import BudgetDataPackageImporter

    source = Source.by_id(source_id)
    if not source:
        log.error("No such source: %s", source_id)
        return

    if not source.loadable:
        log.error("Dataset has no mapping.")
        return

    source.dataset.generate()
    importer = BudgetDataPackageImporter(source)
    if sample:
        importer.run(dry_run=True, max_lines=1000, max_errors=1000)
    else:
        importer.run()
        index_dataset.delay(source.dataset.name)
Example #23
def create_budget_data_package(url, user, private):
    try:
        bdpkg = BudgetDataPackage(url)
    except Exception as problem:
        # Lots of different types of problems can arise with a
        # BudgetDataPackage, but their messages should be understandable,
        # so we catch any Exception and report its message to the user
        log.error("Failed to parse budget data package: {0}".format(
            problem.message))
        return []

    sources = []
    for (idx, resource) in enumerate(bdpkg.resources):
        dataset = Dataset.by_name(bdpkg.name)
        if dataset is None:
            # Get information from the descriptor file for the given
            # resource (at index idx)
            info = get_dataset_info_from_descriptor(bdpkg, idx)
            # Set the dataset name based on the previously computed one
            info['dataset']['name'] = bdpkg.name
            # Create the model from the resource schema
            model = create_model_from_schema(resource.schema)
            # Set the default value for the time to the fiscal year of the
            # resource, because it isn't included in the budget CSV so we
            # won't be able to load it along with the data.
            model['time']['default_value'] = resource.fiscalYear
            # Add the model as the mapping
            info['mapping'] = model

            # Create the dataset
            dataset = Dataset(info)
            dataset.managers.append(user)
            dataset.private = private
            db.session.add(dataset)
            db.session.commit()
        else:
            if not dataset.can_update(user):
                log.error(
                    "User {0} not permitted to update dataset {1}".format(
                        user.name, bdpkg.name))
                return []

        if 'url' in resource:
            resource_url = resource.url
        elif 'path' in resource:
            if 'base' in bdpkg:
                resource_url = urlparse.urljoin(bdpkg.base, resource.path)
            else:
                resource_url = urlparse.urljoin(url, resource.path)
        else:
            log.error('Url not found')
            return []

        # We do not re-add old sources so if we find the same source
        # we don't do anything, else we create the source and append it
        # to the source list
        for dataset_source in dataset.sources:
            if dataset_source.url == resource_url:
                break
        else:
            source = Source(dataset=dataset, creator=user, url=resource_url)
            db.session.add(source)
            db.session.commit()
            sources.append(source)

    return sources
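The resource URL resolution above follows standard urljoin semantics: a relative path is resolved against the package's base if one is given, otherwise against the URL the descriptor was fetched from. A self-contained sketch with hypothetical URLs (Python 2 urlparse; the same function lives in urllib.parse on Python 3):

import urlparse

package_url = "http://example.org/budgets/datapackage.json"  # where the descriptor was fetched from
base = "http://example.org/data/"                             # a hypothetical 'base' field in the package
path = "expenditure-2014.csv"                                 # a resource 'path' entry

print urlparse.urljoin(base, path)         # http://example.org/data/expenditure-2014.csv
print urlparse.urljoin(package_url, path)  # http://example.org/budgets/expenditure-2014.csv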
Example #24
    def create(self):
        """
        Adds a new dataset dynamically through a POST request
        """

        # User must be authenticated so we should have a user object in
        # c.account, if not abort with error message
        if not c.account:
            abort(status_code=400, detail='user not authenticated')

        # Parse the loading api parameters to get them into the right format
        parser = LoadingAPIParamParser(request.params)
        params, errors = parser.parse()

        if errors:
            response.status = 400
            return to_jsonp({'errors': errors})

        if params['metadata'] is None:
            response.status = 400
            return to_jsonp({'errors': 'metadata is missing'})

        if params['csv_file'] is None:
            response.status = 400
            return to_jsonp({'errors': 'csv_file is missing'})

        # We proceed with the dataset
        try:
            model = json.load(urllib2.urlopen(params['metadata']))
        except:
            response.status = 400
            return to_jsonp({'errors': 'JSON model could not be parsed'})
        try:
            log.info("Validating model")
            model = validate_model(model)
        except Invalid as i:
            log.error("Errors occured during model validation:")
            for field, error in i.asdict().items():
                log.error("%s: %s", field, error)
            response.status = 400
            return to_jsonp({'errors': 'Model is not well formed'})
        dataset = Dataset.by_name(model['dataset']['name'])
        if dataset is None:
            dataset = Dataset(model)
            require.dataset.create()
            dataset.managers.append(c.account)
            dataset.private = params['private']
            db.session.add(dataset)
        else:
            require.dataset.update(dataset)

        log.info("Dataset: %s", dataset.name)
        source = Source(dataset=dataset,
                        creator=c.account,
                        url=params['csv_file'])

        log.info(source)
        for source_ in dataset.sources:
            if source_.url == params['csv_file']:
                source = source_
                break
        db.session.add(source)
        db.session.commit()

        # Send loading of source into celery queue
        load_source.delay(source.id)
        return to_jsonp(dataset_apply_links(dataset.as_dict()))
Example #25
 def _get_source(self, dataset, id):
     self._get_dataset(dataset)
     c.source = Source.by_id(id)
     if c.source is None or c.source.dataset != c.dataset:
         abort(404, _("There is no source '%s'") % id)
Example #26
 def _get_source(self, dataset, id):
     self._get_dataset(dataset)
     c.source = Source.by_id(id)
     if c.source is None or c.source.dataset != c.dataset:
         abort(404, _("There is no source '%s'") % id)