def _detail_aggregate(args):
    """Returns a record for every row in the specified dataset with brief
    temporal and spatial information about the row. This can give a user of
    the platform a quick overview about what is available within their
    constraints.

    :param args: dictionary of request arguments
    :returns: csv or json response object"""

    meta_params = ('obs_date__ge', 'obs_date__le', 'agg', 'geom', 'dataset')
    meta_vals = (args.data.get(k) for k in meta_params)
    start_date, end_date, agg, geom, dataset = meta_vals

    time_counts = []

    if not has_tree_filters(args.data):
        # The obs_date arguments set the bounds of all the aggregates.
        # We don't want to create a condition tree that has point_date filters.
        args.data[dataset.name + '__filter'] = request_args_to_condition_tree(
            args.data, ignore=['obs_date__ge', 'obs_date__le']
        )

    dataset_conditions = {k: v for k, v in args.data.items() if 'filter' in k}
    for tablename, condition_tree in dataset_conditions.items():

        # This pattern matches the last occurrence of the '__' pattern.
        # Prevents an error that is caused by dataset names with trailing
        # underscores.
        tablename = re.split(r'__(?!_)', tablename)[0]
        table = MetaTable.get_by_dataset_name(tablename).point_table

        try:
            conditions = parse_tree(table, condition_tree)
        except ValueError:  # Catches empty condition tree.
            conditions = None

        try:
            ts = MetaTable.get_by_dataset_name(table.name).timeseries_one(
                agg, start_date, end_date, geom, conditions
            )
        except Exception as e:
            return internal_error('Failed to construct timeseries', e)

        time_counts += [{'count': c, 'datetime': d} for c, d in ts[1:]]

    resp = None

    datatype = args.data['data_type']
    if datatype == 'json':
        resp = json_response_base(args, time_counts, request.args)
        resp['count'] = sum([c['count'] for c in time_counts])
        resp = make_response(json.dumps(resp, default=unknown_object_json_handler), 200)
        resp.headers['Content-Type'] = 'application/json'

    elif datatype == 'csv':
        resp = form_csv_detail_response(['point_date', 'hash'], time_counts)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

    return resp
def setUp(self):
    session.rollback()
    # Ensure we have metadata loaded into the database
    # to mimic the behavior of metadata ingestion preceding file ingestion.
    drop_meta('dog_park_permits')
    drop_meta('community_radio_events')
    drop_meta('public_opera_performances')

    # Make new MetaTable objects
    self.unloaded_meta = MetaTable(url='nightvale.gov/events.csv',
                                   human_name='Community Radio Events',
                                   business_key='Event Name',
                                   observed_date='Date',
                                   latitude='lat', longitude='lon',
                                   approved_status=True)

    self.existing_meta = MetaTable(url='nightvale.gov/dogpark.csv',
                                   human_name='Dog Park Permits',
                                   business_key='Hooded Figure ID',
                                   observed_date='Date',
                                   latitude='lat', longitude='lon',
                                   approved_status=False)

    self.opera_meta = MetaTable(url='nightvale.gov/opera.csv',
                                human_name='Public Opera Performances',
                                business_key='Event Name',
                                observed_date='Date',
                                location='Location',
                                approved_status=False)

    session.add_all([self.existing_meta, self.opera_meta, self.unloaded_meta])
    session.commit()

    # Also, let's have one table pre-loaded...
    self.existing_table = sa.Table(
        'dog_park_permits', MetaData(),
        Column('hooded_figure_id', Integer),
        Column('point_date', TIMESTAMP, nullable=False),
        Column('date', Date, nullable=True),
        Column('lat', Float, nullable=False),
        Column('lon', Float, nullable=False),
        Column('hash', String(32), primary_key=True),
        Column('geom', Geometry('POINT', srid=4326), nullable=True))
    drop_if_exists(self.existing_table.name)
    self.existing_table.create(bind=app_engine)

    # ... with some pre-existing data
    ins = self.existing_table.insert().values(
        hooded_figure_id=1,
        point_date=date(2015, 1, 2),
        lon=-87.6495076896,
        lat=41.7915865543,
        geom=None,
        hash='addde9be7f59e95fc08e54e29b2a947f')
    app_engine.execute(ins)
def _grid(args):
    meta_params = ('dataset', 'geom', 'resolution', 'buffer',
                   'obs_date__ge', 'obs_date__le')
    meta_vals = (args.data.get(k) for k in meta_params)
    point_table, geom, resolution, buffer_, obs_date__ge, obs_date__le = meta_vals

    result_rows = []

    if not has_tree_filters(args.data):
        tname = point_table.name
        args.data[tname + '__filter'] = request_args_to_condition_tree(
            request_args=args.data,
            ignore=['buffer', 'resolution']
        )

    # We only build conditions from values with a key containing 'filter'.
    # Therefore we only build dataset conditions from condition trees.
    dataset_conditions = {k: v for k, v in args.data.items() if 'filter' in k}
    for tablename, condition_tree in dataset_conditions.items():

        tablename = tablename.split('__')[0]
        metatable = MetaTable.get_by_dataset_name(tablename)
        table = metatable.point_table
        conditions = parse_tree(table, condition_tree)

        try:
            registry_row = MetaTable.get_by_dataset_name(table.name)
            # make_grid expects conditions to be iterable.
            grid_rows, size_x, size_y = registry_row.make_grid(
                resolution,
                geom,
                [conditions],
                {'upper': obs_date__le, 'lower': obs_date__ge}
            )
            result_rows += grid_rows
        except Exception as e:
            return internal_error('Could not make grid aggregation.', e)

    resp = geojson_response_base()
    for value in result_rows:
        if value[1]:
            pt = shapely.wkb.loads(value[1].decode('hex'))
            south, west = (pt.x - (size_x / 2)), (pt.y - (size_y / 2))
            north, east = (pt.x + (size_x / 2)), (pt.y + (size_y / 2))
            new_geom = shapely.geometry.box(south, west, north, east).__geo_interface__
        else:
            new_geom = None
        new_property = {'count': value[0], }
        add_geojson_feature(resp, new_geom, new_property)

    resp = make_response(json.dumps(resp, default=date_json_handler), 200)
    resp.headers['Content-Type'] = 'application/json'

    return resp
def _grid(args):
    meta_params = ('dataset', 'geom', 'resolution', 'buffer',
                   'obs_date__ge', 'obs_date__le')
    meta_vals = (args.data.get(k) for k in meta_params)
    point_table, geom, resolution, buffer_, obs_date__ge, obs_date__le = meta_vals

    result_rows = []

    if not has_tree_filters(args.data):
        tname = point_table.name
        args.data[tname + '__filter'] = request_args_to_condition_tree(
            request_args=args.data, ignore=['buffer', 'resolution'])

    # We only build conditions from values with a key containing 'filter'.
    # Therefore we only build dataset conditions from condition trees.
    dataset_conditions = {k: v for k, v in args.data.items() if 'filter' in k}
    for tablename, condition_tree in dataset_conditions.items():

        tablename = tablename.rsplit('__')[0]
        metatable = MetaTable.get_by_dataset_name(tablename)
        table = metatable.point_table
        conditions = parse_tree(table, condition_tree)

        try:
            registry_row = MetaTable.get_by_dataset_name(table.name)
            # make_grid expects conditions to be iterable.
            grid_rows, size_x, size_y = registry_row.make_grid(
                resolution, geom, [conditions], {
                    'upper': obs_date__le,
                    'lower': obs_date__ge
                })
            result_rows += grid_rows
        except Exception as e:
            msg = 'Could not make grid aggregation.'
            return api_response.make_raw_error('{}: {}'.format(msg, e))

    resp = api_response.geojson_response_base()
    for value in result_rows:
        if value[1]:
            pt = shapely.wkb.loads(codecs.decode(value[1], 'hex'))
            south, west = (pt.x - (size_x / 2)), (pt.y - (size_y / 2))
            north, east = (pt.x + (size_x / 2)), (pt.y + (size_y / 2))
            new_geom = shapely.geometry.box(south, west, north, east).__geo_interface__
        else:
            new_geom = None
        new_property = {'count': value[0], }
        api_response.add_geojson_feature(resp, new_geom, new_property)

    return resp
def ingest_from_fixture(fixture_meta, fname):
    md = MetaTable(**fixture_meta)
    session.add(md)
    session.commit()
    path = os.path.join(fixtures_path, fname)
    point_etl = PlenarioETL(md, source_path=path)
    point_etl.add()
def validate_many_datasets(list_of_datasets):
    """Custom validator for dataset_name__in parameter."""
    valid_tables = MetaTable.index()
    for dataset_name in list_of_datasets:
        if dataset_name not in valid_tables:
            raise ValidationError("Invalid table name: {}.".format(dataset_name))
def point_meta_from_submit_form(form, is_approved):
    columns, labels = form_columns(form)
    name = slugify(form['dataset_name'], delimiter='_')[:50]

    metatable = MetaTable(
        url=form['file_url'],
        view_url=form.get('view_url'),
        dataset_name=name,
        human_name=form['dataset_name'],
        attribution=form.get('dataset_attribution'),
        description=form.get('dataset_description'),
        update_freq=form['update_frequency'],
        contributor_name=form['contributor_name'],
        contributor_organization=form.get('contributor_organization'),
        contributor_email=form['contributor_email'],
        approved_status=is_approved,
        observed_date=labels['observed_date'],
        latitude=labels.get('latitude', None),
        longitude=labels.get('longitude', None),
        location=labels.get('location', None),
        column_names=columns)

    postgres_session.add(metatable)
    postgres_session.commit()

    return metatable
def detail_aggregate():
    raw_query_params = request.args.copy()
    # First, make sure name of dataset was provided...
    try:
        dataset_name = raw_query_params.pop('dataset_name')
    except KeyError:
        return bad_request("'dataset_name' is required")

    # and that we have that dataset.
    try:
        validator = ParamValidator(dataset_name)
    except NoSuchTableError:
        return bad_request("Cannot find dataset named {}".format(dataset_name))

    validator\
        .set_optional('obs_date__ge', date_validator, datetime.now() - timedelta(days=90))\
        .set_optional('obs_date__le', date_validator, datetime.now())\
        .set_optional('location_geom__within', geom_validator, None)\
        .set_optional('data_type', make_format_validator(['json', 'csv']), 'json')\
        .set_optional('agg', agg_validator, 'week')

    # If any optional parameters are malformed, we're better off bailing and
    # telling the user than using a default and confusing them.
    err = validator.validate(raw_query_params)
    if err:
        return bad_request(err)

    start_date = validator.vals['obs_date__ge']
    end_date = validator.vals['obs_date__le']
    agg = validator.vals['agg']
    geom = validator.get_geom()
    dataset = MetaTable.get_by_dataset_name(dataset_name)

    try:
        ts = dataset.timeseries_one(agg_unit=agg, start=start_date,
                                    end=end_date, geom=geom,
                                    column_filters=validator.conditions)
    except Exception as e:
        return internal_error('Failed to construct timeseries', e)

    datatype = validator.vals['data_type']
    if datatype == 'json':
        time_counts = [{'count': c, 'datetime': d} for c, d in ts[1:]]
        resp = json_response_base(validator, time_counts)
        resp['count'] = sum([c['count'] for c in time_counts])
        resp = make_response(json.dumps(resp, default=dthandler), 200)
        resp.headers['Content-Type'] = 'application/json'

    elif datatype == 'csv':
        resp = make_csv(ts)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

    return resp
def _detail_aggregate(args):
    """Returns a record for every row in the specified dataset with brief
    temporal and spatial information about the row. This can give a user of
    the platform a quick overview about what is available within their
    constraints.

    :param args: dictionary of request arguments
    :returns: csv or json response object
    """
    meta_params = ('obs_date__ge', 'obs_date__le', 'agg', 'geom', 'dataset')
    meta_vals = (args.data.get(k) for k in meta_params)
    start_date, end_date, agg, geom, dataset = meta_vals

    time_counts = []

    if not has_tree_filters(args.data):
        # The obs_date arguments set the bounds of all the aggregates.
        # We don't want to create a condition tree that has point_date filters.
        args.data[dataset.name + '__filter'] = request_args_to_condition_tree(
            args.data, ignore=['obs_date__ge', 'obs_date__le'])

    dataset_conditions = {
        k: v
        for k, v in list(args.data.items()) if 'filter' in k
    }
    for tablename, condition_tree in list(dataset_conditions.items()):

        # This pattern matches the last occurrence of the '__' pattern.
        # Prevents an error that is caused by dataset names with trailing
        # underscores.
        tablename = re.split(r'__(?!_)', tablename)[0]
        table = MetaTable.get_by_dataset_name(tablename).point_table

        try:
            conditions = parse_tree(table, condition_tree)
        except ValueError:  # Catches empty condition tree.
            conditions = None

        try:
            ts = MetaTable.get_by_dataset_name(table.name).timeseries_one(
                agg, start_date, end_date, geom, conditions)
        except Exception as e:
            msg = 'Failed to construct timeseries'
            return api_response.make_raw_error('{}: {}'.format(msg, e))

        time_counts += [{'count': c, 'datetime': d} for c, d in ts[1:]]

    return time_counts
def grid():
    raw_query_params = request.args.copy()

    # First, make sure name of dataset was provided...
    try:
        dataset_name = raw_query_params.pop('dataset_name')
    except KeyError:
        return bad_request("'dataset_name' is required")

    try:
        validator = ParamValidator(dataset_name)
    except NoSuchTableError:
        return bad_request("Could not find dataset named {}.".format(dataset_name))

    validator.set_optional('buffer', int_validator, 100)\
             .set_optional('resolution', int_validator, 500)\
             .set_optional('location_geom__within', geom_validator, None)\
             .set_optional('obs_date__ge', date_validator, datetime.now() - timedelta(days=90))\
             .set_optional('obs_date__le', date_validator, datetime.now())

    err = validator.validate(raw_query_params)
    if err:
        return bad_request(err)

    # Part 2: Construct SQL query
    try:
        dset = validator.dataset
        maker = FilterMaker(validator.vals, dataset=dset)
        # Get time filters
        time_filters = maker.time_filters()
        # From user params, either get None or requested geometry
        geom = validator.get_geom()
    except Exception as e:
        return internal_error('Could not make time and geometry filters.', e)

    resolution = validator.vals['resolution']
    try:
        registry_row = MetaTable.get_by_dataset_name(dataset_name)
        grid_rows, size_x, size_y = registry_row.make_grid(
            resolution, geom, validator.conditions + time_filters)
    except Exception as e:
        return internal_error('Could not make grid aggregation.', e)

    resp = geojson_response_base()
    for value in grid_rows:
        if value[1]:
            pt = shapely.wkb.loads(value[1].decode('hex'))
            south, west = (pt.x - (size_x / 2)), (pt.y - (size_y / 2))
            north, east = (pt.x + (size_x / 2)), (pt.y + (size_y / 2))
            new_geom = shapely.geometry.box(south, west, north, east).__geo_interface__
        else:
            new_geom = None
        new_property = {'count': value[0], }
        add_geojson_feature(resp, new_geom, new_property)

    resp = make_response(json.dumps(resp, default=dthandler), 200)
    resp.headers['Content-Type'] = 'application/json'

    return resp
def _detail_aggregate(args):
    """Returns a record for every row in the specified dataset with brief
    temporal and spatial information about the row. This can give a user of
    the platform a quick overview about what is available within their
    constraints.

    :param args: dictionary of request arguments
    :returns: csv or json response object
    """
    meta_params = ('obs_date__ge', 'obs_date__le', 'agg', 'geom', 'dataset')
    meta_vals = (args.data.get(k) for k in meta_params)
    start_date, end_date, agg, geom, dataset = meta_vals

    time_counts = []

    if not has_tree_filters(args.data):
        # The obs_date arguments set the bounds of all the aggregates.
        # We don't want to create a condition tree that has point_date filters.
        args.data[dataset.name + '__filter'] = request_args_to_condition_tree(
            args.data, ignore=['obs_date__ge', 'obs_date__le']
        )

    dataset_conditions = {k: v for k, v in list(args.data.items()) if 'filter' in k}
    for tablename, condition_tree in list(dataset_conditions.items()):

        # This pattern matches the last occurrence of the '__' pattern.
        # Prevents an error that is caused by dataset names with trailing
        # underscores.
        tablename = re.split(r'__(?!_)', tablename)[0]
        table = MetaTable.get_by_dataset_name(tablename).point_table

        try:
            conditions = parse_tree(table, condition_tree)
        except ValueError:  # Catches empty condition tree.
            conditions = None

        try:
            ts = MetaTable.get_by_dataset_name(table.name).timeseries_one(
                agg, start_date, end_date, geom, conditions
            )
        except Exception as e:
            msg = 'Failed to construct timeseries'
            return api_response.make_raw_error('{}: {}'.format(msg, e))

        time_counts += [{'count': c, 'datetime': d} for c, d in ts[1:]]

    return time_counts
def detail_aggregate():
    raw_query_params = request.args.copy()
    # First, make sure name of dataset was provided...
    try:
        dataset_name = raw_query_params.pop('dataset_name')
    except KeyError:
        return bad_request("'dataset_name' is required")

    # and that we have that dataset.
    try:
        validator = ParamValidator(dataset_name)
    except NoSuchTableError:
        return bad_request("Cannot find dataset named {}".format(dataset_name))

    validator\
        .set_optional('obs_date__ge', date_validator, datetime.now() - timedelta(days=90))\
        .set_optional('obs_date__le', date_validator, datetime.now())\
        .set_optional('location_geom__within', geom_validator, None)\
        .set_optional('data_type', make_format_validator(['json', 'csv']), 'json')\
        .set_optional('agg', agg_validator, 'week')

    # If any optional parameters are malformed, we're better off bailing and
    # telling the user than using a default and confusing them.
    err = validator.validate(raw_query_params)
    if err:
        return bad_request(err)

    start_date = validator.vals['obs_date__ge']
    end_date = validator.vals['obs_date__le']
    agg = validator.vals['agg']
    geom = validator.get_geom()
    dataset = MetaTable.get_by_dataset_name(dataset_name)

    try:
        ts = dataset.timeseries_one(agg_unit=agg, start=start_date,
                                    end=end_date, geom=geom,
                                    column_filters=validator.conditions)
    except Exception as e:
        return internal_error('Failed to construct timeseries', e)

    datatype = validator.vals['data_type']
    if datatype == 'json':
        time_counts = [{'count': c, 'datetime': d} for c, d in ts[1:]]
        resp = json_response_base(validator, time_counts)
        resp['count'] = sum([c['count'] for c in time_counts])
        resp = make_response(json.dumps(resp, default=dthandler), 200)
        resp.headers['Content-Type'] = 'application/json'

    elif datatype == 'csv':
        resp = make_csv(ts)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

    return resp
def add_dataset_to_metatable(request, url, dataset_id, dataset_info,
                             socrata_source, approved_status):
    data_types = []
    business_key = None
    observed_date = None
    latitude = None
    longitude = None
    location = None
    for k, v in request.form.iteritems():
        if k.startswith('data_type_'):
            key = k.replace("data_type_", "")
            data_types.append({"field_name": key, "data_type": v})
        if k.startswith('key_type_'):
            key = k.replace("key_type_", "")
            if (v == "business_key"):
                business_key = key
            if (v == "observed_date"):
                observed_date = key
            if (v == "latitude"):
                latitude = key
            if (v == "longitude"):
                longitude = key
            if (v == "location"):
                location = key

    if socrata_source:
        data_types = dataset_info['columns']
        url = dataset_info['source_url']

    d = {
        'dataset_name': slugify(request.form.get('dataset_name'), delim=u'_')[:50],
        'human_name': request.form.get('dataset_name'),
        'attribution': request.form.get('dataset_attribution'),
        'description': request.form.get('dataset_description'),
        'source_url': url,
        'source_url_hash': dataset_id,
        'update_freq': request.form.get('update_frequency'),
        'business_key': business_key,
        'observed_date': observed_date,
        'latitude': latitude,
        'longitude': longitude,
        'location': location,
        'contributor_name': request.form.get('contributor_name'),
        'contributor_organization': request.form.get('contributor_organization'),
        'contributor_email': request.form.get('contributor_email'),
        'contributed_data_types': json.dumps(data_types),
        'approved_status': approved_status,
        'is_socrata_source': socrata_source
    }

    # add this to meta_master
    md = MetaTable(**d)
    session.add(md)
    session.commit()

    return md
def view_datasets():
    datasets_pending = fetch_pending_tables(MetaTable)
    shapes_pending = fetch_pending_tables(ShapeMetadata)
    datasets = MetaTable.get_all_with_etl_status()
    shapesets = ShapeMetadata.get_all_with_etl_status()

    return render_template('admin/view-datasets.html',
                           datasets_pending=datasets_pending,
                           shapes_pending=shapes_pending,
                           datasets=datasets,
                           shape_datasets=shapesets)
def test_location_col_add(self):
    drop_if_exists(self.opera_meta.dataset_name)

    etl = PlenarioETL(self.opera_meta, source_path=self.opera_path)
    new_table = etl.add()

    all_rows = session.execute(new_table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)
    session.close()
    new_table.drop(app_engine, checkfirst=True)

    # Did we add a bbox?
    bbox = MetaTable.get_by_dataset_name('public_opera_performances').bbox
    self.assertIsNotNone(bbox)
def test_location_col_add(self):
    drop_if_exists(self.opera_meta.dataset_name)

    etl = PlenarioETL(self.opera_meta, source_path=self.opera_path)
    new_table = etl.add()

    all_rows = postgres_session.execute(new_table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)
    postgres_session.close()
    new_table.drop(postgres_engine, checkfirst=True)

    # Did we add a bbox?
    bbox = MetaTable.get_by_dataset_name('public_opera_performances').bbox
    self.assertIsNotNone(bbox)
def test_new_table(self):
    drop_if_exists(self.unloaded_meta.dataset_name)

    etl = PlenarioETL(self.unloaded_meta, source_path=self.radio_path)
    new_table = etl.add()

    all_rows = postgres_session.execute(new_table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)
    postgres_session.close()
    new_table.drop(postgres_engine, checkfirst=True)

    # Did we add a bbox?
    bbox = MetaTable.get_by_dataset_name('community_radio_events').bbox
    self.assertIsNotNone(bbox)
def test_new_table(self):
    drop_if_exists(self.unloaded_meta.dataset_name)

    etl = PlenarioETL(self.unloaded_meta, source_path=self.radio_path)
    new_table = etl.add()

    all_rows = session.execute(new_table.select()).fetchall()
    self.assertEqual(len(all_rows), 5)
    session.close()
    new_table.drop(app_engine, checkfirst=True)

    # Did we add a bbox?
    bbox = MetaTable.get_by_dataset_name('community_radio_events').bbox
    self.assertIsNotNone(bbox)
def test_updates_index_and_validates_correctly(self):
    # Adds a MetaTable record.
    self.test_client.post('/add?is_shapefile=false', data=roadworks_post_data)
    meta = MetaTable.get_by_dataset_name('roadworks')

    # Creates the table.
    PlenarioETL(meta).add()

    # Perform a query on the newly added dataset (to check if the
    # validator allows the query through).
    query = '/v1/api/detail?dataset_name=roadworks&obs_date__ge=2000'
    response = self.test_client.get(query)
    data = json.loads(response.data)

    self.assertGreaterEqual(len(data['objects']), 100)
def grid():
    raw_query_params = request.args.copy()

    # First, make sure name of dataset was provided...
    try:
        dataset_name = raw_query_params.pop('dataset_name')
    except KeyError:
        return bad_request("'dataset_name' is required")

    try:
        validator = ParamValidator(dataset_name)
    except NoSuchTableError:
        return bad_request(
            "Could not find dataset named {}.".format(dataset_name))

    validator.set_optional('buffer', int_validator, 100)\
             .set_optional('resolution', int_validator, 500)\
             .set_optional('location_geom__within', geom_validator, None)\
             .set_optional('obs_date__ge', date_validator, datetime.now() - timedelta(days=90))\
             .set_optional('obs_date__le', date_validator, datetime.now())

    err = validator.validate(raw_query_params)
    if err:
        return bad_request(err)

    # Part 2: Construct SQL query
    try:
        dset = validator.dataset
        maker = FilterMaker(validator.vals, dataset=dset)
        # Get time filters
        time_filters = maker.time_filters()
        # From user params, either get None or requested geometry
        geom = validator.get_geom()
    except Exception as e:
        return internal_error('Could not make time and geometry filters.', e)

    resolution = validator.vals['resolution']
    try:
        registry_row = MetaTable.get_by_dataset_name(dataset_name)
        grid_rows, size_x, size_y = registry_row.make_grid(
            resolution, geom, validator.conditions + time_filters)
    except Exception as e:
        return internal_error('Could not make grid aggregation.', e)

    resp = geojson_response_base()
    for value in grid_rows:
        if value[1]:
            pt = shapely.wkb.loads(value[1].decode('hex'))
            south, west = (pt.x - (size_x / 2)), (pt.y - (size_y / 2))
            north, east = (pt.x + (size_x / 2)), (pt.y + (size_y / 2))
            new_geom = shapely.geometry.box(south, west, north, east).__geo_interface__
        else:
            new_geom = None
        new_property = {
            'count': value[0],
        }
        add_geojson_feature(resp, new_geom, new_property)

    resp = make_response(json.dumps(resp, default=dthandler), 200)
    resp.headers['Content-Type'] = 'application/json'

    return resp
def timeseries():
    validator = ParamValidator()\
        .set_optional('agg', agg_validator, 'week')\
        .set_optional('data_type', make_format_validator(['json', 'csv']), 'json')\
        .set_optional('dataset_name__in', list_of_datasets_validator, MetaTable.index)\
        .set_optional('obs_date__ge', date_validator, datetime.now() - timedelta(days=90))\
        .set_optional('obs_date__le', date_validator, datetime.now())\
        .set_optional('location_geom__within', geom_validator, None)\
        .set_optional('buffer', int_validator, 100)

    err = validator.validate(request.args)
    if err:
        return bad_request(err)

    geom = validator.get_geom()
    table_names = validator.vals['dataset_name__in']
    start_date = validator.vals['obs_date__ge']
    end_date = validator.vals['obs_date__le']
    agg = validator.vals['agg']

    # Only examine tables that have a chance of containing records
    # within the date and space boundaries.
    try:
        table_names = MetaTable.narrow_candidates(table_names, start_date,
                                                  end_date, geom)
    except Exception as e:
        msg = 'Failed to gather candidate tables.'
        return internal_error(msg, e)

    try:
        panel = MetaTable.timeseries_all(table_names=table_names,
                                         agg_unit=agg, start=start_date,
                                         end=end_date, geom=geom)
    except Exception as e:
        msg = 'Failed to construct timeseries.'
        return internal_error(msg, e)

    panel = MetaTable.attach_metadata(panel)
    resp = json_response_base(validator, panel)

    datatype = validator.vals['data_type']
    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=dthandler), 200)
        resp.headers['Content-Type'] = 'application/json'

    elif datatype == 'csv':
        # response format
        # temporal_group,dataset_name_1,dataset_name_2
        # 2014-02-24 00:00:00,235,653
        # 2014-03-03 00:00:00,156,624
        fields = ['temporal_group']
        for o in resp['objects']:
            fields.append(o['dataset_name'])

        csv_resp = []
        i = 0
        for k, g in groupby(resp['objects'], key=itemgetter('dataset_name')):
            l_g = list(g)[0]

            j = 0
            for row in l_g['items']:
                # first iteration, populate the first column with temporal_groups
                if i == 0:
                    csv_resp.append([row['datetime']])
                csv_resp[j].append(row['count'])
                j += 1
            i += 1

        csv_resp.insert(0, fields)
        csv_resp = make_csv(csv_resp)
        resp = make_response(csv_resp, 200)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

    return resp
def submit_dataset():
    # Slightly dumb way to make sure that POSTs are only coming from
    # originating domain for the time being
    referer = request.headers.get('Referer')
    if referer:
        referer = urlparse(referer).netloc
        req_url = urlparse(request.url).netloc
        if referer != req_url:
            abort(401)
    else:
        abort(401)

    resp = {'status': 'ok', 'message': ''}
    status_code = 200
    errors = []

    post = request.form.get('data')
    if not post:
        try:
            post = request.form.keys()[0]
        except IndexError:
            resp['status'] = 'error'
            resp['message'] = 'Unable to decode POST data'
            status_code = 400

    if status_code == 200:
        post = json.loads(post)
        if post.get('view_url'):
            if post.get('socrata'):
                source_domain = urlparse(post['view_url']).netloc
                four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})',
                                          post['view_url'])[-1]
                view_url = 'http://%s/api/views/%s' % (source_domain, four_by_four)
                dataset_info, errors, status_code = get_socrata_data_info(view_url)
                source_url = '%s/rows.csv?accessType=DOWNLOAD' % view_url
            else:
                dataset_info = {
                    'attribution': '',
                    'description': '',
                }
                source_url = post['view_url']
                dataset_info['name'] = urlparse(source_url).path.split('/')[-1]

            if errors:
                resp['message'] = ', '.join([e for e in errors])
                resp['status'] = 'error'
                status_code = 400
            else:
                dataset_id = md5(source_url).hexdigest()
                md = session.query(MetaTable).get(dataset_id)
                if not md:
                    d = {
                        'dataset_name': slugify(dataset_info['name'], delim=u'_'),
                        'human_name': dataset_info['name'],
                        'attribution': dataset_info['attribution'],
                        'description': dataset_info['description'],
                        'source_url': source_url,
                        'source_url_hash': dataset_id,
                        'update_freq': post['update_frequency'],
                        'business_key': post['field_definitions']['id_field'],
                        'observed_date': post['field_definitions']['date_field'],
                        'latitude': post['field_definitions'].get('latitude'),
                        'longitude': post['field_definitions'].get('longitude'),
                        'location': post['field_definitions'].get('location')
                    }
                    if len(d['dataset_name']) > 49:
                        d['dataset_name'] = d['dataset_name'][:50]
                    md = MetaTable(**d)
                    session.add(md)
                    session.commit()

                add_dataset.delay(md.source_url_hash,
                                  data_types=post.get('data_types'))
                resp['message'] = ('Dataset %s submitted successfully'
                                   % dataset_info['name'])
        else:
            resp['status'] = 'error'
            resp['message'] = 'Must provide a url where data can be downloaded'
            status_code = 400

    resp = make_response(json.dumps(resp, default=dthandler), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
def _deserialize(self, value, attr, data):
    try:
        return MetaTable.get_by_dataset_name(value).point_table
    except AttributeError:
        raise ValidationError('{} is not a valid dataset'.format(value))
# ValidatorResult
# ===============
# Many methods in response.py rely on information that used to be provided
# by the old ParamValidator attributes. This namedtuple carries that same
# info around, and allows me to not have to rewrite any response code.

ValidatorResult = namedtuple('ValidatorResult', 'data errors warnings')


# converters
# ==========
# Callables which are used to convert request arguments to their correct types.

converters = {
    'buffer': int,
    'dataset': lambda x: MetaTable.get_by_dataset_name(x),
    'shape': lambda x: ShapeMetadata.get_by_dataset_name(x).shape_table,
    'dataset_name__in': lambda x: x.split(','),
    'date__time_of_day_ge': int,
    'date__time_of_day_le': int,
    'obs_date__ge': lambda x: parser.parse(x).date(),
    'obs_date__le': lambda x: parser.parse(x).date(),
    'offset': int,
    'resolution': int,
    'geom': lambda x: make_fragment_str(extract_first_geometry_fragment(x)),
}


def convert(request_args):
    """Convert a dictionary of arguments from strings to their types. How the
    values are converted is specified by the converters dictionary defined
    above.
    """
def timeseries():
    validator = ParamValidator()\
        .set_optional('agg', agg_validator, 'week')\
        .set_optional('data_type', make_format_validator(['json', 'csv']), 'json')\
        .set_optional('dataset_name__in', list_of_datasets_validator, MetaTable.index)\
        .set_optional('obs_date__ge', date_validator, datetime.now() - timedelta(days=90))\
        .set_optional('obs_date__le', date_validator, datetime.now())\
        .set_optional('location_geom__within', geom_validator, None)\
        .set_optional('buffer', int_validator, 100)

    err = validator.validate(request.args)
    if err:
        return bad_request(err)

    geom = validator.get_geom()
    table_names = validator.vals['dataset_name__in']
    start_date = validator.vals['obs_date__ge']
    end_date = validator.vals['obs_date__le']
    agg = validator.vals['agg']

    # Only examine tables that have a chance of containing records
    # within the date and space boundaries.
    try:
        table_names = MetaTable.narrow_candidates(table_names, start_date,
                                                  end_date, geom)
    except Exception as e:
        msg = 'Failed to gather candidate tables.'
        return internal_error(msg, e)

    try:
        panel = MetaTable.timeseries_all(table_names=table_names,
                                         agg_unit=agg, start=start_date,
                                         end=end_date, geom=geom)
    except Exception as e:
        msg = 'Failed to construct timeseries.'
        return internal_error(msg, e)

    panel = MetaTable.attach_metadata(panel)
    resp = json_response_base(validator, panel)

    datatype = validator.vals['data_type']
    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=dthandler), 200)
        resp.headers['Content-Type'] = 'application/json'

    elif datatype == 'csv':
        # response format
        # temporal_group,dataset_name_1,dataset_name_2
        # 2014-02-24 00:00:00,235,653
        # 2014-03-03 00:00:00,156,624
        fields = ['temporal_group']
        for o in resp['objects']:
            fields.append(o['dataset_name'])

        csv_resp = []
        i = 0
        for k, g in groupby(resp['objects'], key=itemgetter('dataset_name')):
            l_g = list(g)[0]

            j = 0
            for row in l_g['items']:
                # first iteration, populate the first column with temporal_groups
                if i == 0:
                    csv_resp.append([row['datetime']])
                csv_resp[j].append(row['count'])
                j += 1
            i += 1

        csv_resp.insert(0, fields)
        csv_resp = make_csv(csv_resp)
        resp = make_response(csv_resp, 200)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

    return resp
def validate_dataset(dataset_name):
    if not MetaTable.get_by_dataset_name(dataset_name):
        raise ValidationError('Invalid table name: {}.'.format(dataset_name))
def _timeseries(args):
    geom = args.data['geom']
    dataset = args.data.get('dataset')
    table_names = args.data['dataset_name__in']
    start_date = args.data['obs_date__ge']
    end_date = args.data['obs_date__le']
    agg = args.data['agg']

    # if a single dataset was provided, it's the only thing we need to consider
    if dataset is not None:
        table_names = [dataset.name]
        # for the query's meta information, so that it doesn't show the index
        del args.data['dataset_name__in']

    # remove table names which wouldn't return anything for the query, given
    # the time and geom constraints
    try:
        table_names = MetaTable.narrow_candidates(table_names, start_date,
                                                  end_date, geom)
    except Exception as e:
        msg = 'Failed to gather candidate tables.'
        return internal_error(msg, e)

    # If there aren't any table names, it causes an error down the code. Better
    # to return and inform them that the request wouldn't have found anything.
    if not table_names:
        return bad_request("Your request doesn't return any results. Try "
                           "adjusting your time constraint or location "
                           "parameters.")

    try:
        panel = MetaTable.timeseries_all(
            table_names, agg, start_date, end_date, geom
        )
    except Exception as e:
        msg = 'Failed to construct timeseries.'
        return internal_error(msg, e)

    panel = MetaTable.attach_metadata(panel)
    resp = json_response_base(args, panel, args.data)

    datatype = args.data['data_type']
    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=unknown_object_json_handler), 200)
        resp.headers['Content-Type'] = 'application/json'

    elif datatype == 'csv':
        # response format
        # temporal_group,dataset_name_1,dataset_name_2
        # 2014-02-24 00:00:00,235,653
        # 2014-03-03 00:00:00,156,624
        fields = ['temporal_group']
        for o in resp['objects']:
            fields.append(o['dataset_name'])

        csv_resp = []
        i = 0
        for k, g in groupby(resp['objects'], key=itemgetter('dataset_name')):
            l_g = list(g)[0]

            j = 0
            for row in l_g['items']:
                # first iteration, populate the first column with temporal_groups
                if i == 0:
                    csv_resp.append([row['datetime']])
                csv_resp[j].append(row['count'])
                j += 1
            i += 1

        csv_resp.insert(0, fields)
        csv_resp = make_csv(csv_resp)
        resp = make_response(csv_resp, 200)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

    return resp
def _timeseries(args):
    meta_params = ['geom', 'dataset', 'dataset_name__in', 'obs_date__ge',
                   'obs_date__le', 'agg']
    meta_vals = [args.data.get(k) for k in meta_params]
    geom, dataset, table_names, start_date, end_date, agg = meta_vals

    ctrees = {}

    if has_tree_filters(args.data):
        # Timeseries is a little tricky. If there aren't filters,
        # it would be ridiculous to build a condition tree for every one.
        for field, value in args.data.items():
            if 'filter' in field:
                # This pattern matches the last occurrence of the '__' pattern.
                # Prevents an error that is caused by dataset names with trailing
                # underscores.
                tablename = re.split(r'__(?!_)', field)[0]
                metarecord = MetaTable.get_by_dataset_name(tablename)
                pt = metarecord.point_table
                ctrees[pt.name] = parse_tree(pt, value)
        # Just cleanliness, since we don't use this argument. Doesn't have
        # to show up in the JSON response.
        del args.data['dataset']

    # If no dataset_name__in list was provided, have to fill it in by invoking
    # MetaTable.index() here! Not in the validator. This way the list stays up
    # to date.
    if table_names is None:
        table_names = MetaTable.index()
        args.data['dataset_name__in'] = table_names

    # If a single dataset was provided, it's the only thing we need to consider.
    if dataset is not None:
        table_names = [dataset.name]
        del args.data['dataset_name__in']

    # remove table names which wouldn't return anything for the query, given
    # the time and geom constraints
    try:
        table_names = MetaTable.narrow_candidates(table_names, start_date,
                                                  end_date, geom)
    except Exception as e:
        msg = 'Failed to gather candidate tables.'
        return internal_error(msg, e)

    # If there aren't any table names, it causes an error down the code. Better
    # to return and inform them that the request wouldn't have found anything.
    if not table_names:
        return bad_request("Your request doesn't return any results. Try "
                           "adjusting your time constraint or location "
                           "parameters.")

    try:
        panel = MetaTable.timeseries_all(
            table_names, agg, start_date, end_date, geom, ctrees
        )
    except Exception as e:
        msg = 'Failed to construct timeseries.'
        return internal_error(msg, e)

    panel = MetaTable.attach_metadata(panel)
    resp = json_response_base(args, panel, args.data)

    datatype = args.data['data_type']
    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=unknown_object_json_handler), 200)
        resp.headers['Content-Type'] = 'application/json'

    elif datatype == 'csv':
        # response format
        # temporal_group,dataset_name_1,dataset_name_2
        # 2014-02-24 00:00:00,235,653
        # 2014-03-03 00:00:00,156,624
        fields = ['temporal_group']
        for o in resp['objects']:
            fields.append(o['dataset_name'])

        csv_resp = []
        i = 0
        for k, g in groupby(resp['objects'], key=itemgetter('dataset_name')):
            l_g = list(g)[0]

            j = 0
            for row in l_g['items']:
                # first iteration, populate the first column with temporal_groups
                if i == 0:
                    csv_resp.append([row['datetime']])
                csv_resp[j].append(row['count'])
                j += 1
            i += 1

        csv_resp.insert(0, fields)
        csv_resp = make_csv(csv_resp)
        resp = make_response(csv_resp, 200)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

    return resp
# ValidatorResult
# ===============
# Many methods in response.py rely on information that used to be provided
# by the old ParamValidator attributes. This namedtuple carries that same
# info around, and allows me to not have to rewrite any response code.

ValidatorResult = namedtuple('ValidatorResult', 'data errors warnings')


# converters
# ==========
# Callables which are used to convert request arguments to their correct types.

converters = {
    'agg': str,
    'buffer': int,
    'dataset': lambda x: MetaTable.get_by_dataset_name(x).point_table,
    'shapeset': lambda x: ShapeMetadata.get_by_dataset_name(x).shape_table,
    'data_type': str,
    'shape': lambda x: ShapeMetadata.get_by_dataset_name(x).shape_table,
    'dataset_name__in': lambda x: x.split(','),
    'date__time_of_day_ge': int,
    'date__time_of_day_le': int,
    'obs_date__ge': lambda x: parser.parse(x).date(),
    'obs_date__le': lambda x: parser.parse(x).date(),
    'date': lambda x: parser.parse(x).date(),
    'point_date': lambda x: parser.parse(x),
    'offset': int,
    'resolution': int,
    'geom': lambda x: make_fragment_str(extract_first_geometry_fragment(x)),
}
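# The converters mapping above is consumed by a convert() step that walks a
# request-argument dictionary and swaps string values for typed ones. The
# project's own convert() body is not shown here; the following is only a
# minimal standalone sketch of that pattern, assuming a simplified mapping
# limited to the stateless converters (no MetaTable/ShapeMetadata lookups)
# and a hypothetical raw_args dict.

from dateutil import parser


# Simplified stand-in for the full converters mapping: database-free entries only.
simple_converters = {
    'buffer': int,
    'offset': int,
    'resolution': int,
    'obs_date__ge': lambda x: parser.parse(x).date(),
    'obs_date__le': lambda x: parser.parse(x).date(),
    'dataset_name__in': lambda x: x.split(','),
}


def convert_in_place(request_args):
    """Replace string values with typed values, key by key.

    Keys without a registered converter, or values that fail to convert,
    are left untouched so a downstream validator can report them.
    """
    for key, value in list(request_args.items()):
        converter = simple_converters.get(key)
        if converter is None:
            continue
        try:
            request_args[key] = converter(value)
        except (ValueError, TypeError):
            pass  # keep the raw string


# Hypothetical query-string arguments, roughly as Flask would hand them over.
raw_args = {'buffer': '100', 'obs_date__ge': '2015-01-01',
            'dataset_name__in': 'crimes,permits'}
convert_in_place(raw_args)
# raw_args is now {'buffer': 100, 'obs_date__ge': datetime.date(2015, 1, 1),
#                  'dataset_name__in': ['crimes', 'permits']}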
def validate(validator, request_args):
    """Validate a dictionary of arguments. Substitute all missing fields with
    defaults if not explicitly told to do otherwise.

    :param validator: what kind of validator to use
    :param request_args: dictionary of arguments from a request object
    :returns: ValidatorResult namedtuple"""

    args = request_args.copy()

    # For validator dataset_name__in... need to find a better way to
    # make it play nice with the validator.
    if args.get('dataset_name__in'):
        args['dataset_name__in'] = args['dataset_name__in'].split(',')

    # This first validation step covers conditions that are dataset
    # agnostic. These are values that can apply to all datasets
    # (ex. obs_date), or concern the format of the response (ex. limit,
    # datatype, offset).

    # If there are errors, fail quickly and return.
    result = validator.load(args)
    if result.errors:
        return result

    # If all arguments are valid, fill in validator defaults.
    result = validator.dump(result.data)

    # Certain values will be dumped as strings. This conversion
    # makes them into their corresponding type. (ex. Table)
    convert(result.data)

    # Holds messages concerning unnecessary parameters. These can be either
    # junk parameters, or redundant column parameters if a tree filter was
    # used.
    warnings = []

    # At this point validation splits. We can either validate tree-style column
    # arguments or validate them individually. We don't do both.

    # Determine unchecked parameters provided in the request.
    unchecked = set(args.keys()) - set(validator.fields.keys())

    # If tree filters were provided, ignore ALL unchecked parameters that are
    # not tree filters or response format information.
    if has_tree_filters(request_args):

        for key in request_args:
            value = args[key]
            if 'filter' in key:
                # This pattern matches the last occurrence of the '__' pattern.
                # Prevents an error that is caused by dataset names with trailing
                # underscores.
                t_name = re.split(r'__(?!_)', key)[0]

                # Report a filter which specifies a non-existent tree.
                try:
                    table = MetaTable.get_by_dataset_name(t_name).point_table
                except (AttributeError, NoSuchTableError):
                    try:
                        table = ShapeMetadata.get_by_dataset_name(t_name).shape_table
                    except (AttributeError, NoSuchTableError):
                        result.errors[t_name] = "Table name {} could not be found.".format(t_name)
                        return result

                # Report a tree which causes the JSON parser to fail.
                # Or a tree whose value is not valid.
                try:
                    cond_tree = json.loads(value)
                    if valid_tree(table, cond_tree):
                        result.data[key] = cond_tree
                except (ValueError, KeyError) as err:
                    result.errors[t_name] = "Bad tree: {} -- causes error {}.".format(value, err)
                    return result

            # These keys just have to do with the formatting of the JSON response.
            # We keep these values around even if they have no effect on a condition
            # tree.
            elif key in {'geom', 'offset', 'limit', 'agg', 'obs_date__le', 'obs_date__ge'}:
                pass

            # These keys are also ones that should be passed over when searching for
            # unused params. They are used, just in different forms later on, so no need
            # to report them.
            elif key in {'shape', 'dataset_name', 'dataset_name__in'}:
                pass

            # If the key is not a filter, and not used to format JSON, report
            # that we ignored it.
            else:
                warnings.append("Unused parameter {}, you cannot specify both "
                                "column and filter arguments.".format(key))

    # If no tree filters were provided, see if any of the unchecked parameters
    # are usable as column conditions.
    else:
        try:
            table = result.data['dataset']
        except KeyError:
            table = result.data.get('shapeset')

        for param in unchecked:
            field = param.split('__')[0]
            if table is not None:
                try:
                    valid_column_condition(table, field, args[param])
                    result.data[param] = args[param]
                except KeyError:
                    warnings.append('Unused parameter value "{}={}"'.format(param, args[param]))
                    warnings.append('{} is not a valid column for {}'.format(param, table))
                except ValueError:
                    warnings.append('Unused parameter value "{}={}"'.format(param, args[param]))
                    warnings.append('{} is not a valid value for {}'.format(args[param], param))

    # ValidatorResult(dict, dict, list)
    return ValidatorResult(result.data, result.errors, warnings)
def timeseries():
    validator = TimeseriesValidator()

    deserialized_arguments = validator.load(request.args)
    serialized_arguments = json.loads(validator.dumps(deserialized_arguments.data).data)

    if deserialized_arguments.errors:
        return make_error(deserialized_arguments.errors, 400, serialized_arguments)

    qargs = deserialized_arguments.data

    agg = qargs['agg']
    data_type = qargs['data_type']
    geom = qargs['location_geom__within']
    pointset = qargs['dataset_name']
    pointsets = qargs['dataset_name__in']
    start_date = qargs['obs_date__ge']
    end_date = qargs['obs_date__le']

    ctrees = {}
    raw_ctrees = {}

    if has_tree_filters(request.args):
        # Timeseries is a little tricky. If there aren't filters,
        # it would be ridiculous to build a condition tree for every one.
        for field, value in list(request.args.items()):
            if 'filter' in field:
                # This pattern matches the last occurrence of the '__' pattern.
                # Prevents an error that is caused by dataset names with trailing
                # underscores.
                tablename = re.split(r'__(?!_)', field)[0]
                metarecord = MetaTable.get_by_dataset_name(tablename)
                pt = metarecord.point_table
                ctrees[pt.name] = parse_tree(pt, json.loads(value))
                raw_ctrees[pt.name] = json.loads(value)

    point_set_names = [p.name for p in pointsets + [pointset] if p is not None]
    if not point_set_names:
        point_set_names = MetaTable.index()

    results = MetaTable.timeseries_all(point_set_names, agg, start_date,
                                       end_date, geom, ctrees)

    payload = {
        'meta': {
            'message': [],
            'query': serialized_arguments,
            'status': 'ok',
            'total': len(results)
        },
        'objects': results
    }

    if ctrees:
        payload['meta']['query']['filters'] = raw_ctrees

    if data_type == 'json':
        return jsonify(payload)

    elif data_type == 'csv':
        # response format
        # temporal_group,dataset_name_1,dataset_name_2
        # 2014-02-24 00:00:00,235,653
        # 2014-03-03 00:00:00,156,624
        fields = ['temporal_group']
        for o in payload['objects']:
            fields.append(o['dataset_name'])

        csv_resp = []
        i = 0
        for k, g in groupby(payload['objects'], key=itemgetter('dataset_name')):
            l_g = list(g)[0]

            j = 0
            for row in l_g['items']:
                # first iteration, populate the first column with temporal_groups
                if i == 0:
                    csv_resp.append([row['datetime']])
                csv_resp[j].append(row['count'])
                j += 1
            i += 1

        csv_resp.insert(0, fields)
        csv_resp = make_csv(csv_resp)
        resp = make_response(csv_resp, 200)
        resp.headers['Content-Type'] = 'text/csv'
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Disposition'] = 'attachment; filename=%s.csv' % filedate

        return resp
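# The index-driven loops in the CSV branches above are easier to follow in
# isolation. The standalone sketch below (dataset names and counts are made-up
# sample data, not real output) shows how the grouped timeseries objects get
# pivoted into the row-per-time-bucket layout documented in the
# "response format" comment.

from itertools import groupby
from operator import itemgetter

# Hypothetical sample shaped like the `objects` list the CSV branch consumes.
objects = [
    {'dataset_name': 'crimes', 'items': [
        {'datetime': '2014-02-24 00:00:00', 'count': 235},
        {'datetime': '2014-03-03 00:00:00', 'count': 156}]},
    {'dataset_name': 'permits', 'items': [
        {'datetime': '2014-02-24 00:00:00', 'count': 653},
        {'datetime': '2014-03-03 00:00:00', 'count': 624}]},
]

# Header row: one 'temporal_group' column plus one column per dataset.
fields = ['temporal_group'] + [o['dataset_name'] for o in objects]

rows = []
for i, (_, group) in enumerate(groupby(objects, key=itemgetter('dataset_name'))):
    series = list(group)[0]
    for j, row in enumerate(series['items']):
        if i == 0:                    # first dataset: seed one row per time bucket
            rows.append([row['datetime']])
        rows[j].append(row['count'])  # later datasets: append their counts column-wise

rows.insert(0, fields)
# rows == [['temporal_group', 'crimes', 'permits'],
#          ['2014-02-24 00:00:00', 235, 653],
#          ['2014-03-03 00:00:00', 156, 624]]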