def point_meta_from_submit_form(form, is_approved):
    columns, labels = form_columns(form)
    name = slugify(form['dataset_name'], delimiter='_')[:50]

    metatable = MetaTable(
        url=form['file_url'],
        view_url=form.get('view_url'),
        dataset_name=name,
        human_name=form['dataset_name'],
        attribution=form.get('dataset_attribution'),
        description=form.get('dataset_description'),
        update_freq=form['update_frequency'],
        contributor_name=form['contributor_name'],
        contributor_organization=form.get('contributor_organization'),
        contributor_email=form['contributor_email'],
        approved_status=is_approved,
        observed_date=labels['observed_date'],
        latitude=labels.get('latitude', None),
        longitude=labels.get('longitude', None),
        location=labels.get('location', None),
        column_names=columns)

    postgres_session.add(metatable)
    postgres_session.commit()

    return metatable
def point_meta_from_submit_form(form, is_approved):
    column_names, labels = form_columns(form)
    name = slugify(form['dataset_name'], delim=u'_')[:50]

    md = MetaTable(
        url=form['file_url'],
        view_url=form.get('view_url'),
        dataset_name=name,
        human_name=form['dataset_name'],
        attribution=form.get('dataset_attribution'),
        description=form.get('dataset_description'),
        update_freq=form['update_frequency'],
        contributor_name=form['contributor_name'],
        contributor_organization=form.get('contributor_organization'),
        contributor_email=form['contributor_email'],
        approved_status=is_approved,
        observed_date=labels['observed_date'],
        latitude=labels.get('latitude', None),
        longitude=labels.get('longitude', None),
        location=labels.get('location', None),
        column_names=column_names
    )

    session.add(md)
    session.commit()

    return md
def add_dataset_to_metatable(request, url, dataset_id, dataset_info, socrata_source, approved_status):
    data_types = []
    business_key = None
    observed_date = None
    latitude = None
    longitude = None
    location = None

    # pull the contributed data types and key-field assignments out of the form
    for k, v in request.form.iteritems():
        if k.startswith('data_type_'):
            key = k.replace("data_type_", "")
            data_types.append({"field_name": key, "data_type": v})
        if k.startswith('key_type_'):
            key = k.replace("key_type_", "")
            if v == "business_key":
                business_key = key
            if v == "observed_date":
                observed_date = key
            if v == "latitude":
                latitude = key
            if v == "longitude":
                longitude = key
            if v == "location":
                location = key

    if socrata_source:
        data_types = dataset_info['columns']
        url = dataset_info['source_url']

    d = {
        'dataset_name': slugify(request.form.get('dataset_name'), delim=u'_')[:50],
        'human_name': request.form.get('dataset_name'),
        'attribution': request.form.get('dataset_attribution'),
        'description': request.form.get('dataset_description'),
        'source_url': url,
        'source_url_hash': dataset_id,
        'update_freq': request.form.get('update_frequency'),
        'business_key': business_key,
        'observed_date': observed_date,
        'latitude': latitude,
        'longitude': longitude,
        'location': location,
        'contributor_name': request.form.get('contributor_name'),
        'contributor_organization': request.form.get('contributor_organization'),
        'contributor_email': request.form.get('contributor_email'),
        'contributed_data_types': json.dumps(data_types),
        'approved_status': approved_status,
        'is_socrata_source': socrata_source
    }

    # add this to meta_master
    md = MetaTable(**d)
    session.add(md)
    session.commit()
    return md
def detail():
    raw_query_params = request.args.copy()

    # if no obs_date given, default to >= 30 days ago
    obs_dates = [i for i in raw_query_params.keys() if i.startswith('obs_date')]
    if not obs_dates:
        thirty_days_ago = datetime.now() - timedelta(days=30)
        raw_query_params['obs_date__ge'] = thirty_days_ago.strftime('%Y-%m-%d')

    include_weather = False
    if raw_query_params.get('weather') is not None:
        include_weather = raw_query_params['weather']
        del raw_query_params['weather']

    agg, datatype, queries = parse_join_query(raw_query_params)
    order_by = raw_query_params.get('order_by')
    offset = raw_query_params.get('offset')

    mt = MasterTable.__table__
    valid_query, base_clauses, resp, status_code = make_query(mt, queries['base'])

    if not raw_query_params.get('dataset_name'):
        valid_query = False
        resp['meta'] = {
            'status': 'error',
            'message': "'dataset_name' is required"
        }
        resp['objects'] = []

    if valid_query:
        resp['meta']['status'] = 'ok'
        dname = raw_query_params['dataset_name']
        dataset = Table('dat_%s' % dname, Base.metadata,
                        autoload=True, autoload_with=engine,
                        extend_existing=True)
        dataset_fields = dataset.columns.keys()
        base_query = session.query(mt, dataset)

        if include_weather:
            # pick hourly or daily weather observations based on the type of
            # the dataset's observed date column
            date_col_name = 'date'
            try:
                date_col_name = slugify(session.query(MetaTable)
                                        .filter(MetaTable.dataset_name == dname)
                                        .first().observed_date)
            except AttributeError:
                pass
            date_col_type = str(getattr(dataset.c, date_col_name).type).lower()
            if 'timestamp' in date_col_type:
                weather_tname = 'hourly'
            else:
                weather_tname = 'daily'
            weather_table = Table('dat_weather_observations_%s' % weather_tname,
                                  Base.metadata,
                                  autoload=True, autoload_with=engine,
                                  extend_existing=True)
            weather_fields = weather_table.columns.keys()
            base_query = session.query(mt, dataset, weather_table)

        valid_query, detail_clauses, resp, status_code = make_query(dataset, queries['detail'])
        if valid_query:
            resp['meta']['status'] = 'ok'
            pk = [p.name for p in dataset.primary_key][0]
            base_query = base_query.join(dataset, mt.c.dataset_row_id == dataset.c[pk])
            for clause in base_clauses:
                base_query = base_query.filter(clause)
            for clause in detail_clauses:
                base_query = base_query.filter(clause)

        if include_weather:
            w_q = {}
            if queries['weather']:
                for k, v in queries['weather'].items():
                    try:
                        fname, operator = k.split('__')
                    except ValueError:
                        # no operator suffix: use the raw field name and default to equality
                        fname = k
                        operator = 'eq'
                    t_fname = WEATHER_COL_LOOKUP[weather_tname].get(fname, fname)
                    w_q['__'.join([t_fname, operator])] = v
            valid_query, weather_clauses, resp, status_code = make_query(weather_table, w_q)
            if valid_query:
                resp['meta']['status'] = 'ok'
                base_query = base_query.join(weather_table,
                                             mt.c.weather_observation_id == weather_table.c.id)
                for clause in weather_clauses:
                    base_query = base_query.filter(clause)

        if valid_query:
            if order_by:
                col, order = order_by.split(',')
                base_query = base_query.order_by(getattr(mt.c[col], order)())
            else:
                base_query = base_query.order_by(mt.c.master_row_id.asc())
            base_query = base_query.limit(RESPONSE_LIMIT)
            if offset:
                base_query = base_query.offset(int(offset))
            values = [r for r in base_query.all()]
            for value in values:
                d = {f: getattr(value, f) for f in dataset_fields}
                if include_weather:
                    d = {
                        'observation': {f: getattr(value, f) for f in dataset_fields},
                        'weather': {f: getattr(value, f) for f in weather_fields},
                    }
                resp['objects'].append(d)
            resp['meta']['query'] = raw_query_params
            loc = resp['meta']['query'].get('location_geom__within')
            if loc:
                resp['meta']['query']['location_geom__within'] = json.loads(loc)
            resp['meta']['total'] = len(resp['objects'])

    # serialize the response as JSON or CSV
    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=dthandler), status_code)
        resp.headers['Content-Type'] = 'application/json'
    elif datatype == 'csv':
        csv_resp = [dataset_fields]
        if include_weather:
            csv_resp = [dataset_fields + weather_fields]
        for value in values:
            d = [getattr(value, f) for f in dataset_fields]
            if include_weather:
                d.extend([getattr(value, f) for f in weather_fields])
            csv_resp.append(d)
        resp = make_response(make_csv(csv_resp), 200)
        dname = raw_query_params['dataset_name']
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Type'] = 'text/csv'
        resp.headers['Content-Disposition'] = 'attachment; filename=%s_%s.csv' % (dname, filedate)
    return resp
def submit_dataset():
    # Slightly dumb way to make sure that POSTs are only coming from
    # originating domain for the time being
    referer = request.headers.get('Referer')
    if referer:
        referer = urlparse(referer).netloc
        req_url = urlparse(request.url).netloc
        if referer != req_url:
            abort(401)
    else:
        abort(401)

    resp = {'status': 'ok', 'message': ''}
    status_code = 200
    errors = []

    post = request.form.get('data')
    if not post:
        try:
            post = request.form.keys()[0]
        except IndexError:
            resp['status'] = 'error'
            resp['message'] = 'Unable to decode POST data'
            status_code = 400

    if status_code == 200:
        post = json.loads(post)
        if post.get('view_url'):
            if post.get('socrata'):
                source_domain = urlparse(post['view_url']).netloc
                four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})',
                                          post['view_url'])[-1]
                view_url = 'http://%s/api/views/%s' % (source_domain, four_by_four)
                dataset_info, errors, status_code = get_socrata_data_info(view_url)
                source_url = '%s/rows.csv?accessType=DOWNLOAD' % view_url
            else:
                dataset_info = {
                    'attribution': '',
                    'description': '',
                }
                source_url = post['view_url']
                dataset_info['name'] = urlparse(source_url).path.split('/')[-1]
            if errors:
                resp['message'] = ', '.join([e for e in errors])
                resp['status'] = 'error'
                status_code = 400
            else:
                dataset_id = md5(source_url).hexdigest()
                md = session.query(MetaTable).get(dataset_id)
                if not md:
                    d = {
                        'dataset_name': slugify(dataset_info['name'], delim=u'_'),
                        'human_name': dataset_info['name'],
                        'attribution': dataset_info['attribution'],
                        'description': dataset_info['description'],
                        'source_url': source_url,
                        'source_url_hash': dataset_id,
                        'update_freq': post['update_frequency'],
                        'business_key': post['field_definitions']['id_field'],
                        'observed_date': post['field_definitions']['date_field'],
                        'latitude': post['field_definitions'].get('latitude'),
                        'longitude': post['field_definitions'].get('longitude'),
                        'location': post['field_definitions'].get('location')
                    }
                    if len(d['dataset_name']) > 49:
                        d['dataset_name'] = d['dataset_name'][:50]
                    md = MetaTable(**d)
                    session.add(md)
                    session.commit()
                add_dataset.delay(md.source_url_hash,
                                  data_types=post.get('data_types'))
                resp['message'] = 'Dataset %s submitted successfully' % dataset_info['name']
        else:
            resp['status'] = 'error'
            resp['message'] = 'Must provide a url where data can be downloaded'
            status_code = 400

    resp = make_response(json.dumps(resp, default=dthandler), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
def curried_slug(name):
    if name is None:
        return None
    else:
        return slugify(unicode(name), delim=u'_')
def test_slugify(self):
    from plenario.utils.helpers import slugify
    self.assertEqual(slugify("A-Awef-Basdf-123"), "a_awef_basdf_123")
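# For reference, a minimal sketch of the slugify helper these call sites and the
# test above assume. This is a hypothetical implementation, not plenario's actual
# code: lowercase the input and collapse runs of non-alphanumeric characters into
# the delimiter, so "A-Awef-Basdf-123" becomes "a_awef_basdf_123". Note the older
# snippets pass the keyword as delim=u'_' and the newer ones as delimiter='_';
# the sketch uses the newer spelling.
import re

def slugify_sketch(value, delimiter='_'):
    # Assumption: underscore is the default delimiter, matching test_slugify above.
    value = str(value).strip().lower()
    return re.sub(r'[^a-z0-9]+', delimiter, value).strip(delimiter)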
from sqlalchemy import Table, MetaData
from sqlalchemy.sql.sqltypes import DATE
from plenario.database import session, app_engine
from plenario.models import MetaTable
from sqlalchemy.exc import NoSuchTableError
from plenario.utils.helpers import slugify
from plenario.utils.etl import PlenarioETL
import traceback

if __name__ == "__main__":
    them = session.query(MetaTable).all()
    meta = MetaData()
    for t in them:
        try:
            table = Table('dat_{0}'.format(t.dataset_name), meta,
                          autoload=True, autoload_with=app_engine,
                          keep_existing=True)
            try:
                date_col = getattr(table.c, slugify(t.observed_date))
                if type(date_col.type) == DATE:
                    e = PlenarioETL(t.as_dict())
                    e._get_or_create_data_table()
                    e._add_weather_info()
                    print 'added weather for {0}'.format(t.dataset_name)
            except AttributeError, e:
                raise e
                print 'no col {0}'.format(t.observed_date)
                pass
        except NoSuchTableError:
            print 'no table {0}'.format(t.dataset_name)
            pass
def curried_slug(name):
    if name is None:
        return None
    else:
        return slugify(str(name), delimiter='_')
def make_table_name(cls, human_name):
    return slugify(human_name)
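# Hypothetical usage of make_table_name, assuming it is a classmethod on MetaTable
# (the snippet alone does not show which class it belongs to). The slug it returns
# is what the detail() view and the weather backfill script above prefix with
# 'dat_' to locate a dataset's physical table, e.g. with a slugify like the sketch
# shown earlier:
#
#     name = MetaTable.make_table_name('Crimes 2001 to present')  # 'crimes_2001_to_present'
#     table_name = 'dat_{0}'.format(name)                         # 'dat_crimes_2001_to_present'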