Example #1
def point_meta_from_submit_form(form, is_approved):
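    # Build a MetaTable record from the submitted form: derive an
    # underscore-delimited table name from the human-readable dataset name
    # (truncated to 50 characters), copy the source and contributor fields,
    # then persist the new row.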
    columns, labels = form_columns(form)
    name = slugify(form['dataset_name'], delimiter='_')[:50]

    metatable = MetaTable(
        url=form['file_url'],
        view_url=form.get('view_url'),
        dataset_name=name,
        human_name=form['dataset_name'],
        attribution=form.get('dataset_attribution'),
        description=form.get('dataset_description'),
        update_freq=form['update_frequency'],
        contributor_name=form['contributor_name'],
        contributor_organization=form.get('contributor_organization'),
        contributor_email=form['contributor_email'],
        approved_status=is_approved,
        observed_date=labels['observed_date'],
        latitude=labels.get('latitude', None),
        longitude=labels.get('longitude', None),
        location=labels.get('location', None),
        column_names=columns)

    postgres_session.add(metatable)
    postgres_session.commit()
    return metatable
Example #2
def point_meta_from_submit_form(form, is_approved):
    column_names, labels = form_columns(form)
    name = slugify(form['dataset_name'], delim=u'_')[:50]

    md = MetaTable(
        url=form['file_url'],
        view_url=form.get('view_url'),
        dataset_name=name,
        human_name=form['dataset_name'],
        attribution=form.get('dataset_attribution'),
        description=form.get('dataset_description'),
        update_freq=form['update_frequency'],
        contributor_name=form['contributor_name'],
        contributor_organization=form.get('contributor_organization'),
        contributor_email=form['contributor_email'],
        approved_status=is_approved,
        observed_date=labels['observed_date'],
        latitude=labels.get('latitude', None),
        longitude=labels.get('longitude', None),
        location=labels.get('location', None),
        column_names=column_names
    )
    session.add(md)
    session.commit()
    return md
Example #3
def add_dataset_to_metatable(request, url, dataset_id, dataset_info,
                             socrata_source, approved_status):
    data_types = []
    business_key = None
    observed_date = None
    latitude = None
    longitude = None
    location = None
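
    # The submit form encodes per-column metadata in its field names:
    # 'data_type_<column>' carries the column's declared type, while
    # 'key_type_<column>' marks the column that plays each special role
    # (business key, observed date, latitude, longitude, location).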
    for k, v in request.form.iteritems():
        if k.startswith('data_type_'):
            key = k.replace("data_type_", "")
            data_types.append({"field_name": key, "data_type": v})

        if k.startswith('key_type_'):
            key = k.replace("key_type_", "")
            if (v == "business_key"): business_key = key
            if (v == "observed_date"): observed_date = key
            if (v == "latitude"): latitude = key
            if (v == "longitude"): longitude = key
            if (v == "location"): location = key

    if socrata_source:
        data_types = dataset_info['columns']
        url = dataset_info['source_url']

    d = {
        'dataset_name': slugify(request.form.get('dataset_name'),
                                delim=u'_')[:50],
        'human_name': request.form.get('dataset_name'),
        'attribution': request.form.get('dataset_attribution'),
        'description': request.form.get('dataset_description'),
        'source_url': url,
        'source_url_hash': dataset_id,
        'update_freq': request.form.get('update_frequency'),
        'business_key': business_key,
        'observed_date': observed_date,
        'latitude': latitude,
        'longitude': longitude,
        'location': location,
        'contributor_name': request.form.get('contributor_name'),
        'contributor_organization':
        request.form.get('contributor_organization'),
        'contributor_email': request.form.get('contributor_email'),
        'contributed_data_types': json.dumps(data_types),
        'approved_status': approved_status,
        'is_socrata_source': socrata_source
    }

    # add this to meta_master
    md = MetaTable(**d)
    session.add(md)
    session.commit()

    return md
Example #4
def add_dataset_to_metatable(request, url, dataset_id, dataset_info, socrata_source, approved_status):
    data_types = []
    business_key = None
    observed_date = None
    latitude = None
    longitude = None
    location = None
    for k, v in request.form.iteritems():
        if k.startswith('data_type_'):
            key = k.replace("data_type_", "")
            data_types.append({"field_name": key, "data_type": v})

        if k.startswith('key_type_'):
            key = k.replace("key_type_", "")
            if (v == "business_key"): business_key = key
            if (v == "observed_date"): observed_date = key
            if (v == "latitude"): latitude = key
            if (v == "longitude"): longitude = key
            if (v == "location"): location = key

    if socrata_source:
        data_types = dataset_info['columns']
        url = dataset_info['source_url']

    d = {
        'dataset_name': slugify(request.form.get('dataset_name'), delim=u'_')[:50],
        'human_name': request.form.get('dataset_name'),
        'attribution': request.form.get('dataset_attribution'),
        'description': request.form.get('dataset_description'),
        'source_url': url,
        'source_url_hash': dataset_id,
        'update_freq': request.form.get('update_frequency'),
        'business_key': business_key,
        'observed_date': observed_date,
        'latitude': latitude,
        'longitude': longitude,
        'location': location,
        'contributor_name': request.form.get('contributor_name'),
        'contributor_organization': request.form.get('contributor_organization'),
        'contributor_email': request.form.get('contributor_email'),
        'contributed_data_types': json.dumps(data_types),
        'approved_status': approved_status,
        'is_socrata_source': socrata_source
    }

    # add this to meta_master
    md = MetaTable(**d)
    session.add(md)
    session.commit()

    return md
Example #5
def detail():
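    # Detail endpoint: filter the rows of a single dataset (joined through
    # the master table), optionally join weather observations, and return
    # the matching records as JSON or CSV.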
    raw_query_params = request.args.copy()
    # if no obs_date given, default to >= 30 days ago
    obs_dates = [
        i for i in raw_query_params.keys() if i.startswith('obs_date')
    ]
    if not obs_dates:
        thirty_days_ago = datetime.now() - timedelta(days=30)
        raw_query_params['obs_date__ge'] = thirty_days_ago.strftime('%Y-%m-%d')

    include_weather = False
    if raw_query_params.get('weather') is not None:
        include_weather = raw_query_params['weather']
        del raw_query_params['weather']
    agg, datatype, queries = parse_join_query(raw_query_params)
    order_by = raw_query_params.get('order_by')
    offset = raw_query_params.get('offset')
    mt = MasterTable.__table__
    valid_query, base_clauses, resp, status_code = make_query(
        mt, queries['base'])
    if not raw_query_params.get('dataset_name'):
        valid_query = False
        resp['meta'] = {
            'status': 'error',
            'message': "'dataset_name' is required"
        }
        resp['objects'] = []
    if valid_query:
        resp['meta']['status'] = 'ok'
        dname = raw_query_params['dataset_name']
        dataset = Table('dat_%s' % dname,
                        Base.metadata,
                        autoload=True,
                        autoload_with=engine,
                        extend_existing=True)
        dataset_fields = dataset.columns.keys()
        base_query = session.query(mt, dataset)
        if include_weather:
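            # Choose the weather granularity from the dataset's observed-date
            # column: a timestamp-typed column joins against hourly
            # observations, anything else against daily ones.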
            date_col_name = 'date'
            try:
                date_col_name = slugify(session.query(MetaTable)\
                    .filter(MetaTable.dataset_name == dname)\
                    .first().observed_date)
            except AttributeError:
                pass
            date_col_type = str(getattr(dataset.c, date_col_name).type).lower()
            if 'timestamp' in date_col_type:
                weather_tname = 'hourly'
            else:
                weather_tname = 'daily'
            weather_table = Table('dat_weather_observations_%s' %
                                  weather_tname,
                                  Base.metadata,
                                  autoload=True,
                                  autoload_with=engine,
                                  extend_existing=True)
            weather_fields = weather_table.columns.keys()
            base_query = session.query(mt, dataset, weather_table)
        valid_query, detail_clauses, resp, status_code = make_query(
            dataset, queries['detail'])
        if valid_query:
            resp['meta']['status'] = 'ok'
            pk = [p.name for p in dataset.primary_key][0]
            base_query = base_query.join(dataset,
                                         mt.c.dataset_row_id == dataset.c[pk])
            for clause in base_clauses:
                base_query = base_query.filter(clause)
            for clause in detail_clauses:
                base_query = base_query.filter(clause)
            if include_weather:
                w_q = {}
                if queries['weather']:
                    for k, v in queries['weather'].items():
                        try:
                            fname, operator = k.split('__')
                        except ValueError:
                            # no '__' in the key: use the whole key as the
                            # field name and default to an equality filter
                            fname, operator = k, 'eq'
                        t_fname = WEATHER_COL_LOOKUP[weather_tname].get(
                            fname, fname)
                        w_q['__'.join([t_fname, operator])] = v
                valid_query, weather_clauses, resp, status_code = make_query(
                    weather_table, w_q)
                if valid_query:
                    resp['meta']['status'] = 'ok'
                    base_query = base_query.join(
                        weather_table,
                        mt.c.weather_observation_id == weather_table.c.id)
                    for clause in weather_clauses:
                        base_query = base_query.filter(clause)
            if valid_query:
                if order_by:
                    col, order = order_by.split(',')
                    base_query = base_query.order_by(
                        getattr(mt.c[col], order)())
                else:
                    base_query = base_query.order_by(mt.c.master_row_id.asc())
                base_query = base_query.limit(RESPONSE_LIMIT)
                if offset:
                    base_query = base_query.offset(int(offset))
                values = [r for r in base_query.all()]
                for value in values:
                    d = {f: getattr(value, f) for f in dataset_fields}
                    if include_weather:
                        d = {
                            'observation':
                            {f: getattr(value, f)
                             for f in dataset_fields},
                            'weather':
                            {f: getattr(value, f)
                             for f in weather_fields},
                        }
                    resp['objects'].append(d)
                resp['meta']['query'] = raw_query_params
                loc = resp['meta']['query'].get('location_geom__within')
                if loc:
                    resp['meta']['query'][
                        'location_geom__within'] = json.loads(loc)
                resp['meta']['total'] = len(resp['objects'])
    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=dthandler), status_code)
        resp.headers['Content-Type'] = 'application/json'
    elif datatype == 'csv':
        csv_resp = [dataset_fields]
        if include_weather:
            csv_resp = [dataset_fields + weather_fields]
        for value in values:
            d = [getattr(value, f) for f in dataset_fields]
            if include_weather:
                d.extend([getattr(value, f) for f in weather_fields])
            csv_resp.append(d)
        resp = make_response(make_csv(csv_resp), 200)
        dname = raw_query_params['dataset_name']
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Type'] = 'text/csv'
        resp.headers[
            'Content-Disposition'] = 'attachment; filename=%s_%s.csv' % (
                dname, filedate)
    return resp
Example #6
def submit_dataset():
    # Slightly dumb way to make sure that POSTs are only coming from
    # originating domain for the time being
    referer = request.headers.get('Referer')
    if referer:
        referer = urlparse(referer).netloc
        req_url = urlparse(request.url).netloc
        if referer != req_url:
            abort(401)
    else:
        abort(401)
    resp = {'status': 'ok', 'message': ''}
    status_code = 200
    errors = []
    post = request.form.get('data')
    if not post:
        try:
            post = request.form.keys()[0]
        except IndexError:
            resp['status'] = 'error'
            resp['message'] = 'Unable to decode POST data'
            status_code = 400
    if status_code == 200:
        post = json.loads(post)
        if post.get('view_url'):
            if post.get('socrata'):
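                # Socrata datasets are identified by the 'four-by-four' id
                # embedded in the view URL; rebuild the canonical API view
                # URL and the CSV download URL from it.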
                source_domain = urlparse(post['view_url']).netloc
                four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})', post['view_url'])[-1]
                view_url = 'http://%s/api/views/%s' % (source_domain, four_by_four)
                dataset_info, errors, status_code = get_socrata_data_info(view_url)
                source_url = '%s/rows.csv?accessType=DOWNLOAD' % view_url
            else:
                dataset_info = {
                    'attribution': '',
                    'description': '',
                }
                source_url = post['view_url']
                dataset_info['name'] = urlparse(source_url).path.split('/')[-1]
            if errors:
                resp['message'] = ', '.join([e for e in errors])
                resp['status'] = 'error'
                status_code = 400
            else:
                dataset_id = md5(source_url).hexdigest()
                md = session.query(MetaTable).get(dataset_id)
                if not md:
                    d = {
                        'dataset_name': slugify(dataset_info['name'], delim=u'_'),
                        'human_name': dataset_info['name'],
                        'attribution': dataset_info['attribution'],
                        'description': dataset_info['description'],
                        'source_url': source_url,
                        'source_url_hash': dataset_id,
                        'update_freq': post['update_frequency'],
                        'business_key': post['field_definitions']['id_field'],
                        'observed_date': post['field_definitions']['date_field'],
                        'latitude': post['field_definitions'].get('latitude'),
                        'longitude': post['field_definitions'].get('longitude'),
                        'location': post['field_definitions'].get('location')
                    }
                    if len(d['dataset_name']) > 49:
                        d['dataset_name'] = d['dataset_name'][:50]
                    md = MetaTable(**d)
                    session.add(md)
                    session.commit()
                add_dataset.delay(md.source_url_hash, data_types=post.get('data_types'))
                resp['message'] = 'Dataset %s submitted successfully' % dataset_info['name']
        else:
            resp['status'] = 'error'
            resp['message'] = 'Must provide a url where data can be downloaded'
            status_code = 400
    resp = make_response(json.dumps(resp, default=dthandler), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #7
def detail():
    raw_query_params = request.args.copy()
    # if no obs_date given, default to >= 30 days ago
    obs_dates = [i for i in raw_query_params.keys() if i.startswith('obs_date')]
    if not obs_dates:
        thirty_days_ago = datetime.now() - timedelta(days=30)
        raw_query_params['obs_date__ge'] = thirty_days_ago.strftime('%Y-%m-%d')
    
    include_weather = False
    if raw_query_params.get('weather') is not None:
        include_weather = raw_query_params['weather']
        del raw_query_params['weather']
    agg, datatype, queries = parse_join_query(raw_query_params)
    order_by = raw_query_params.get('order_by')
    offset = raw_query_params.get('offset')
    mt = MasterTable.__table__
    valid_query, base_clauses, resp, status_code = make_query(mt, queries['base'])
    if not raw_query_params.get('dataset_name'):
        valid_query = False
        resp['meta'] = {
            'status': 'error',
            'message': "'dataset_name' is required"
        }
        resp['objects'] = []
    if valid_query:
        resp['meta']['status'] = 'ok'
        dname = raw_query_params['dataset_name']
        dataset = Table('dat_%s' % dname, Base.metadata,
            autoload=True, autoload_with=engine,
            extend_existing=True)
        dataset_fields = dataset.columns.keys()
        base_query = session.query(mt, dataset)
        if include_weather:
            date_col_name = 'date'
            try:
                date_col_name = slugify(session.query(MetaTable)\
                    .filter(MetaTable.dataset_name == dname)\
                    .first().observed_date)
            except AttributeError:
                pass
            date_col_type = str(getattr(dataset.c, date_col_name).type).lower()
            if 'timestamp' in date_col_type:
                weather_tname = 'hourly'
            else:
                weather_tname = 'daily'
            weather_table = Table('dat_weather_observations_%s' % weather_tname, Base.metadata, 
                autoload=True, autoload_with=engine, extend_existing=True)
            weather_fields = weather_table.columns.keys()
            base_query = session.query(mt, dataset, weather_table)
        valid_query, detail_clauses, resp, status_code = make_query(dataset, queries['detail'])
        if valid_query:
            resp['meta']['status'] = 'ok'
            pk = [p.name for p in dataset.primary_key][0]
            base_query = base_query.join(dataset, mt.c.dataset_row_id == dataset.c[pk])
            for clause in base_clauses:
                base_query = base_query.filter(clause)
            for clause in detail_clauses:
                base_query = base_query.filter(clause)
            if include_weather:
                w_q = {}
                if queries['weather']:
                    for k,v in queries['weather'].items():
                        try:
                            fname, operator = k.split('__')
                        except ValueError:
                            # no '__' in the key: use the whole key as the
                            # field name and default to an equality filter
                            fname, operator = k, 'eq'
                        t_fname = WEATHER_COL_LOOKUP[weather_tname].get(fname, fname)
                        w_q['__'.join([t_fname, operator])] = v
                valid_query, weather_clauses, resp, status_code = make_query(weather_table, w_q)
                if valid_query:
                    resp['meta']['status'] = 'ok'
                    base_query = base_query.join(weather_table, mt.c.weather_observation_id == weather_table.c.id)
                    for clause in weather_clauses:
                        base_query = base_query.filter(clause)
            if valid_query:
                if order_by:
                    col, order = order_by.split(',')
                    base_query = base_query.order_by(getattr(mt.c[col], order)())
                else:
                    base_query = base_query.order_by(mt.c.master_row_id.asc())
                base_query = base_query.limit(RESPONSE_LIMIT)
                if offset:
                    base_query = base_query.offset(int(offset))
                values = [r for r in base_query.all()]
                for value in values:
                    d = {f:getattr(value, f) for f in dataset_fields}
                    if include_weather:
                        d = {
                            'observation': {f:getattr(value, f) for f in dataset_fields},
                            'weather': {f:getattr(value, f) for f in weather_fields},
                        }
                    resp['objects'].append(d)
                resp['meta']['query'] = raw_query_params
                loc = resp['meta']['query'].get('location_geom__within')
                if loc:
                    resp['meta']['query']['location_geom__within'] = json.loads(loc)
                resp['meta']['total'] = len(resp['objects'])
    if datatype == 'json':
        resp = make_response(json.dumps(resp, default=dthandler), status_code)
        resp.headers['Content-Type'] = 'application/json'
    elif datatype == 'csv':
        csv_resp = [dataset_fields]
        if include_weather:
            csv_resp = [dataset_fields + weather_fields]
        for value in values:
            d = [getattr(value, f) for f in dataset_fields]
            if include_weather:
                d.extend([getattr(value, f) for f in weather_fields])
            csv_resp.append(d)
        resp = make_response(make_csv(csv_resp), 200)
        dname = raw_query_params['dataset_name']
        filedate = datetime.now().strftime('%Y-%m-%d')
        resp.headers['Content-Type'] = 'text/csv'
        resp.headers['Content-Disposition'] = 'attachment; filename=%s_%s.csv' % (dname, filedate)
    return resp
Example #8
def curried_slug(name):
    if name is None:
        return None
    else:
        return slugify(unicode(name), delim=u'_')

def test_slugify(self):
    from plenario.utils.helpers import slugify
    self.assertEqual(slugify("A-Awef-Basdf-123"), "a_awef_basdf_123")
Example #10
from sqlalchemy import Table, MetaData
from sqlalchemy.sql.sqltypes import DATE
from plenario.database import session, app_engine
from plenario.models import MetaTable
from sqlalchemy.exc import NoSuchTableError
from plenario.utils.helpers import slugify
from plenario.utils.etl import PlenarioETL
import traceback

if __name__ == "__main__":
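    # Maintenance script: for every registered dataset whose observed-date
    # column is a plain DATE, re-run the ETL steps that attach weather
    # observations to its rows.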
    them = session.query(MetaTable).all()
    meta = MetaData()
    for t in them:
        try:
            table = Table('dat_{0}'.format(t.dataset_name), meta, 
                autoload=True, autoload_with=app_engine, keep_existing=True)
            try:
                date_col = getattr(table.c, slugify(t.observed_date))
                if type(date_col.type) == DATE:
                    e = PlenarioETL(t.as_dict())
                    e._get_or_create_data_table()
                    e._add_weather_info()
                    print 'added weather for {0}'.format(t.dataset_name)
            except AttributeError, e:
                raise e
                print 'no col {0}'.format(t.observed_date)
                pass
        except NoSuchTableError:
            print 'no table {0}'.format(t.dataset_name) 
            pass
Example #11
def curried_slug(name):
    if name is None:
        return None
    else:
        return slugify(unicode(name), delim=u'_')
Example #12
def submit_dataset():
    # Slightly dumb way to make sure that POSTs are only coming from
    # originating domain for the time being
    referer = request.headers.get('Referer')
    if referer:
        referer = urlparse(referer).netloc
        req_url = urlparse(request.url).netloc
        if referer != req_url:
            abort(401)
    else:
        abort(401)
    resp = {'status': 'ok', 'message': ''}
    status_code = 200
    errors = []
    post = request.form.get('data')
    if not post:
        try:
            post = request.form.keys()[0]
        except IndexError:
            resp['status'] = 'error'
            resp['message'] = 'Unable to decode POST data'
            status_code = 400
    if status_code == 200:
        post = json.loads(post)
        if post.get('view_url'):
            if post.get('socrata'):
                source_domain = urlparse(post['view_url']).netloc
                four_by_four = re.findall(r'/([a-z0-9]{4}-[a-z0-9]{4})',
                                          post['view_url'])[-1]
                view_url = 'http://%s/api/views/%s' % (source_domain,
                                                       four_by_four)
                dataset_info, errors, status_code = get_socrata_data_info(
                    view_url)
                source_url = '%s/rows.csv?accessType=DOWNLOAD' % view_url
            else:
                dataset_info = {
                    'attribution': '',
                    'description': '',
                }
                source_url = post['view_url']
                dataset_info['name'] = urlparse(source_url).path.split('/')[-1]
            if errors:
                resp['message'] = ', '.join([e for e in errors])
                resp['status'] = 'error'
                status_code = 400
            else:
                dataset_id = md5(source_url).hexdigest()
                md = session.query(MetaTable).get(dataset_id)
                if not md:
                    d = {
                        'dataset_name': slugify(dataset_info['name'],
                                                delim=u'_'),
                        'human_name': dataset_info['name'],
                        'attribution': dataset_info['attribution'],
                        'description': dataset_info['description'],
                        'source_url': source_url,
                        'source_url_hash': dataset_id,
                        'update_freq': post['update_frequency'],
                        'business_key': post['field_definitions']['id_field'],
                        'observed_date':
                        post['field_definitions']['date_field'],
                        'latitude': post['field_definitions'].get('latitude'),
                        'longitude':
                        post['field_definitions'].get('longitude'),
                        'location': post['field_definitions'].get('location')
                    }
                    if len(d['dataset_name']) > 49:
                        d['dataset_name'] = d['dataset_name'][:50]
                    md = MetaTable(**d)
                    session.add(md)
                    session.commit()
                add_dataset.delay(md.source_url_hash,
                                  data_types=post.get('data_types'))
                resp['message'] = ('Dataset %s submitted successfully' %
                                   dataset_info['name'])
        else:
            resp['status'] = 'error'
            resp['message'] = 'Must provide a url where data can be downloaded'
            status_code = 400
    resp = make_response(json.dumps(resp, default=dthandler), status_code)
    resp.headers['Content-Type'] = 'application/json'
    return resp
Example #13
def test_slugify(self):
    from plenario.utils.helpers import slugify
    self.assertEqual(slugify("A-Awef-Basdf-123"), "a_awef_basdf_123")
Example #14
def curried_slug(name):
    if name is None:
        return None
    else:
        return slugify(str(name), delimiter='_')
Example #15
def make_table_name(cls, human_name):
    return slugify(human_name)
Example #17
from sqlalchemy import Table, MetaData
from sqlalchemy.sql.sqltypes import DATE
from plenario.database import session, app_engine
from plenario.models import MetaTable
from sqlalchemy.exc import NoSuchTableError
from plenario.utils.helpers import slugify
from plenario.utils.etl import PlenarioETL
import traceback

if __name__ == "__main__":
    them = session.query(MetaTable).all()
    meta = MetaData()
    for t in them:
        try:
            table = Table('dat_{0}'.format(t.dataset_name),
                          meta,
                          autoload=True,
                          autoload_with=app_engine,
                          keep_existing=True)
            try:
                date_col = getattr(table.c, slugify(t.observed_date))
                if type(date_col.type) == DATE:
                    e = PlenarioETL(t.as_dict())
                    e._get_or_create_data_table()
                    e._add_weather_info()
                    print 'added weather for {0}'.format(t.dataset_name)
            except AttributeError, e:
                raise e
                print 'no col {0}'.format(t.observed_date)
                pass
        except NoSuchTableError:
            print 'no table {0}'.format(t.dataset_name)
            pass