def environment(selected_dataset):
    """Render the exploration environment for one dataset.

    Resolution order for the dataset: explicit argument, then the
    ``SELECTED_DATASET`` session key; with neither, redirect to upload.
    """
    if selected_dataset is None:
        selected_dataset = session.get('SELECTED_DATASET')
    if selected_dataset is None:
        return redirect(url_for('geoguide.upload'))
    dataset = Dataset.query.filter_by(filename=selected_dataset).first_or_404()
    df = pd.read_csv(datasets.path(dataset.filename))
    # Candidate headers: numeric, not a lat/lng or id-like column, fully
    # populated, and with enough distinct values to be worth charting.
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    headers = [
        column for column in numeric_columns
        if 'latitude' not in column
        and 'longitude' not in column
        and 'id' not in column
        and not df[column].isnull().any()
        and df[column].unique().shape[0] > 3
    ]
    dataset_json = json.dumps({
        'filename': dataset.filename,
        'latitude_attr': dataset.latitude_attr,
        'longitude_attr': dataset.longitude_attr,
        'indexed': (dataset.indexed_at is not None),
        'attributes': [
            dict(description=attr.description,
                 visible=attr.visible,
                 type=dict(value=attr.type.value,
                           description=attr.type.name))
            for attr in dataset.attributes
        ],
        'headers': headers,
    })
    vm = {
        'dataset_headers': headers,
        'dataset_json': dataset_json,
        'dataset_url': datasets.url(dataset.filename),
    }
    return render_template('geoguide/environment.html', **vm)
def save_as_sql(dataset, visible_attributes):
    """Normalize the dataset CSV and load it into the SQL store.

    Reads the uploaded CSV in chunks, slugifies column names, drops rows
    with a zero latitude/longitude, writes a normalized CSV (which then
    replaces the original file), inserts each chunk into a PostGIS table
    with a POINT geometry, and finally kicks off background indexing.

    Fixes vs. the previous version:
    - the chunk loop appends to the normalized CSV (``mode='a'``), so a
      leftover file from a crashed earlier run would be merged in; it is
      now removed up front;
    - the indexing thread passes the id via ``args=`` rather than closing
      over it in a lambda.

    :param dataset: Dataset model row (mutated: slugified attr names).
    :param visible_attributes: raw attribute names chosen by the user.
    """
    engine = create_engine(SQLALCHEMY_DATABASE_URI)
    dataset_id = dataset.id
    datetime_columns = [
        attr.description for attr in dataset.attributes
        if attr.type == AttributeType.datetime
    ]
    original_csv_path = datasets.path(dataset.filename)
    csv_path = '{}.normalized.csv'.format(original_csv_path.rsplit('.', 1)[0])
    table_name = 'datasets.' + dataset.filename.rsplit('.', 1)[0]
    # The loop below appends; drop any stale output from a failed run.
    if os.path.isfile(csv_path):
        os.remove(csv_path)
    is_first = True
    visible_attributes = [
        slugify(attr, separator='_') for attr in visible_attributes
    ]
    dataset.latitude_attr = slugify(dataset.latitude_attr, separator='_')
    dataset.longitude_attr = slugify(dataset.longitude_attr, separator='_')
    db.session.add(dataset)
    db.session.commit()
    for df in pd.read_csv(original_csv_path, parse_dates=datetime_columns,
                          infer_datetime_format=True, chunksize=CHUNKSIZE):
        df.rename(columns=lambda c: slugify(c, separator='_'), inplace=True)
        # Zero coordinates are treated as "no location" and excluded.
        df = df[(df[dataset.latitude_attr] != 0) &
                (df[dataset.longitude_attr] != 0)]
        df.to_csv(csv_path, index_label='geoguide_id',
                  header=is_first, mode='a')
        df['geom'] = df.apply(lambda r: WKTElement('POINT({} {})'.format(
            r[dataset.longitude_attr], r[dataset.latitude_attr])), axis=1)
        df.to_sql(table_name, engine, if_exists='append',
                  index_label='geoguide_id', chunksize=CHUNKSIZE,
                  dtype={'geom': Geometry('POINT')})
        is_first = False
    os.remove(original_csv_path)
    shutil.move(csv_path, original_csv_path)
    guess_attributes_types(dataset, visible_attributes)
    Thread(target=index_dataset_from_sql, args=(dataset_id,)).start()
def upload():
    """Handle dataset upload (POST) and render the upload page (GET).

    On POST: saves the file under a UUID name, creates the Dataset row and
    its datetime Attribute rows, remembers the selection in the session,
    then normalizes/stores the data via SQL or HDF depending on USE_SQL.
    On GET: lists the current user's datasets, pruning rows whose backing
    file has disappeared from disk.

    Fixes vs. the previous version: optional form fields are read with
    ``request.form.get`` — indexing a missing key would raise a 400
    (BadRequestKeyError) that the ``UploadNotAllowed`` handler does not
    catch; and the storage-backend dispatch is a plain ``if`` instead of a
    conditional expression evaluated only for its side effects.
    """
    if request.method == 'POST' and 'datasetInputFile' in request.files:
        try:
            uploaded = request.files['datasetInputFile']
            filename = datasets.save(uploaded, name='{}.'.format(uuid4()))
            title = request.form['titleInputText']
            number_of_rows = None
            if request.form.get('numberRowsInputNumber'):
                number_of_rows = int(request.form['numberRowsInputNumber'])
            # Empty strings collapse to None, matching the old behavior.
            latitude_attr = request.form.get('latitudeAttrSelect') or None
            longitude_attr = request.form.get('longitudeAttrSelect') or None
            datetime_attr = []
            if request.form.get('datetimeAttrInputText'):
                datetime_attr = [
                    attr.strip() for attr in
                    request.form['datetimeAttrInputText'].split(',')
                ]
            dataset = Dataset(title, filename, number_of_rows,
                              latitude_attr, longitude_attr)
            db.session.add(dataset)
            db.session.commit()
            for attr in datetime_attr:
                attribute = Attribute(attr, AttributeType.datetime,
                                      dataset.id)
                db.session.add(attribute)
            db.session.commit()
            session['SELECTED_DATASET'] = filename
            if USE_SQL:
                save_as_sql(
                    dataset,
                    request.form.getlist('selectionAttrInputCheckbox'))
            else:
                save_as_hdf(dataset)
            return redirect(url_for('geoguide.environment'))
        except UploadNotAllowed:
            flash('This file is not allowed.', 'error')
            return redirect(url_for('geoguide.upload'))
    vm = {}
    vm['datasets'] = current_user.datasets
    # Prune Dataset rows whose uploaded file no longer exists on disk.
    needs_reload = False
    for dataset in vm['datasets']:
        if not os.path.isfile(datasets.path(dataset.filename)):
            db.session.delete(dataset)
            needs_reload = True
    if needs_reload:
        db.session.commit()
        vm['datasets'] = current_user.datasets
    return render_template('geoguide/upload.html', **vm)
def guess_attributes_types(dataset, visible_attributes=()):
    """Infer an AttributeType for each dataset column and persist it.

    Heuristics:
    - numeric, non-lat/lng, non-id columns with > 3 distinct values
      -> ``number``; with <= 3 distinct values -> ``categorical_number``;
    - string columns with <= 10 distinct values -> ``categorical_text``,
      otherwise -> ``text``.

    Fix vs. the previous version: the default for ``visible_attributes``
    was a mutable ``[]`` (shared across calls); it is now an immutable
    empty tuple — membership tests behave identically.

    :param dataset: Dataset row whose CSV is inspected.
    :param visible_attributes: attribute names to flag as visible.
    """
    df = pd.read_csv(datasets.path(dataset.filename))
    numeric_attributes = list(df.select_dtypes(include=[np.number]).columns)
    string_attributes = list(df.select_dtypes(include=[object]).columns)
    number_attributes = [
        c for c in numeric_attributes
        if not is_latlng_attribute(c) and 'id' not in c and
        df[c].unique().shape[0] > 3
    ]
    categorical_number_attributes = [
        c for c in numeric_attributes
        if not is_latlng_attribute(c) and 'id' not in c and
        df[c].unique().shape[0] <= 3
    ]
    categorical_text_attributes = [
        c for c in string_attributes if df[c].unique().shape[0] <= 10
    ]
    text_attributes = [
        c for c in string_attributes if c not in categorical_text_attributes
    ]
    # One pass per inferred type; all rows committed together at the end.
    typed_groups = [
        (number_attributes, AttributeType.number),
        (categorical_number_attributes, AttributeType.categorical_number),
        (categorical_text_attributes, AttributeType.categorical_text),
        (text_attributes, AttributeType.text),
    ]
    for attrs, attr_type in typed_groups:
        for attr in attrs:
            attribute = Attribute(attr, attr_type, dataset.id,
                                  attr in visible_attributes)
            db.session.add(attribute)
    db.session.commit()
def save_as_hdf(dataset):
    """Normalize the dataset CSV and store it in an HDF5 file.

    Reads the uploaded CSV in chunks, slugifies column names, drops rows
    with a zero latitude/longitude, writes a normalized CSV (which then
    replaces the original file) and appends each chunk to an ``.h5`` store,
    then kicks off background indexing.

    Fixes vs. the previous version:
    - the HDFStore is managed with ``with`` so it is closed even if a
      chunk raises mid-loop (previously it leaked on error);
    - a leftover ``.normalized.csv`` from a crashed earlier run is removed
      before the append loop starts.
    """
    dataset_id = dataset.id
    datetime_columns = [
        attr.description for attr in dataset.attributes
        if attr.type == AttributeType.datetime
    ]
    original_csv_path = datasets.path(dataset.filename)
    csv_path = '{}.normalized.csv'.format(original_csv_path.rsplit('.', 1)[0])
    hdf_path = '{}.h5'.format(original_csv_path.rsplit('.', 1)[0])
    # The loop below appends; drop any stale output from a failed run.
    if os.path.isfile(csv_path):
        os.remove(csv_path)
    is_first = True
    dataset.latitude_attr = slugify(dataset.latitude_attr, separator='_')
    dataset.longitude_attr = slugify(dataset.longitude_attr, separator='_')
    db.session.add(dataset)
    db.session.commit()
    with pd.HDFStore(hdf_path) as store:
        for df in pd.read_csv(original_csv_path,
                              parse_dates=datetime_columns,
                              infer_datetime_format=True,
                              chunksize=CHUNKSIZE):
            df.rename(columns=lambda c: slugify(c, separator='_'),
                      inplace=True)
            # Zero coordinates are treated as "no location" and excluded.
            df = df[(df[dataset.latitude_attr] != 0) &
                    (df[dataset.longitude_attr] != 0)]
            df.to_csv(csv_path, index_label='geoguide_id',
                      header=is_first, mode='a')
            if is_first:
                # 'table' format so later chunks can be appended.
                store.put('data', df, format='table')
            else:
                store.append('data', df)
            is_first = False
    os.remove(original_csv_path)
    shutil.move(csv_path, original_csv_path)
    guess_attributes_types(dataset)
    Thread(target=index_dataset_from_hdf, args=(dataset_id,)).start()
def point_details(selected_dataset, index):
    """Return a single dataset row, by positional label, as JSON."""
    dataset = Dataset.query.filter_by(
        filename=selected_dataset).first_or_404()
    df = pd.read_csv(datasets.path(dataset.filename))
    body = df.loc[index].to_json()
    headers = {'Content-Type': 'application/json'}
    return body, 200, headers
def path_to_hdf(dataset):
    """Return the ``.h5`` companion path for the dataset's CSV file."""
    base = datasets.path(dataset.filename).rsplit('.', 1)[0]
    return base + '.h5'