Example 1
def environment(selected_dataset):
    """Render the exploration environment for the chosen dataset."""
    if selected_dataset is None:
        if 'SELECTED_DATASET' in session:
            selected_dataset = session['SELECTED_DATASET']
    if selected_dataset is None:
        return redirect(url_for('geoguide.upload'))
    dataset = Dataset.query.filter_by(filename=selected_dataset).first_or_404()
    df = pd.read_csv(datasets.path(dataset.filename))
    vm = {}
    # Offer only numeric columns that are not ids or coordinates, have no
    # missing values, and take more than 3 distinct values.
    vm['dataset_headers'] = list(df.select_dtypes(include=[np.number]).columns)
    vm['dataset_headers'] = [
        c for c in vm['dataset_headers']
        if 'latitude' not in c and 'longitude' not in c and 'id' not in c
        and not df[c].isnull().any() and df[c].unique().shape[0] > 3
    ]
    vm['dataset_json'] = json.dumps({
        'filename': dataset.filename,
        'latitude_attr': dataset.latitude_attr,
        'longitude_attr': dataset.longitude_attr,
        'indexed': dataset.indexed_at is not None,
        'attributes': [
            dict(description=attr.description,
                 visible=attr.visible,
                 type=dict(value=attr.type.value, description=attr.type.name))
            for attr in dataset.attributes
        ],
        'headers': vm['dataset_headers'],
    })
    vm['dataset_url'] = datasets.url(dataset.filename)
    return render_template('geoguide/environment.html', **vm)
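These snippets omit their route decorators. A minimal sketch of how environment could be wired up, assuming a Flask blueprint named geoguide (which the url_for('geoguide.…') calls imply); the URL rules below are assumptions, not from the source:

from flask import Blueprint

geoguide = Blueprint('geoguide', __name__)  # name inferred from the url_for calls

# Hypothetical URL rules; the real routes are not shown in these examples.
geoguide.add_url_rule('/environment/', 'environment', environment,
                      defaults={'selected_dataset': None})
geoguide.add_url_rule('/environment/<selected_dataset>', 'environment',
                      environment)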
Example 2
def save_as_sql(dataset, visible_attributes):
    """Normalize the uploaded CSV and load it into the SQL database in chunks."""
    engine = create_engine(SQLALCHEMY_DATABASE_URI)

    dataset_id = dataset.id
    datetime_columns = [
        attr.description for attr in dataset.attributes
        if attr.type == AttributeType.datetime
    ]

    original_csv_path = datasets.path(dataset.filename)
    csv_path = '{}.normalized.csv'.format(original_csv_path.rsplit('.', 1)[0])
    table_name = 'datasets.' + dataset.filename.rsplit('.', 1)[0]
    is_first = True

    # Slugify attribute and coordinate column names so they match the
    # renamed DataFrame columns below.
    visible_attributes = [
        slugify(attr, separator='_') for attr in visible_attributes
    ]
    dataset.latitude_attr = slugify(dataset.latitude_attr, separator='_')
    dataset.longitude_attr = slugify(dataset.longitude_attr, separator='_')
    db.session.add(dataset)
    db.session.commit()

    # Stream the CSV in chunks so large files never sit fully in memory.
    for df in pd.read_csv(original_csv_path,
                          parse_dates=datetime_columns,
                          infer_datetime_format=True,
                          chunksize=CHUNKSIZE):
        df.rename(columns=lambda c: slugify(c, separator='_'), inplace=True)
        # Drop rows whose coordinates are zero (missing locations).
        df = df[(df[dataset.latitude_attr] != 0)
                & (df[dataset.longitude_attr] != 0)]
        df.to_csv(csv_path,
                  index_label='geoguide_id',
                  header=is_first,
                  mode='a')
        # Build a WKT point per row for the PostGIS geometry column.
        df['geom'] = df.apply(lambda r: WKTElement('POINT({} {})'.format(
            r[dataset.longitude_attr], r[dataset.latitude_attr])),
                              axis=1)
        df.to_sql(table_name,
                  engine,
                  if_exists='append',
                  index_label='geoguide_id',
                  chunksize=CHUNKSIZE,
                  dtype={'geom': Geometry('POINT')})
        is_first = False

    # Replace the original CSV with its normalized version.
    os.remove(original_csv_path)
    shutil.move(csv_path, original_csv_path)

    guess_attributes_types(dataset, visible_attributes)

    # Index the dataset in the background so the request can return promptly.
    Thread(target=index_dataset_from_sql, args=(dataset_id,)).start()
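A minimal read-back sketch, not part of the source: it assumes the PostGIS extension is enabled and queries the table save_as_sql just wrote. Note that pandas' to_sql, called without a schema argument, creates a table literally named 'datasets.<name>', dot included, so the whole name is quoted as a single identifier here.

from sqlalchemy import create_engine, text

def read_points(table_name, limit=10):
    """Hypothetical helper: fetch a few rows with their geometry as WKT."""
    engine = create_engine(SQLALCHEMY_DATABASE_URI)
    # table_name is interpolated, not bound; fine for a sketch with a
    # server-generated name, never for user input.
    query = text('SELECT geoguide_id, ST_AsText(geom) AS point '
                 'FROM "{}" LIMIT :n'.format(table_name))
    with engine.connect() as conn:
        return conn.execute(query, {'n': limit}).fetchall()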
Example 3
def upload():
    """Handle dataset uploads; on GET, list the current user's datasets."""
    if request.method == 'POST' and 'datasetInputFile' in request.files:
        try:
            uploaded = request.files['datasetInputFile']
            filename = datasets.save(uploaded, name='{}.'.format(uuid4()))
            title = request.form['titleInputText']
            number_of_rows = None
            if request.form['numberRowsInputNumber']:
                number_of_rows = int(request.form['numberRowsInputNumber'])
            latitude_attr = None
            if request.form['latitudeAttrSelect']:
                latitude_attr = request.form['latitudeAttrSelect']
            longitude_attr = None
            if request.form['longitudeAttrSelect']:
                longitude_attr = request.form['longitudeAttrSelect']
            datetime_attr = []
            if request.form['datetimeAttrInputText']:
                datetime_attr = [
                    attr.strip() for attr in
                    request.form['datetimeAttrInputText'].split(',')
                ]
            dataset = Dataset(title, filename, number_of_rows, latitude_attr,
                              longitude_attr)
            db.session.add(dataset)
            db.session.commit()
            for attr in datetime_attr:
                attribute = Attribute(attr, AttributeType.datetime, dataset.id)
                db.session.add(attribute)
                db.session.commit()
            session['SELECTED_DATASET'] = filename
            if USE_SQL:
                save_as_sql(dataset,
                            request.form.getlist('selectionAttrInputCheckbox'))
            else:
                save_as_hdf(dataset)
            return redirect(url_for('geoguide.environment'))
        except UploadNotAllowed:
            flash('This file is not allowed.', 'error')
        return redirect(url_for('geoguide.upload'))
    vm = {}
    vm['datasets'] = current_user.datasets
    needs_reload = False
    # Drop records whose files have disappeared from disk.
    for dataset in vm['datasets']:
        if not os.path.isfile(datasets.path(dataset.filename)):
            db.session.delete(dataset)
            needs_reload = True
    if needs_reload:
        db.session.commit()
        vm['datasets'] = current_user.datasets
    return render_template('geoguide/upload.html', **vm)
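For reference, a hypothetical client call exercising the form fields the view reads above; the URL, file name, and field values are placeholders, not from the source:

import requests

with open('points.csv', 'rb') as f:  # placeholder file
    requests.post('http://localhost:5000/upload',  # placeholder URL
                  files={'datasetInputFile': f},
                  data={'titleInputText': 'My points',
                        'numberRowsInputNumber': '',
                        'latitudeAttrSelect': 'latitude',
                        'longitudeAttrSelect': 'longitude',
                        'datetimeAttrInputText': 'created at',
                        # repeated field, read with request.form.getlist
                        'selectionAttrInputCheckbox': ['price']})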
Example 4
def guess_attributes_types(dataset, visible_attributes=None):
    """Classify dataset columns by type and persist them as Attribute rows."""
    if visible_attributes is None:  # avoid a shared mutable default argument
        visible_attributes = []
    df = pd.read_csv(datasets.path(dataset.filename))
    numeric_attributes = list(df.select_dtypes(include=[np.number]).columns)
    string_attributes = list(df.select_dtypes(include=[object]).columns)

    # Numeric columns that are not ids or coordinates and take more than
    # 3 distinct values are treated as real measures.
    number_attributes = [
        c for c in numeric_attributes if not is_latlng_attribute(c)
        and 'id' not in c and df[c].unique().shape[0] > 3
    ]

    # Numeric columns with at most 3 distinct values become categories.
    categorical_number_attributes = [
        c for c in numeric_attributes if not is_latlng_attribute(c)
        and 'id' not in c and df[c].unique().shape[0] <= 3
    ]

    # Text columns with at most 10 distinct values become categories;
    # the rest are treated as free text.
    categorical_text_attributes = [
        c for c in string_attributes if df[c].unique().shape[0] <= 10
    ]

    text_attributes = [
        c for c in string_attributes if c not in categorical_text_attributes
    ]

    for attr in number_attributes:
        attribute = Attribute(attr, AttributeType.number, dataset.id,
                              attr in visible_attributes)
        db.session.add(attribute)

    for attr in categorical_number_attributes:
        attribute = Attribute(attr, AttributeType.categorical_number,
                              dataset.id, attr in visible_attributes)
        db.session.add(attribute)

    for attr in categorical_text_attributes:
        attribute = Attribute(attr, AttributeType.categorical_text,
                              dataset.id, attr in visible_attributes)
        db.session.add(attribute)

    for attr in text_attributes:
        attribute = Attribute(attr, AttributeType.text, dataset.id,
                              attr in visible_attributes)
        db.session.add(attribute)

    db.session.commit()
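A toy illustration (not from the source) of the distinct-value cut-offs used above: numeric columns with at most 3 distinct values and text columns with at most 10 become categorical.

import pandas as pd

df = pd.DataFrame({'rating': [1, 2, 3, 1],
                   'price': [9.5, 12.0, 7.25, 15.0],
                   'city': ['NY', 'LA', 'NY', 'SF']})
print(df['rating'].unique().shape[0])  # 3 -> categorical_number
print(df['price'].unique().shape[0])   # 4 -> number
print(df['city'].unique().shape[0])    # 3 -> categorical_text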
Example 5
def save_as_hdf(dataset):
    """Normalize the uploaded CSV and mirror it into an HDF5 store in chunks."""
    dataset_id = dataset.id
    datetime_columns = [
        attr.description for attr in dataset.attributes
        if attr.type == AttributeType.datetime
    ]
    original_csv_path = datasets.path(dataset.filename)
    csv_path = '{}.normalized.csv'.format(original_csv_path.rsplit('.', 1)[0])
    hdf_path = '{}.h5'.format(original_csv_path.rsplit('.', 1)[0])
    is_first = True

    dataset.latitude_attr = slugify(dataset.latitude_attr, separator='_')
    dataset.longitude_attr = slugify(dataset.longitude_attr, separator='_')
    db.session.add(dataset)
    db.session.commit()

    store = pd.HDFStore(hdf_path)
    # Stream the CSV in chunks so large files never sit fully in memory.
    for df in pd.read_csv(original_csv_path,
                          parse_dates=datetime_columns,
                          infer_datetime_format=True,
                          chunksize=CHUNKSIZE):
        df.rename(columns=lambda c: slugify(c, separator='_'), inplace=True)
        # Drop rows whose coordinates are zero (missing locations).
        df = df[(df[dataset.latitude_attr] != 0)
                & (df[dataset.longitude_attr] != 0)]
        df.to_csv(csv_path,
                  index_label='geoguide_id',
                  header=is_first,
                  mode='a')
        if is_first:
            # The first chunk creates the table; later chunks are appended.
            store.put('data', df, format='table')
        else:
            store.append('data', df)
        is_first = False

    store.close()
    # Replace the original CSV with its normalized version.
    os.remove(original_csv_path)
    shutil.move(csv_path, original_csv_path)

    guess_attributes_types(dataset)

    # Index the dataset in the background so the request can return promptly.
    Thread(target=index_dataset_from_hdf, args=(dataset_id,)).start()
Example 6
def point_details(selected_dataset, index):
    """Return one dataset row, looked up by index, as a JSON response."""
    dataset = Dataset.query.filter_by(filename=selected_dataset).first_or_404()
    df = pd.read_csv(datasets.path(dataset.filename))
    return df.loc[index].to_json(), 200, {'Content-Type': 'application/json'}
Example 7
def path_to_hdf(dataset):
    """Map a dataset's CSV path to the companion HDF5 file path."""
    return '{}.h5'.format(datasets.path(dataset.filename).rsplit('.', 1)[0])
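A minimal sketch, not from the source, of reading back the store written by save_as_hdf using this helper; dataset stands in for a Dataset instance. Since the data was stored in 'table' format, partial reads are supported.

import pandas as pd

with pd.HDFStore(path_to_hdf(dataset), mode='r') as store:
    first_rows = store.select('data', start=0, stop=5)  # partial read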