Code example #1
File: test_db.py Project: rizac/stream2segment
    def test_pd_to_sql(self):
        dc = models.DataCenter(station_query_url='awergedfbvdbfnhfsnsbstndggf ',
                               dataselect_query_url='edf')
        self.session.add(dc)
        self.session.commit()
        
        id = 'abcdefghilmnopq'
        e = models.Station(id="a.b", network='a', datacenter_id=dc.id, station='b', latitude=56, longitude=78)
        self.session.add(e)
        self.session.commit()

        stacolnames = list(colnames(models.Station))
        df = pd.DataFrame(columns=stacolnames, data=[[None for _ in stacolnames]])
        df.loc[0, 'id'] = id + '.j'
        df.loc[0, 'network'] = id
        df.loc[0, 'datacenter_id'] = dc.id
        df.loc[0, 'station'] = 'j'
        df.loc[0, 'latitude'] = 43
        df.loc[0, 'longitude'] = 56.7
        
        df.to_sql(e.__table__.name, self.engine, if_exists='append', index=False)
         
        # the same data already exists in the table, so appending must raise:
        df.loc[0, 'id'] = id
        with pytest.raises(IntegrityError):
            df.to_sql(e.__table__.name, self.engine, if_exists='append', index=False)
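The expected IntegrityError above also shows how to guard bulk writes against rows that already exist. A minimal sketch, assuming a SQLAlchemy engine as in the test (the helper name is ours, not part of stream2segment):

from sqlalchemy.exc import IntegrityError

def append_skipping_duplicates(df, table_name, engine):
    """Try a bulk append; if any row violates a constraint, fall back to
    row-by-row inserts, skipping only the offending rows."""
    try:
        df.to_sql(table_name, engine, if_exists='append', index=False)
    except IntegrityError:
        for _, row in df.iterrows():
            try:
                # one-row DataFrame (note: dtypes may degrade to object)
                row.to_frame().T.to_sql(table_name, engine,
                                        if_exists='append', index=False)
            except IntegrityError:
                pass  # constraint violation: skip this row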
Code example #2
File: test_db.py Project: rizac/stream2segment
    def tst_get_cols(self, seg):
        clen = len(seg.__class__.__table__.columns)

        c = list(colnames(seg.__class__))  # or models.Segment
        assert len(c) == clen

        c = list(colnames(seg.__class__, pkey=False))
        assert len(c) == clen - 1

        c = list(colnames(seg.__class__, pkey=True))
        assert len(c) == 1

        c = list(colnames(seg.__class__, fkey=False))
        assert len(c) == clen - 4

        c = list(colnames(seg.__class__, fkey=True))
        assert len(c) == 4

        c = list(colnames(seg.__class__, nullable=True))
        assert len(c) == 0

        c = list(colnames(seg.__class__, nullable=False))
        assert len(c) == clen
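The assertions above pin down the filtering contract of `colnames`. A plausible reconstruction over SQLAlchemy's column metadata (a sketch inferred from this test, not the project's actual implementation):

def colnames(model, pkey=None, fkey=None, nullable=None):
    """Yield the model's column names; pkey/fkey/nullable, when given,
    keep only columns that are (True) or are not (False) primary keys /
    foreign keys / nullable, respectively."""
    for col in model.__table__.columns:
        if pkey is not None and bool(col.primary_key) != pkey:
            continue
        if fkey is not None and bool(col.foreign_keys) != fkey:
            continue
        if nullable is not None and bool(col.nullable) != nullable:
            continue
        yield col.key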
Code example #3
File: utils.py Project: rizac/stream2segment
def rename_columns(query_df, query_type):
    """Renames the columns of `query_df` according to the "standard" column names implied
    by `query_type`, so that IO operations with the database do not suffer from naming
    mismatches (e.g., mismatching case). If the number of columns of `query_df` does not
    match the number of expected columns, a ValueError is raised. The assumption is that
    any datacenter returns the *same* columns in the *same* positions, as guessing columns
    by name would be tricky: besides case sensitivity, there are spelling variants such as
    "#Network" vs "network" (reference needed).
    :param query_df: the DataFrame resulting from an fdsn query: either an event query, a
    station query (level=station) or a station query (level=channel)
    :param query_type: a string denoting the query type whereby `query_df` has been
    generated, which determines the expected column names `query_df`'s columns will be
    renamed to. Possible values are "event", "station" (for a station query with parameter
    level=station) or "channel" (for a station query with parameter level=channel)
    :return: a new DataFrame with columns correctly renamed
    """
    if empty(query_df):
        return query_df

    Event, Station, Channel = models.Event, models.Station, models.Channel
    if query_type.lower() in ("event", "events"):
        columns = list(colnames(Event))
    elif query_type.lower() in ("station", "stations"):
        # these are the query_df columns for a station (level=station) query:
        #  #Network|Station|Latitude|Longitude|Elevation|SiteName|StartTime|EndTime
        # set this table columns mapping (by name, so we can safely add any new column at any
        # index):
        columns = [Station.network.key, Station.station.key, Station.latitude.key,
                   Station.longitude.key, Station.elevation.key, Station.site_name.key,
                   Station.start_time.key, Station.end_time.key]
    elif query_type.lower() in ("channel", "channels"):
        # these are the query_df expected columns for a station (level=channel) query:
        #  #Network|Station|Location|Channel|Latitude|Longitude|Elevation|Depth|Azimuth|Dip|
        #  SensorDescription|Scale|ScaleFreq|ScaleUnits|SampleRate|StartTime|EndTime
        # Some of them are for the Channel table, so select them:
        columns = [Station.network.key, Station.station.key, Channel.location.key,
                   Channel.channel.key, Station.latitude.key, Station.longitude.key,
                   Station.elevation.key, Channel.depth.key,
                   Channel.azimuth.key, Channel.dip.key, Channel.sensor_description.key,
                   Channel.scale.key, Channel.scale_freq.key, Channel.scale_units.key,
                   Channel.sample_rate.key, Station.start_time.key, Station.end_time.key]
    else:
        raise ValueError("Invalid query_type '%s': expected 'event', 'station' "
                         "or 'channel'" % query_type)

    oldcolumns = query_df.columns.tolist()
    if len(oldcolumns) != len(columns):
        raise ValueError("Mismatching number of columns in '%s' query.\nExpected:\n%s\nFound:\n%s" %
                         (query_type.lower(), str(columns), str(oldcolumns)))

    return query_df.rename(columns=dict(zip(oldcolumns, columns)))
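For instance, a raw station (level=station) response parsed into a DataFrame could be normalized as follows (the station values are made up for illustration, and the resulting names assume the model attribute keys shown above):

import pandas as pd

raw = pd.DataFrame([['GE', 'APE', 37.07, 25.53, 620.0, 'Apeiranthos',
                     '2000-01-01', '']],
                   columns=['#Network', 'Station', 'Latitude', 'Longitude',
                            'Elevation', 'SiteName', 'StartTime', 'EndTime'])
df = rename_columns(raw, 'station')
# df.columns is now, e.g.:
# ['network', 'station', 'latitude', 'longitude', 'elevation',
#  'site_name', 'start_time', 'end_time']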
Code example #4
File: test_db.py Project: rizac/stream2segment
    def test_harmonize_columns(self):

        utcnow = datetime.datetime.utcnow()

        eventcolnames = list(colnames(models.Event))
        df = pd.DataFrame(columns=eventcolnames,
                          data=[[None for _ in eventcolnames]])

        # add a column which is NOT in the table:
        colx = 'iassdvgdhrnjynhnt_________'
        df.insert(0, colx, 1)

        cnames, df2 = _harmonize_columns(models.Event, df)

        # colx is not part of the Event model:
        assert colx not in cnames

        # df was actually modified in place (df2 shares its dtypes):
        assert (df.dtypes == df2.dtypes).all()

        df2types = df2.dtypes
        # checking whether a dtype is datetime-like is cumbersome in numpy. See:
        # http://stackoverflow.com/questions/23063362/consistent-way-to-check-if-an-np-array-is-datetime-like
        # so we do:
        assert 'datetime64' in str(df2types[models.Event.time.key])
        # other stuff works fine with normal check:
        assert df2types[models.Event.latitude.key] == np.float64
        assert df2types[models.Event.longitude.key] == np.float64
        assert df2types[models.Event.depth_km.key] == np.float64
        assert df2types[models.Event.magnitude.key] == np.float64
        # assert also other fields are objects (not all of them, just two):
        assert df2types[models.Event.event_location_name.key] == object
        assert df2types[models.Event.author.key] == object

        df3 = harmonize_columns(models.Event, df2)[cnames]  # calls _harmonize_columns (tested above)

        assert colx not in df3.columns

        # now check invalid values for float columns:
        evcolnames = list(colnames(models.Event))
        dfx = pd.DataFrame(columns=evcolnames,
                           data=[["a" for _ in evcolnames]])
        
        _harmonize_columns(models.Event, dfx)
        
        # df2 and dfx should have the same dtypes:
        assert (dfx.dtypes == df2[cnames].dtypes).all()
        
        # fast check: datetimes and a float field
        assert pd.isnull(dfx.loc[0, models.Event.time.key])
        assert pd.isnull(dfx.loc[0, models.Event.longitude.key])
        
        # check harmonize_rows: invalid rows should be removed (we have 1 invalid row)
        oldlen = len(dfx)
        dfrows = harmonize_rows(models.Event, dfx, inplace=False)
        assert len(dfrows) == 0 and len(dfx) == oldlen
        # check inplace=True
        dfrows = harmonize_rows(models.Event, dfx, inplace=True)
        assert len(dfrows) == len(dfx) == 0
        
        # go on checking _harmonize_columns: valid values must survive harmonization,
        # invalid ones must be coerced to NaN/NaT:
        dfx = pd.DataFrame(columns=evcolnames,
                           data=[["a" for _ in evcolnames]])

        dfx.loc[0, models.Event.time.key] = utcnow
        dfx.loc[0, models.Event.latitude.key] = 6.5

        _harmonize_columns(models.Event, dfx)
        # fast check: a datetime and two float fields
        assert pd.notnull(dfx.loc[0, models.Event.time.key])
        assert pd.isnull(dfx.loc[0, models.Event.longitude.key])
        assert pd.notnull(dfx.loc[0, models.Event.latitude.key])
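Taken together, the assertions above constrain `harmonize_rows` to dropping rows that hold NA in any non-nullable column. A compact sketch of that contract (inferred from the test, not the project's actual code):

import pandas as pd

def harmonize_rows(model, df, inplace=True):
    """Drop rows with NA in any column the model declares NOT NULL."""
    non_nullable = [c.key for c in model.__table__.columns
                    if not c.nullable and c.key in df.columns]
    if inplace:
        df.dropna(subset=non_nullable, inplace=True)
        return df
    return df.dropna(subset=non_nullable)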
Code example #5
File: test_db.py Project: rizac/stream2segment
    def testSqlAlchemy(self):
        # create a Run without an explicit id and check the id is auto-set on commit:
        run_row = models.Run()
        assert run_row.id is None

        run_row = models.Run(id=None)
        assert run_row.id is None

        # test that methods of the base class work:
        cnames = list(colnames(run_row.__class__))
        assert len(cnames) > 0

        # test id is auto added:
        self.session.add_all([run_row])
        # self.session.flush()
        self.session.commit()
        assert run_row.id is not None

        # now pass an explicit datetime and check that the value is kept:
        utcnow = datetime.datetime.utcnow()
        run_row = models.Run(run_time=utcnow)
        assert run_row.run_time == utcnow
        self.session.add_all([run_row])
        # self.session.flush()
        self.session.commit()
        assert run_row.run_time == utcnow


        # test types: are string ints coerced automatically? YES
        val = '6'
        e = models.Class(id=val)
        assert e.id != int(val)
        self.session.add(e)
        self.session.commit()
        assert e.id == int(val)

        # test types: are string floats coerced to int automatically? Only when they
        # hold an integral value, so '5.2' must fail:
        val = '5.2'
        e = models.Class(id=val)
        assert e.id != float(val)
        self.session.add(e)

        with pytest.raises(IntegrityError):
            self.session.commit()
        # a rollback is required after a failed commit before reusing the session:
        self.session.rollback()
            
        # whereas an integral-valued string like '5.0' succeeds:
        val = '5.0'
        e = models.Class(id=val)
        assert e.id != int(float(val))
        self.session.add(e)
        self.session.commit()
        assert e.id == int(float(val))


        # test types: are string floats coerced to float automatically? YES
        val = '6.7'
        e = models.Event(id='abc', time=datetime.datetime.utcnow(),
                         latitude=val, longitude=78, magnitude=56, depth_km=45)
        assert e.latitude != float(val)
        self.session.add(e)
        self.session.commit()
        assert e.latitude == float(val)

        # create a datacenter WITHOUT the required dataselect_query_url field:
        dc = models.DataCenter(station_query_url='abc')
        with pytest.raises(IntegrityError):
            self.session.add(dc)
            self.session.flush()
            
        self.session.rollback()
        
        # now add it properly:
        dc = models.DataCenter(station_query_url='abc', dataselect_query_url='edf')
        self.session.add(dc)
        self.session.flush()

        # test the station auto-id (network.station concatenation):
        # first, omit the non-nullable datacenter_id field (commit should raise an IntegrityError)
        e = models.Station(network='abc', station='f')
        assert e.id == "abc.f"
        self.session.add(e)
        # not all non-nullable fields are specified, so commit must fail:
        with pytest.raises(IntegrityError):
            self.session.commit()
        self.session.rollback()

        # now test auto id
        e = models.Station(network='abc', datacenter_id=dc.id, station='f', latitude='89.5', longitude='56')
        assert e.id == "abc.f"
        self.session.add(e)
        # all non-nullable fields are now specified, so commit succeeds:
        self.session.commit()
        assert e.id == "abc.f"

        # test unique constraints by changing only network
        sta = models.Station(network='a', datacenter_id=dc.id, station='f', latitude='89.5', longitude='56')
        self.session.add(sta)
        # all non-nullable fields are specified, so commit succeeds:
        self.session.commit()
        assert sta.id == "a.f"

        # now re-add the same network/station: the unique constraint must fail
        sta = models.Station(network='a', datacenter_id=dc.id, station='f', latitude='189.5', longitude='156')
        self.session.add(sta)
        with pytest.raises(IntegrityError):
            self.session.commit()
        self.session.rollback()

        # test stations channels relationship:
        sta = models.Station(network='ax', datacenter_id=dc.id, station='f', latitude='89.5', longitude='56')
        # write channels WITHOUT foreign key
        cha1 = models.Channel(location='l', channel='HHZ', sample_rate=56)
        cha2 = models.Channel(location='l', channel='HHN', sample_rate=12)
        # add channels to the stations.channels relationships
        sta.channels.append(cha1)
        sta.channels.append(cha2)
        # when adding and committing, the channels' foreign keys should be updated
        # according to sta's id:
        self.session.add(sta)
        self.session.add(cha1)
        self.session.add(cha2)
        self.session.commit()
        # foreign keys are auto-updated: verify it:
        assert cha1.station_id == sta.id
        assert cha2.station_id == sta.id

        # now test the same with a station read from the database. We don't actually
        # need a commit: a flush is sufficient
        sta = self.session.query(models.Station).filter(models.Station.id == 'a.f').first()
        cha1 = models.Channel(location='l2', channel='HHZ', sample_rate=56)
        cha2 = models.Channel(location='l', channel='HHW', sample_rate=56)
        sta.channels.append(cha1)
        sta.channels.append(cha2)
        assert cha1.station_id != sta.id
        assert cha2.station_id != sta.id
        self.session.flush()
        # foreign keys are auto-updated: verify it:
        assert cha1.station_id == sta.id
        assert cha2.station_id == sta.id
        
        k = self.session.query(models.Event).all()
        assert len(k) == 1
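The `assert e.id == "abc.f"` checks work because the Station model evidently derives its primary key from network and station at construction time. One minimal way to obtain that behavior in a declarative model (a sketch under that assumption, not stream2segment's actual implementation):

from sqlalchemy import Column, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class StationSketch(Base):
    __tablename__ = 'stations_sketch'
    id = Column(String, primary_key=True)
    network = Column(String, nullable=False)
    station = Column(String, nullable=False)

    def __init__(self, **kwargs):
        # derive the "<network>.<station>" primary key when not given explicitly:
        if 'id' not in kwargs and 'network' in kwargs and 'station' in kwargs:
            kwargs['id'] = '%s.%s' % (kwargs['network'], kwargs['station'])
        super(StationSketch, self).__init__(**kwargs)

# e.g. StationSketch(network='abc', station='f').id == 'abc.f'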