def process_weather(station, field_to_process):
    """Compute a centered 3-row rolling mean of one weather metric for a
    station and write it back into the ``weather.average`` column.

    :param station: station_id whose rows are read and updated
    :param field_to_process: name of the numeric column to average.
        This is a SQL *identifier* and cannot be bound as a parameter,
        so it must come from trusted code, never from user input.
    """
    # NOTE(review): the original passed conn_name_attr='sqlite_default'.
    # That kwarg overrides the *name* of the attribute the hook reads the
    # connection id from -- it does not select a connection. The intended
    # kwarg is sqlite_conn_id; confirm against the Airflow version in use.
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')

    # Bind `station` as a query parameter instead of interpolating it into
    # the SQL text (the original f-string was injectable).
    weather_select = (f'select record_date, {field_to_process} '
                      'from weather where station_id=? '
                      'order by record_date;')
    data = db_hook.get_pandas_df(weather_select, parameters=(station,))

    # Centered window of 3 rows; the first and last rows get NaN averages.
    average = (data.rolling(3, center=True)
                   .mean()
                   .rename(columns={field_to_process: 'average'}))
    data = data.merge(average, left_index=True, right_index=True)

    weather_update = """ update weather set average = ? where station_id=? and record_date=?; """

    # itertuples() is the cheapest per-row walk pandas offers; the real
    # bottleneck here is db_hook.run() opening a connection per statement,
    # not the iteration strategy (the original comment claimed otherwise).
    for row in data.itertuples():
        db_hook.run(weather_update,
                    parameters=(row.average, station, row.record_date))
def getdf():
    """Pull the customer/order/catalog/product join from SQLite, echo it,
    and dump it as CSV to ``SourceData.txt`` under ``BASE_DIR``."""
    import os.path

    hook = SqliteHook(conn_id)
    frame = hook.get_pandas_df(
        "SELECT A.DocumentNo, A.FullName, A.Device, A.Country, B.OrderId, B.DocumentNo, B.OrderDate, B.CatalogId,C.CatalogId, C.ProductId, C.CUID, D.ProductId, D.ProductName, D.CUID FROM CUSTOMERS as A inner join ORDERS as B on A.DocumentNo = B.DocumentNo inner join CATALOG as C on C.CatalogId = B.CatalogId inner join PRODUCTS AS D ON C.CUID = D.CUID"
    )
    print(frame)

    target = os.path.join(BASE_DIR, "SourceData.txt")
    frame.to_csv(target, index=False)
def predict(classifier, **context):
    """Predict a Conference label for every row of ``mct_talks`` using the
    model previously saved under the classifier's name, then overwrite the
    ``mct_talks`` table with the augmented frame.
    """
    # Restore the persisted model keyed by the classifier's class name.
    model = _load_model(classifier.__name__)

    hook = SqliteHook()
    talks = hook.get_pandas_df('select * from mct_talks')

    # Predict from the raw title strings and attach the result column.
    titles = talks['Title'].tolist()
    talks['Conference'] = model.predict(titles)

    # Replace the table wholesale with the predicted frame.
    with hook.get_conn() as connection:
        talks.to_sql('mct_talks', con=connection, if_exists='replace')
def split_data(**context):
    """Split ``research_papers`` into train/test partitions, stratified on
    the Conference column, and persist them as the ``training_data`` and
    ``test_data`` tables in the SQLite DB.
    """
    hook = SqliteHook()
    papers = hook.get_pandas_df('select * from research_papers')

    # Deterministic 67/33 stratified split over the row index.
    train_idx, _ = train_test_split(
        papers.index,
        test_size=0.33,
        stratify=papers['Conference'],
        random_state=42,
    )

    # Training rows are selected by position; the test set is simply
    # everything that is not in the training index.
    with hook.get_conn() as connection:
        papers.iloc[train_idx].to_sql(
            'training_data', con=connection, if_exists='replace')
        papers.drop(train_idx).to_sql(
            'test_data', con=connection, if_exists='replace')