Example #1
from airflow.providers.sqlite.hooks.sqlite import SqliteHook


def process_weather(station, field_to_process):
    # The hook's connection keyword is sqlite_conn_id
    db_hook = SqliteHook(sqlite_conn_id='sqlite_default')

    weather_select = (f'select record_date, {field_to_process} '
                      f'from weather where station_id={station} '
                      f'order by record_date;')

    data = db_hook.get_pandas_df(weather_select)
    # Centered 3-point rolling mean of the selected field only; rolling over
    # the whole frame would also hit the non-numeric record_date column.
    data['average'] = data[field_to_process].rolling(3, center=True).mean()

    weather_update = """
    update weather
    set average = ?
    where station_id=? and record_date=?;
    """

    # DataFrame.apply() is used here as a workaround for
    # DataFrame.itertuples(), iteritems() and iterrows(), which can become
    # a bottleneck once the number of rows grows past a few thousand.

    data.apply(lambda row: db_hook.run(weather_update,
                                       parameters=(row['average'], station,
                                                   row['record_date'])),
               axis=1)
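For context, a task function like this is usually wired into a DAG with a PythonOperator. A minimal sketch, assuming a hypothetical DAG name and placeholder argument values (station 42 and the temperature field are not from the source):

from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

with DAG('weather_dag', start_date=datetime(2021, 1, 1),
         schedule_interval='@daily', catchup=False) as dag:
    process_temperature = PythonOperator(
        task_id='process_temperature',
        python_callable=process_weather,
        op_kwargs={'station': 42, 'field_to_process': 'temperature'},
    )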
Example #2
from airflow.providers.sqlite.hooks.sqlite import SqliteHook


def getdf():
    import os.path

    # conn_id and BASE_DIR are defined at module level in the source file
    h = SqliteHook(conn_id)
    df = h.get_pandas_df(
        """SELECT A.DocumentNo, A.FullName, A.Device, A.Country,
                  B.OrderId, B.DocumentNo, B.OrderDate, B.CatalogId,
                  C.CatalogId, C.ProductId, C.CUID,
                  D.ProductId, D.ProductName, D.CUID
           FROM CUSTOMERS AS A
           INNER JOIN ORDERS AS B ON A.DocumentNo = B.DocumentNo
           INNER JOIN CATALOG AS C ON C.CatalogId = B.CatalogId
           INNER JOIN PRODUCTS AS D ON C.CUID = D.CUID"""
    )
    print(df)
    df.to_csv(os.path.join(BASE_DIR, "SourceData.txt"), index=False)
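The snippet relies on two module-level globals that are not shown. A minimal sketch of how they might be declared (both values are placeholders, not taken from the source):

import os

conn_id = "sqlite_default"  # placeholder Airflow connection id
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # placeholder output directory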
Example #3
from airflow.providers.sqlite.hooks.sqlite import SqliteHook


def predict(classifier, **context):
    """
    Makes predictions with the model trained by the given classifier and
    stores the results in the mct_talks table.
    """
    # Load model
    model = _load_model(classifier.__name__)

    # Load data
    db = SqliteHook()
    df = db.get_pandas_df('select * from mct_talks')

    # Make predictions
    df['Conference'] = model.predict(df['Title'].tolist())

    # Save predictions
    with db.get_conn() as conn:
        df.to_sql('mct_talks', con=conn, if_exists='replace')
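_load_model() is a project-level helper that is not shown in this snippet. A plausible minimal sketch, assuming models are persisted with joblib under a models/ directory (both the path and the file format are assumptions):

import os
import joblib

MODELS_DIR = 'models'  # assumed location

def _load_model(name):
    # Load a previously persisted scikit-learn model by classifier name
    return joblib.load(os.path.join(MODELS_DIR, f'{name}.pkl'))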
Example #4
from airflow.providers.sqlite.hooks.sqlite import SqliteHook
from sklearn.model_selection import train_test_split


def split_data(**context):
    """
    Splits the sample data (i.e. research_papers) into training and test
    sets and stores them in the SQLite DB.
    """
    # Load full dataset
    db = SqliteHook()
    df = db.get_pandas_df('select * from research_papers')

    # Create train/test split
    train, _ = train_test_split(df.index,
                                test_size=0.33,
                                stratify=df['Conference'],
                                random_state=42)

    # Save training and test rows in separate tables; get_pandas_df returns
    # a default RangeIndex, so the split's positional indices also work as
    # labels for drop()
    with db.get_conn() as conn:
        df.iloc[train].to_sql('training_data', con=conn, if_exists='replace')
        df.drop(train).to_sql('test_data', con=conn, if_exists='replace')
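A companion training task would read the training split back and fit the classifier. A minimal sketch, assuming research_papers carries the same Title/Conference columns seen in Example #3 and that models are saved where the (assumed) _load_model() helper expects them; all names below are illustrative:

import joblib
from airflow.providers.sqlite.hooks.sqlite import SqliteHook


def train_model(classifier, **context):
    # Fit the given classifier factory on the training split
    db = SqliteHook()
    train = db.get_pandas_df('select * from training_data')

    model = classifier()  # e.g. a scikit-learn Pipeline factory
    model.fit(train['Title'].tolist(), train['Conference'])

    # Persist for a later predict() task to pick up
    joblib.dump(model, f'models/{classifier.__name__}.pkl')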