def build(self) -> pdp.PdPipeline:
    """Builds the pipeline based on the stages found in `config.yaml`."""
    self.pipeline = pdp.PdPipeline(list(self.stages))
    self.built = True
    return self.pipeline
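# Minimal usage sketch (an assumption, not from the source): the surrounding
# builder class is hypothetical; only the build() method above is original.
import pdpipe as pdp

class PipelineBuilder:
    def __init__(self, stages):
        self.stages = stages  # e.g. stage objects parsed from config.yaml
        self.pipeline = None
        self.built = False

    def build(self) -> pdp.PdPipeline:
        self.pipeline = pdp.PdPipeline(list(self.stages))
        self.built = True
        return self.pipeline

builder = PipelineBuilder([pdp.ColDrop("unused_col")])  # hypothetical stage
pipeline = builder.build()  # pipeline.apply(df) then runs all stages in order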
import time

import numpy as np
import pandas as pd
import pdpipe as pdp


def _original_code():
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")
    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),  # COLUMNS is defined elsewhere
        pdp.ApplyByCols("category_group",
                        lambda x: "tops" if x == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"], pd.to_datetime),
        pdp.ApplyToRows(lambda row: pd.Series({
            "standard_amount": row["original_price"] * row["sales"],
            "sales_discount": 0 if (row["original_price"] * row["sales"] <= 0)
                else row["sales_amount"] / (row["original_price"] * row["sales"]),
            "week": int(row["date"].strftime("%W")),
            "days_on_counter":
                (row["date"] - row["shelf_date"]) / np.timedelta64(1, "D"),
            "life_cycle":
                (row["end_of_season"] - row["shelf_date"]) / np.timedelta64(1, "D"),
            "C1": 1 if row["category_group"] == "tops" else 0,
            "C2": 1 if row["category_group"] == "other" else 0,
            "sales": 0 if row["sales"] < 0 else row["sales"],
            "passenger_flow": 0 if row["passenger_flow"] < 0 else row["passenger_flow"],
            "plus_purchase": 0 if row["plus_purchase"] < 0 else row["plus_purchase"],
        })),
        pdp.AdHocStage(lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level"),
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)

    # Running per-SKC totals; "date" is used only to order the rows before
    # the cumulative sum (selecting the numeric columns explicitly avoids
    # cumsum choking on the datetime column in recent pandas).
    salesdata_cumitems = (
        salesdata[["SKC", "date", "sales", "passenger_flow", "plus_purchase"]]
        .sort_values(by=["SKC", "date"])
        .groupby("SKC")[["sales", "passenger_flow", "plus_purchase"]]
        .cumsum()
    )
    salesdata_cumitems.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]
    salesdata["total_sales"] = salesdata_cumitems["total_sales"]
    salesdata["total_passenger_flow"] = salesdata_cumitems["total_passenger_flow"]
    salesdata["total_plus_purchase"] = salesdata_cumitems["total_plus_purchase"]
    print("consumed time(s)=", time.time() - start)
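# Sketch (an assumption, not from the source): the cumulative-totals step
# above could itself be folded into the pipeline as an AdHocStage, so the
# whole transformation lives in one PdPipeline. The helper name is made up.
import pandas as pd
import pdpipe as pdp


def _add_cumulative_totals(df: pd.DataFrame) -> pd.DataFrame:
    # Running per-SKC totals in date order; mirrors the columns assigned
    # manually above. join() aligns on the original row index.
    cums = (df.sort_values(by=["SKC", "date"])
              .groupby("SKC")[["sales", "passenger_flow", "plus_purchase"]]
              .cumsum())
    cums.columns = ["total_sales", "total_passenger_flow", "total_plus_purchase"]
    return df.join(cums)


cumulative_stage = pdp.AdHocStage(transform=_add_cumulative_totals)
# e.g. pline + cumulative_stage, or listed as the final stage of the pipeline.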
# In[8]:

import ast

import pdpipe as pdp  # import the pdpipe package


# In[13]:

# Create a pdp.PdPipeline. ColDrop, ApplyByCols and RowDrop are commonly used
# pdpipe stages, each with a fixed calling convention.
first_pipeline = pdp.PdPipeline([
    # ColDrop drops one or more named columns.
    pdp.ColDrop("original_title"),
    # ApplyByCols applies the function element-wise to each listed column.
    pdp.ApplyByCols(columns=["title"], func=lambda x: x.lower()),
    # RowDrop removes rows for which any of the per-column conditions holds.
    pdp.RowDrop({"vote_average": lambda x: x <= 7,
                 "original_language": lambda x: x != "en"}),
    # Parse the stringified genre list and keep only its length
    # (ast.literal_eval is a safer replacement for the original eval).
    pdp.ApplyByCols(columns=["genres"],
                    func=lambda x: len([item["name"] for item in ast.literal_eval(x)]),
                    result_columns=["genres_num"]),
    pdp.RowDrop({"genres_num": lambda x: x <= 5}),
])


# In[19]:

data1 = first_pipeline(data, verbose=True).reset_index(drop=True)


# In[20]:
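# Tiny demo of the pipeline above (the rows here are made up; the real `data`
# is a movies DataFrame loaded earlier in the notebook).
import pandas as pd

demo = pd.DataFrame({
    "original_title": ["Seven", "Sieben"],
    "title": ["SEVEN", "SIEBEN"],
    "vote_average": [8.0, 6.5],
    "original_language": ["en", "de"],
    "genres": [
        '[{"name": "Crime"}, {"name": "Drama"}, {"name": "Mystery"}, '
        '{"name": "Thriller"}, {"name": "Horror"}, {"name": "Noir"}]',
        '[{"name": "Crime"}]',
    ],
})
# Only the first row survives: vote_average > 7, language is "en", and it has
# more than 5 genres; its title is lowercased and genres becomes genres_num.
print(first_pipeline(demo))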
import datetime as dt
import sqlite3

import pandas as pd
import pdpipe as pdp

# Copy the tweet databases from Cloud Storage into Drive.
!gsutil -m cp -r gs://{bucket_name}/* /content/drive/My\ Drive/CoronaTweets/

base_db_folder = '/content/drive/My Drive/CoronaTweets'
tweet_db_paths = [
    # incomplete data - '/corona_tweets_1M.db/corona_tweets_1M.db',  # 27.02.2020 10:36  01.03.2020 18:24  1578957
    # malformed - '/corona_tweets_2M_2/corona_tweets_2M_2.db',       # 02.03.2020 17:27  07.03.2020 4:57   2268665
    '/corona_tweets_3M/tweets.db',    # 07.03.2020 5:06   14.03.2020 4:46   7472368
    '/corona_tweets_1M/tweets.db',    # 14.03.2020 5:23   15.03.2020 3:16   1903768
    '/corona_tweets_2M_3/tweets.db',  # 15.03.2020 3:28   16.03.2020 4:31   2081576
    '/corona_tweets_1M_2/tweets.db',  # 16.03.2020 4:38   17.03.2020 3:08   1889781
    '/corona_tweets_2L/tweets.db',    # 17.03.2020 3:12   17.03.2020 6:10   280304
]

pipeline = pdp.PdPipeline([
    pdp.ColRename({'unix': 'tweet_date'}),
    pdp.ApplyByCols('sentiment', is_positive, 'is_positive', drop=False),
    pdp.ApplyByCols('sentiment', is_negative, 'is_negative', drop=False),
    pdp.ApplyByCols('sentiment', is_neutral, 'is_neutral', drop=False),
])

tweets_df = pd.DataFrame()
for tweets_db in tweet_db_paths:
    full_tweet_db_path = base_db_folder + tweets_db
    print(dt.datetime.now(), "Processing started: ", full_tweet_db_path)
    conn = sqlite3.connect(full_tweet_db_path)
    c = conn.cursor()
    df_pie = pd.read_sql("SELECT * FROM sentiment", conn)
    df_pie['unix'] = pd.to_datetime(df_pie['unix'], unit='ms').dt.date  # cast to date
    df = pipeline.apply(df_pie).sort_values(by=['tweet_date'])
    if df.shape[0] < 1:
        continue  # assumed completion: the source is truncated here; skip empty databases
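# Sketch of the sentiment predicates the pipeline above references. They are
# not defined in this excerpt; the signatures and the 0.2 cutoff below are
# assumptions, not the source's actual thresholds.
def is_positive(sentiment: float) -> int:
    # 1 if the compound sentiment score is clearly positive (assumed cutoff).
    return 1 if sentiment > 0.2 else 0


def is_negative(sentiment: float) -> int:
    # 1 if the score is clearly negative (assumed cutoff).
    return 1 if sentiment < -0.2 else 0


def is_neutral(sentiment: float) -> int:
    # 1 if the score falls inside the assumed neutral band.
    return 1 if -0.2 <= sentiment <= 0.2 else 0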