Code example #1
0
    def build(self) -> pdp.PdPipeline:
        """
        Build the pipeline from the stages configured on this instance
        (as loaded from `config.yaml`).

        Returns
        -------
        pdp.PdPipeline
            The assembled pipeline. It is also stored on ``self.pipeline``,
            and ``self.built`` is set to True as a side effect.
        """
        # list(self.stages) copies the stage sequence in one call instead of
        # an element-by-element comprehension (same result, clearer intent).
        self.pipeline = pdp.PdPipeline(list(self.stages))
        self.built = True
        return self.pipeline
Code example #2
0
File: test_issue_29.py  Project: nononome/pdpipe
def _original_code():
    """Timing repro: run the original sales-data pipeline and print elapsed seconds."""
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")

    def _derive_features(row):
        # Row-level feature engineering, applied via pdp.ApplyToRows.
        revenue = row["original_price"] * row["sales"]
        return pd.Series({
            "standard_amount": revenue,
            "sales_discount":
                0 if revenue <= 0 else row["sales_amount"] / revenue,
            "week": int(row["date"].strftime('%W')),
            "days_on_counter":
                (row["date"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "life_cycle":
                (row["end_of_season"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "C1": 1 if row["category_group"] == "tops" else 0,
            "C2": 1 if row["category_group"] == "other" else 0,
            "sales": 0 if row["sales"] < 0 else row["sales"],
            "passenger_flow":
                0 if row["passenger_flow"] < 0 else row["passenger_flow"],
            "plus_purchase":
                0 if row["plus_purchase"] < 0 else row["plus_purchase"],
        })

    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),
        # Collapse category_group to a binary "tops"/"other" label.
        pdp.ApplyByCols("category_group",
                        lambda g: "tops" if g == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"],
                        pd.to_datetime),
        pdp.ApplyToRows(_derive_features),
        # Keep only rows still within their product life cycle.
        pdp.AdHocStage(
            lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level"),
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)

    # Running totals per SKC, accumulated in date order.
    totals = salesdata[[
        "SKC", "date", "sales", "passenger_flow", "plus_purchase"
    ]].sort_values(by=["SKC", "date"]).groupby(["SKC"]).cumsum()
    totals.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]
    for name in totals.columns:
        salesdata[name] = totals[name]
    print("consumed time(s)=", time.time() - start)
Code example #3
0

# In[8]:


import pdpipe as pdp  # import the pdpipe package


# In[13]:


# Build a pdp.PdPipeline; pdp.ColDrop, pdp.ApplyByCols and pdp.RowDrop are common pdpipe APIs with a fixed syntax.
first_pipeline = pdp.PdPipeline([
    # Drop the 'original_title' column (single or multiple columns accepted).
    pdp.ColDrop("original_title"),
    # Lower-case every value of the 'title' column.
    pdp.ApplyByCols(columns=['title'], func=lambda x: x.lower()),
    # Drop rows with a low vote average or a non-English original language.
    pdp.RowDrop({'vote_average': lambda x: x <= 7,
                 'original_language': lambda x: x != 'en'}),
    # Count genres per row into 'genres_num'. len(...) replaces the
    # unidiomatic .__len__() call.
    # WARNING(review): eval() on values read from a data file can execute
    # arbitrary code; prefer ast.literal_eval for parsing literal strings.
    # Kept as eval() here to preserve behavior — review before running on
    # untrusted data.
    pdp.ApplyByCols(columns=['genres'],
                    func=lambda x: len([item['name'] for item in eval(x)]),
                    result_columns=['genres_num']),
    # Keep only rows with more than five genres.
    pdp.RowDrop({'genres_num': lambda x: x <= 5}),
])


# In[19]:


# Run the pipeline (verbose=True prints per-stage progress) and reset the
# row index of the result.
# NOTE(review): two truncated duplicate fragments of pipeline stages that
# followed this line were scrape artifacts (a SyntaxError) and were removed.
data1 = first_pipeline(data, verbose=True).reset_index(drop=True)


# In[20]:
Code example #4
0
!gsutil -m cp -r gs://{bucket_name}/* /content/drive/My\ Drive/CoronaTweets/

# Local folder (a mounted Google Drive in Colab) holding the tweet databases.
base_db_folder = '/content/drive/My Drive/CoronaTweets'
# Relative paths of the per-period SQLite tweet databases. Each trailing
# comment records the collection window and tweet count; two sources are
# kept commented out because their data is incomplete / malformed.
tweet_db_paths = [
    # incomplete data - '/corona_tweets_1M.db/corona_tweets_1M.db',   # 27.02.2020 10:36 01.03.2020 18:24 1578957
    # malformed - '/corona_tweets_2M_2/corona_tweets_2M_2.db',  # 02.03.2020 17:27	07.03.2020 4:57	2268665
    '/corona_tweets_3M/tweets.db',  # 07.03.2020 5:06	14.03.2020 4:46	7472368
    '/corona_tweets_1M/tweets.db',  # 14.03.2020 5:23	15.03.2020 3:16	1903768
    '/corona_tweets_2M_3/tweets.db',  # 15.03.2020 3:28	16.03.2020 4:31	2081576
    '/corona_tweets_1M_2/tweets.db',  # 16.03.2020 4:38	17.03.2020 3:08	1889781
    '/corona_tweets_2L/tweets.db'  # 17.03.2020 3:12	17.03.2020 6:10	280304
]

# Pipeline: rename the raw 'unix' column to 'tweet_date', then derive three
# sentiment flag columns from 'sentiment' (the source column is kept via
# drop=False).
_flag_stages = [
    pdp.ApplyByCols('sentiment', classifier, flag_col, drop=False)
    for flag_col, classifier in (
        ('is_positive', is_positive),
        ('is_negative', is_negative),
        ('is_neutral', is_neutral),
    )
]
pipeline = pdp.PdPipeline(
    [pdp.ColRename({'unix': 'tweet_date'})] + _flag_stages
)

# Accumulator for processed tweets across all databases.
tweets_df = pd.DataFrame()

for tweets_db in tweet_db_paths:
    full_tweet_db_path = base_db_folder + tweets_db
    print(dt.datetime.now(), "Processing started: ", full_tweet_db_path)
    conn = sqlite3.connect(full_tweet_db_path)
    c = conn.cursor()
    df_pie = pd.read_sql("SELECT * FROM sentiment", conn)
    df_pie['unix'] = pd.to_datetime(df_pie['unix'], unit='ms').dt.date  # cast to date
    df = pipeline.apply(df_pie).sort_values(by=['tweet_date'])

    if df.shape[0] < 1: