def preprocessing_walmart(dataset):  # 2nd function definition: preprocess the dataset
    dataset['Day'] = pd.to_datetime(dataset['Date'])
    # build the pdpipe pipeline
    panda_pipe = pdp.ApplyByCols('Day', lambda x: (x.day // 7) + 1, 'Week_no', drop=False)  # convert the given day into week of the month
    panda_pipe += pdp.ApplyByCols('Day', lambda x: x.month, 'month', drop=False)  # get the month from the date
    panda_pipe += pdp.ColDrop(['Date', 'Day'])
    dataset = panda_pipe(dataset)
    dataset['Lag2'] = dataset['Weekly_Sales'].shift(2)
    dataset['Lag3'] = dataset['Weekly_Sales'].shift(3)
    dataset['Lag4'] = dataset['Weekly_Sales'].shift(4)
    dataset['Lag5'] = dataset['Weekly_Sales'].shift(5)
    dataset['Lag6'] = dataset['Weekly_Sales'].shift(6)
    to_be_predicted = dataset['Weekly_Sales']
    dataset = dataset.drop(columns=['Weekly_Sales'])
    X_train, X_test, Y_train, Y_test = train_test_split(dataset, to_be_predicted,
                                                        random_state=42, test_size=0.3)
    return (X_train, Y_train, X_test)
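# A minimal, hypothetical usage sketch for preprocessing_walmart, assuming the
# standard imports and a Walmart sales CSV with 'Date' and 'Weekly_Sales'
# columns (the file name is an assumption, not taken from the snippet above).
import pandas as pd
import pdpipe as pdp
from sklearn.model_selection import train_test_split

walmart_df = pd.read_csv('walmart_sales.csv')  # hypothetical input file
X_train, Y_train, X_test = preprocessing_walmart(walmart_df)
print(X_train.shape, X_test.shape)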
def main(): print("Qual modelo quer treinar?") value = input( "1: Regressão Logistica, 2: Multinomial Naive Bayes, 3: CNN, digite o numero correspondente ao modelo:" ) # import dataset train, test = common_modules.merge_files() if value == '1': pipeline = pdp.ApplyByCols("texto", common_modules.tag_remove, "clean_texto", drop=False) pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto) train = pipeline(train) test = pipeline(test) print("Treinando modelo de regressão logistica") regressao_logistica.train_model(train.texto, train.label, test.texto, test.label) if value == '2': pipeline = pdp.ApplyByCols("texto", common_modules.tag_remove, "clean_texto", drop=False) pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto) train = pipeline(train) test = pipeline(test) print("treinando modelo de Multinomial Naive Bayes") naive_bayes.train_model(train.texto, train.label, test.texto, test.label) if value == '3': import CNN print("treinando modelo de CNN") CNN.train_model(train, test)
def make_conversion_pipe():
    """Creates the pandas pipeline for the transformation of the UN Geoscheme DataFrame."""
    pipeline = pdp.ColRename({i: str(i) for i in range(0, 5)})
    pipeline += pdp.ApplyByCols(
        ['country/region', 'numeric', '0', '1', '2', '3', '4'],
        func=replace_new_lines)
    pipeline += pdp.ApplyByCols(['numeric', '0', '1', '2', '3', '4'],
                                func=clean_out_world)
    pipeline += pdp.DropNa(axis=1, how='all')
    return pipeline
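# A hedged sketch of how make_conversion_pipe might be applied. The sample
# DataFrame and the stand-in helpers replace_new_lines / clean_out_world are
# assumptions for illustration only; the original helpers are not shown here.
import pandas as pd
import pdpipe as pdp

def replace_new_lines(value):
    # hypothetical stand-in: strip embedded newlines from string cells
    return value.replace('\n', ' ') if isinstance(value, str) else value

def clean_out_world(value):
    # hypothetical stand-in: blank out aggregate 'World' entries
    return None if value == 'World' else value

raw = pd.DataFrame({'country/region': ['Algeria\n'], 'numeric': ['012'],
                    0: ['Africa'], 1: ['Northern Africa'], 2: [None],
                    3: [None], 4: [None]})
clean = make_conversion_pipe()(raw)  # all-NaN columns are dropped by DropNa
print(clean.head())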
def main():
    train, test = common_modules.merge_files()
    pipeline = pdp.ApplyByCols("texto", common_modules.tag_remove, "clean_texto", drop=False)
    pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto)
    train = pipeline(train)
    test = pipeline(test)
    reg_log(train.texto, train.label, test.texto, test.label)
    naive_bayes(train.texto, train.label, test.texto, test.label)
def _original_code():
    start = time.time()
    salesdata = pd.read_csv("processed_salesdata.csv")
    pline = pdp.PdPipeline([
        pdp.Schematize(COLUMNS),
        pdp.ApplyByCols("category_group",
                        lambda x: "tops" if x == "tops" else "other"),
        pdp.ApplyByCols(["date", "shelf_date", "end_of_season"], pd.to_datetime),
        pdp.ApplyToRows(lambda row: pd.Series({
            "standard_amount": row["original_price"] * row["sales"],
            "sales_discount": 0 if (row["original_price"] * row["sales"] <= 0)
            else row["sales_amount"] / (row["original_price"] * row["sales"]),
            "week": int(row["date"].strftime('%W')),
            "days_on_counter": (row["date"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "life_cycle": (row["end_of_season"] - row["shelf_date"]) / np.timedelta64(1, 'D'),
            "C1": 1 if row["category_group"] == "tops" else 0,
            "C2": 1 if row["category_group"] == "other" else 0,
            "sales": 0 if row["sales"] < 0 else row["sales"],
            "passenger_flow": 0 if row["passenger_flow"] < 0 else row["passenger_flow"],
            "plus_purchase": 0 if row["plus_purchase"] < 0 else row["plus_purchase"],
        })),
        pdp.AdHocStage(lambda df: df[df["days_on_counter"] <= df["life_cycle"]]),
        pdp.ColDrop("activity_level")
    ])
    salesdata = pline.apply(salesdata, verbose=True, exraise=True)
    salesdata_cumitems = salesdata[[
        "SKC", "date", "sales", "passenger_flow", "plus_purchase"
    ]].sort_values(by=["SKC", "date"]).groupby(['SKC']).cumsum()
    salesdata_cumitems.columns = [
        "total_sales", "total_passenger_flow", "total_plus_purchase"
    ]
    salesdata["total_sales"] = salesdata_cumitems["total_sales"]
    salesdata["total_passenger_flow"] = salesdata_cumitems["total_passenger_flow"]
    salesdata["total_plus_purchase"] = salesdata_cumitems["total_plus_purchase"]
    print("consumed time(s)=", time.time() - start)
def uniformize_boolean(columns):
    start_time = time.time()
    true_strings = ['t', 'true', 'yes', 'y', True]
    false_strings = ['f', 'false', 'n', 'no', False]
    func_true = lambda x: 1.0 if x in true_strings else x
    func_false = lambda x: 0.0 if x in false_strings else x
    result = pdp.ApplyByCols(columns, func_true) + pdp.ApplyByCols(columns, func_false)
    time_elapsed = time.time() - start_time
    print("uniformize_boolean:", time_elapsed)
    return result
def check_sk_pipeline():
    pline = pdp.make_pdpipeline(
        pdp.ApplyByCols("ph", lambda x: x - 1),
        # pdp.Bin({"ph": [0, 3, 5, 12]}),
        pdp.Encode(["type", "lbl"]),
    )
    print(pline)
    model_pline = make_pipeline(
        pdp.FreqDrop(2, "lbl"),
        LogisticRegression(),
    )
    print(model_pline)
    train = _train_df()
    res_train = pline(train)
    print("Processed train set: {}".format(res_train))
    x_train, y_train = x_y_by_col_lbl(res_train, "lbl")
    model_pline = model_pline.fit(x_train, y_train)
    print("Fitted model pipeline: {}".format(model_pline))
    test = _test_df()
    res_test = pline(test)
    print("Processed test set: {}".format(res_test))
    x_test, y_test = x_y_by_col_lbl(res_test, "lbl")
    predictions = model_pline.predict(x_test)
    print("predictions: {}".format(predictions))
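# The helpers used above (_train_df, _test_df, x_y_by_col_lbl) are not shown.
# Below is one plausible minimal sketch of them, assuming a toy dataset with a
# numeric 'ph' column, a categorical 'type' column and a 'lbl' target; the
# exact shapes and values are assumptions for illustration only.
import pandas as pd

def _train_df():
    return pd.DataFrame({'ph': [4.0, 7.0, 9.0, 6.0],
                         'type': ['a', 'b', 'a', 'b'],
                         'lbl': [0, 1, 0, 1]})

def _test_df():
    return pd.DataFrame({'ph': [5.0, 8.0],
                         'type': ['a', 'b'],
                         'lbl': [0, 1]})

def x_y_by_col_lbl(df, lbl_col):
    # split a processed frame into features X and target y by label column name
    return df.drop(columns=[lbl_col]), df[lbl_col]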
def uniformize_missing(columns):
    start_time = time.time()
    missing_values_strings = ['NaN', '??', '*', 'UNK', '-', '###']
    func = lambda x: np.nan if x in missing_values_strings else x
    time_elapsed = time.time() - start_time
    result = pdp.ApplyByCols(columns, func)
    print("uniformize_missing:", time_elapsed)
    return result
def uniformize_percentage():
    start_time = time.time()
    percentage_columns = ['host_response_rate']
    func = lambda x: float(x[:-1]) if type(x) == str else x
    result = pdp.ApplyByCols(percentage_columns, func)
    time_elapsed = time.time() - start_time
    print("uniformize_percentage:", time_elapsed)
    return result
def main():
    train, test = common_modules.merge_files()
    pipeline = pdp.ApplyByCols("texto", common_modules.tag_remove, "clean_texto", drop=False)
    pipeline += pdp.ApplyByCols("clean_texto", common_modules.trat_texto)
    train = pipeline(train)
    print("Running grid search for logistic regression")
    score_log_reg, param_log_reg = reg_log(train.texto, train.label)
    print(f"Logistic regression best score {score_log_reg}, with parameters {param_log_reg}")
    print("Running grid search for SVM")
    score_svm, param_svm = sup_vec(train.texto, train.label)
    print(f"Support vector best score {score_svm}, with parameters {param_svm}")
    print("Running grid search for Naive Bayes")
    score_nb, param_nb = naive_bayes(train.texto, train.label)
    print(f"Multinomial Naive Bayes best score {score_nb}, with parameters {param_nb}")
def uniformize_monetary():
    start_time = time.time()
    monetary_columns = [
        'price', 'weekly_price', 'monthly_price', 'security_deposit',
        'cleaning_fee', 'extra_people'
    ]
    func = lambda x: float(x[1:].replace(',', '')) if type(x) == str else x
    result = pdp.ApplyByCols(monetary_columns, func)
    time_elapsed = time.time() - start_time
    print("uniformize_monetary:", time_elapsed)
    return result
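# The uniformize_* helpers above each return a pdpipe stage (or short
# pipeline), so they can be chained with '+' into one cleaning pipeline.
# A hedged sketch, assuming an Airbnb-style listings DataFrame that actually
# contains the hard-coded columns each helper expects (sample data is made up):
import pandas as pd

listings = pd.DataFrame({
    'host_is_superhost': ['t', 'f'],
    'host_response_rate': ['95%', None],
    'price': ['$1,200.00', '$80.00'],
    'weekly_price': [None, None],
    'monthly_price': [None, None],
    'security_deposit': ['$0.00', None],
    'cleaning_fee': ['$25.00', '-'],
    'extra_people': ['$10.00', '$0.00'],
})
cleaning_pipeline = (uniformize_missing(['cleaning_fee'])
                     + uniformize_boolean(['host_is_superhost'])
                     + uniformize_percentage()
                     + uniformize_monetary())
clean_listings = cleaning_pipeline(listings)
print(clean_listings.dtypes)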
cnxn, cursor = sql_data.connect()
df = sql_data.executeQueryFromFile(cnxn)

sql_subwaystations = bsql.SQL(sql_file_name_coordinates_subwaystations, sql_file_path)
cnxn, cursor = sql_subwaystations.connect()
df_subwaystations = sql_subwaystations.executeQueryFromFile(cnxn)

sql_addresses = bsql.SQL(sql_file_name_coordinates_addresses, sql_file_path)
cnxn, cursor = sql_addresses.connect()
df_addresses = sql_addresses.executeQueryFromFile(cnxn)

geodesic(df_subwaystations[['Latitude', 'Longitude']],
         df_addresses[['Latitude', 'Longitude']])

pipeline = pdp.ApplyByCols('District', District_transformation, 'District')
pipeline += pdp.ApplyByCols('BuiltYear', builtYear_transformation, 'BuiltYear')
pipeline += pdp.RowDrop({'District': lambda x: x is None})
pipeline += pdp.RowDrop({'OperatingCostInSek': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'NumberOfRooms': lambda x: x == 0})
pipeline += pdp.RowDrop({'FloorNumber': lambda x: pd.isnull(x)})
pipeline += pdp.RowDrop({'BuiltYear': lambda x: pd.isnull(x)})
pipeline += pdp.OneHotEncode('District')
pipeline += pdp.OneHotEncode('BuiltYear')
pipeline += pdp.ColDrop(['Address'])

df_pipeline = pipeline(df)
variables = GoMining(df_pipeline)
MiningReport(variables)

formula = 'SoldPricePerSquaredMeterInSek ~ MonthlyChargeInSek + \
data = pd.read_csv('C:\\Users\\13810\\tmdb_5000_movies.csv')

# In[8]:

import pdpipe as pdp  # import the pdpipe package

# In[13]:

# Build a pdp.PdPipeline; pdp.ColDrop, pdp.ApplyByCols and pdp.RowDrop are
# commonly used pdpipe stages, each with a fixed calling convention.
first_pipeline = pdp.PdPipeline([
    pdp.ColDrop("original_title"),  # ColDrop drops one or more specified columns
    pdp.ApplyByCols(columns=['title'], func=lambda x: x.lower()),  # apply the function element-wise to the column
    pdp.RowDrop({'vote_average': lambda x: x <= 7,
                 'original_language': lambda x: x != 'en'}),  # drop rows matching the conditions
    pdp.ApplyByCols(columns=['genres'],
                    func=lambda x: len([item['name'] for item in eval(x)]),
                    result_columns=['genres_num']),
    pdp.RowDrop({'genres_num': lambda x: x <= 5})
])

# In[19]:

data1 = first_pipeline(data, verbose=True).reset_index(drop=True)

# In[20]:
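# The 'genres' column holds stringified lists of dicts, so eval() works but
# will execute arbitrary expressions. A hedged alternative for the same stage,
# assuming the field only ever contains Python literals, is ast.literal_eval:
import ast
import pdpipe as pdp

genres_count_stage = pdp.ApplyByCols(
    columns=['genres'],
    func=lambda x: len([item['name'] for item in ast.literal_eval(x)]),
    result_columns=['genres_num'])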
!gsutil -m cp -r gs://{bucket_name}/* /content/drive/My\ Drive/CoronaTweets/

base_db_folder = '/content/drive/My Drive/CoronaTweets'
tweet_db_paths = [
    # incomplete data - '/corona_tweets_1M.db/corona_tweets_1M.db',  # 27.02.2020 10:36  01.03.2020 18:24  1578957
    # malformed - '/corona_tweets_2M_2/corona_tweets_2M_2.db',       # 02.03.2020 17:27  07.03.2020 4:57   2268665
    '/corona_tweets_3M/tweets.db',    # 07.03.2020 5:06   14.03.2020 4:46  7472368
    '/corona_tweets_1M/tweets.db',    # 14.03.2020 5:23   15.03.2020 3:16  1903768
    '/corona_tweets_2M_3/tweets.db',  # 15.03.2020 3:28   16.03.2020 4:31  2081576
    '/corona_tweets_1M_2/tweets.db',  # 16.03.2020 4:38   17.03.2020 3:08  1889781
    '/corona_tweets_2L/tweets.db'     # 17.03.2020 3:12   17.03.2020 6:10  280304
]

pipeline = pdp.PdPipeline([
    pdp.ColRename({'unix': 'tweet_date'}),
    pdp.ApplyByCols('sentiment', is_positive, 'is_positive', drop=False),
    pdp.ApplyByCols('sentiment', is_negative, 'is_negative', drop=False),
    pdp.ApplyByCols('sentiment', is_neutral, 'is_neutral', drop=False),
])

tweets_df = pd.DataFrame()
for tweets_db in tweet_db_paths:
    full_tweet_db_path = base_db_folder + tweets_db
    print(dt.datetime.now(), "Processing started: ", full_tweet_db_path)
    conn = sqlite3.connect(full_tweet_db_path)
    c = conn.cursor()
    df_pie = pd.read_sql("SELECT * FROM sentiment", conn)
    df_pie['unix'] = pd.to_datetime(df_pie['unix'], unit='ms').dt.date  # cast to date
    df = pipeline.apply(df_pie).sort_values(by=['tweet_date'])