def test_init_fit_predict_spark_pandas(self):
    from lale.datasets import pandas2spark
    from lale.datasets.util import spark_installed

    if spark_installed:
        trainable_cf = ConcatFeatures()
        A = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
        B = [[14, 15], [24, 25], [34, 35]]
        A = pd.DataFrame(A, columns=["a", "b", "c"])
        B = pd.DataFrame(B, columns=["d", "e"])
        A = pandas2spark(A, add_index=True)
        A = add_table_name(A, "A")
        B = add_table_name(B, "B")
        trained_cf = trainable_cf.fit(X=[A, B])
        transformed = trained_cf.transform([A, B])
        expected = [
            [11, 12, 13, 14, 15],
            [21, 22, 23, 24, 25],
            [31, 32, 33, 34, 35],
        ]
        expected = pd.DataFrame(expected, columns=["a", "b", "c", "d", "e"])
        for c in expected.columns:
            self.assertEqual(list(transformed[c]), list(expected[c]))
def transform_pandas_df(self, X):
    mapped_df = pd.DataFrame()
    accessed_column_names = set()

    def get_map_function_output(column, new_column_name):
        _validate(X, column)
        new_column_name = _new_column_name(new_column_name, column)
        new_column = eval_expr_pandas_df(X, column)
        mapped_df[new_column_name] = new_column
        accessed_column_names.add(new_column_name)
        accessed_column_names.update(_accessed_columns(column))

    columns = self.columns
    if callable(columns):
        columns = columns(X)

    if isinstance(columns, list):
        for column in columns:
            get_map_function_output(column, None)
    elif isinstance(columns, dict):
        for new_column_name, column in columns.items():
            get_map_function_output(column, new_column_name)
    else:
        raise ValueError("columns must be either a list or a dictionary.")
    if self.remainder == "passthrough":
        remainder_columns = [x for x in X.columns if x not in accessed_column_names]
        mapped_df[remainder_columns] = X[remainder_columns]
    table_name = get_table_name(X)
    mapped_df = add_table_name(mapped_df, table_name)
    return mapped_df
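# Hedged usage sketch, not part of the original code: it assumes the method above
# belongs to the lale.lib.lale Map operator and that `it` comes from lale.expressions.
# It illustrates how a dict-valued `columns` renames/selects columns, how
# remainder="passthrough" keeps the untouched columns, and how the table name set by
# add_table_name is carried over to the mapped dataframe. The column names "a", "b"
# and the table name "main" are made up for illustration.
import pandas as pd

from lale.datasets.data_schemas import add_table_name, get_table_name
from lale.expressions import it
from lale.lib.lale import Map

df = add_table_name(pd.DataFrame({"a": [1, 2], "b": [3, 4]}), "main")
mapper = Map(columns={"a_renamed": it.a}, remainder="passthrough")
mapped = mapper.fit(df).transform(df)
assert list(mapped.columns) == ["a_renamed", "b"]
assert get_table_name(mapped) == "main"  # table name is preserved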
def to_monoid(self, batch: _Batch) -> _AccuracyData:
    from lale.lib.rasl import Scan

    y_true, y_pred = batch
    assert isinstance(y_true, pd.Series), type(y_true)  # TODO: Spark
    if isinstance(y_pred, np.ndarray):
        y_pred = pd.Series(y_pred, y_true.index, y_true.dtype, "y_pred")
    assert isinstance(y_pred, pd.Series), type(y_pred)  # TODO: Spark
    y_true = add_table_name(pd.DataFrame(y_true), "y_true")
    y_pred = add_table_name(pd.DataFrame(y_pred), "y_pred")
    prefix_true = Scan(table=it.y_true) >> Map(
        columns={"y_true": it[get_columns(y_true)[0]]}
    )
    prefix_pred = Scan(table=it.y_pred) >> Map(
        columns={"y_pred": it[get_columns(y_pred)[0]]}
    )
    pipeline = (prefix_true & prefix_pred) >> self._pipeline_suffix
    agg_df = _ensure_pandas(pipeline.transform([y_true, y_pred]))
    return _AccuracyData(*agg_df.iloc[0])
def fetch_go_sales_dataset(datatype="pandas"): """ Fetches the Go_Sales dataset from IBM's Watson's ML samples. It contains information about daily sales, methods, retailers and products of a company in form of 5 CSV files. This method downloads and stores these 5 CSV files under the 'lale/lale/datasets/multitable/go_sales_data' directory. It creates this directory by itself if it does not exists. Dataset URL: https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/go_sales/ Parameters ---------- datatype : string, optional, default 'pandas' If 'pandas', Returns a list of singleton dictionaries (each element of the list is one table from the dataset) after reading the downloaded CSV files. The key of each dictionary is the name of the table and the value contains a pandas dataframe consisting of the data. If 'spark', Returns a list of singleton dictionaries (each element of the list is one table from the dataset) after reading the downloaded CSV files. The key of each dictionary is the name of the table and the value contains a spark dataframe consisting of the data. Else, Throws an error as it does not support any other return type. Returns ------- go_sales_list : list of singleton dictionary of pandas / spark dataframes """ download_data_dir = os.path.join(os.path.dirname(__file__), "go_sales_data") base_url = "https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/go_sales/" filenames = [ "go_1k.csv", "go_daily_sales.csv", "go_methods.csv", "go_products.csv", "go_retailers.csv", ] go_sales_list = [] for file in filenames: data_file_name = os.path.join(download_data_dir, file) if not os.path.exists(data_file_name): if not os.path.exists(download_data_dir): os.makedirs(download_data_dir) urllib.request.urlretrieve(base_url + file, data_file_name) logger.info(" Created: {}".format(data_file_name)) table_name = file.split(".")[0] data_frame = get_data_from_csv(datatype, data_file_name) go_sales_list.append(add_table_name(data_frame, table_name)) logger.info(" Fetched the Go_Sales dataset. Process completed.") return go_sales_list
def fetch_imdb_dataset(datatype="pandas"): """ Fetches the IMDB movie dataset from Relational Dataset Repo. It contains information about directors, actors, roles and genres of multiple movies in form of 7 CSV files. This method downloads and stores these 7 CSV files under the 'lale/lale/datasets/multitable/imdb_data' directory. It creates this directory by itself if it does not exists. Dataset URL: https://relational.fit.cvut.cz/dataset/IMDb Parameters ---------- datatype : string, optional, default 'pandas' If 'pandas', Returns a list of singleton dictionaries (each element of the list is one table from the dataset) after reading the already existing CSV files. The key of each dictionary is the name of the table and the value contains a pandas dataframe consisting of the data. If 'spark', Returns a list of singleton dictionaries (each element of the list is one table from the dataset) after reading the already existing CSV files. The key of each dictionary is the name of the table and the value contains a spark dataframe consisting of the data. Else, Throws an error as it does not support any other return type. Returns ------- imdb_list : list of singleton dictionary of pandas / spark dataframes """ download_data_dir = os.path.join(os.path.dirname(__file__), "imdb_data") imdb_list = [] if not os.path.exists(download_data_dir): raise ValueError( "IMDB dataset not found at {}. Please download it using lalegpl repository." .format(download_data_dir)) else: for root, dirs, files in os.walk(download_data_dir): for file in files: filename, extension = os.path.splitext(file) if extension == ".csv": data_file_name = os.path.join(download_data_dir, file) table_name = filename data_frame = get_data_from_csv(datatype, data_file_name) imdb_list.append(add_table_name(data_frame, table_name)) if len(imdb_list) == 7: logger.info(" Fetched the IMDB dataset. Process completed.") else: raise ValueError( "Incomplete IMDB dataset found at {}. Please download complete dataset using lalegpl repository." .format(download_data_dir)) return imdb_list
def test_init_fit_predict_pandas_series(self):
    trainable_cf = ConcatFeatures()
    A = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
    B = [14, 24, 34]
    A = pd.DataFrame(A, columns=["a", "b", "c"])
    B = pd.Series(B, name="d")
    A = add_table_name(A, "A")
    B = add_table_name(B, "B")
    trained_cf = trainable_cf.fit(X=[A, B])
    transformed = trained_cf.transform([A, B])
    expected = [
        [11, 12, 13, 14],
        [21, 22, 23, 24],
        [31, 32, 33, 34],
    ]
    expected = pd.DataFrame(expected, columns=["a", "b", "c", "d"])
    for c in expected.columns:
        self.assertEqual(list(transformed[c]), list(expected[c]))
def transform(self, X):
    group_by_keys = []
    for by_element in self.by if self.by is not None else []:
        expr_to_parse = by_element._expr
        group_by_keys.append(self._get_group_key(expr_to_parse))
    col_not_in_X = np.setdiff1d(group_by_keys, get_columns(X))
    if col_not_in_X.size > 0:
        raise ValueError(
            "GroupBy key columns {} not present in input dataframe X.".format(
                col_not_in_X
            )
        )
    if _is_spark_with_index(X):
        name = get_table_name(X)
        X = add_table_name(X.drop(get_index_name(X)), name)
    if _is_spark_df(X):
        grouped_df = X.groupby(group_by_keys)
    elif _is_pandas_df(X):
        grouped_df = X.groupby(group_by_keys, sort=False)
    else:
        raise ValueError(
            "Only pandas and spark dataframes are supported by the GroupBy operator."
        )
    named_grouped_df = add_table_name(grouped_df, get_table_name(X))
    return named_grouped_df
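# Hedged usage sketch, not part of the original code: it assumes the method above
# belongs to the lale.lib.lale GroupBy operator and that Aggregate plus the
# expression helpers (it, sum) are available from lale.lib.lale / lale.expressions.
# The column names "store" and "qty" and table name "sales" are made up.
import pandas as pd

from lale.datasets.data_schemas import add_table_name
from lale.expressions import it
from lale.expressions import sum as agg_sum
from lale.lib.lale import Aggregate, GroupBy

sales = add_table_name(
    pd.DataFrame({"store": ["a", "a", "b"], "qty": [1, 2, 5]}), "sales"
)
pipeline = GroupBy(by=[it.store]) >> Aggregate(columns={"total_qty": agg_sum(it.qty)})
trained = pipeline.fit(sales)
print(trained.transform(sales))  # one row per store with the summed quantity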
def pandas2spark(pandas_df, add_index=False, index_name=None):
    assert spark_installed
    spark_conf = (
        SparkConf()
        .setMaster("local[2]")
        .set("spark.driver.bindAddress", "127.0.0.1")
    )
    spark_context = SparkContext.getOrCreate(conf=spark_conf)
    spark_sql_context = pyspark.sql.SQLContext(spark_context)
    name = get_table_name(pandas_df)
    if isinstance(pandas_df, pd.Series):
        pandas_df = pandas_df.to_frame()
    if add_index:
        if index_name is None:
            if pandas_df.index.name is None:
                index_name = "index"
            else:
                index_name = pandas_df.index.name
        index_col = pd.DataFrame(
            pandas_df.index, index=pandas_df.index, columns=[index_name]
        )
        pandas_df = pd.concat([pandas_df, index_col], axis=1)
    spark_dataframe = spark_sql_context.createDataFrame(pandas_df)
    if index_name is not None:
        spark_dataframe = SparkDataFrameWithIndex(spark_dataframe, index_name)
    return add_table_name(spark_dataframe, name)
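# Hedged usage sketch, not part of the original code: it assumes pandas2spark is
# importable from lale.datasets (as in the test above) and that a local Spark
# installation is available. It shows that the table name attached with
# add_table_name survives the conversion and that add_index materializes the
# pandas index as an extra column named by index_name.
import pandas as pd

from lale.datasets import pandas2spark
from lale.datasets.data_schemas import add_table_name, get_table_name
from lale.datasets.util import spark_installed

if spark_installed:
    named = add_table_name(pd.Series([1, 2, 3], name="a"), "A")
    spark_df = pandas2spark(named, add_index=True, index_name="row_id")
    assert get_table_name(spark_df) == "A"  # table name survives the conversion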
def multitable_train_test_split(
    dataset,
    main_table_name,
    label_column_name,
    test_size=0.25,
    random_state=None,
) -> Tuple:
    """
    Splits the main table of a multi-table dataset into random train and test
    subsets and returns the dataset with the main table replaced accordingly.
    Behaves similar to the `train_test_split`_ function from scikit-learn.

    .. _`train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    Parameters
    ----------
    dataset : list of either pandas or spark dataframes

      Each dataframe in the list corresponds to an entity/table in the multi-table setting.

    main_table_name : string

      The name of the main table; the split is based on the main table.

    label_column_name : string

      The name of the label column from the main table.

    test_size : float or int, default=0.25

      If float, should be between 0.0 and 1.0 and represents the proportion
      of the dataset to include in the test split.
      If int, represents the absolute number of test samples.

    random_state : int, RandomState instance or None, default=None

      Controls the shuffling applied to the data before applying the split.
      Pass an integer for reproducible output across multiple function calls.

      - None
          RandomState used by numpy.random
      - numpy.random.RandomState
          Use the provided random state, only affecting other users of that same random state instance.
      - integer
          Explicit seed.

    Returns
    -------
    result : tuple

      - item 0: train_X, list of datasets corresponding to the train split
      - item 1: test_X, list of datasets corresponding to the test split
      - item 2: train_y
      - item 3: test_y
    """
    main_table_df = None
    index_of_main_table = -1
    for i, df in enumerate(dataset):
        if get_table_name(df) == main_table_name:
            main_table_df = df
            index_of_main_table = i
    if main_table_df is None:
        table_names = [get_table_name(df) for df in dataset]
        raise ValueError(
            f"Could not find {main_table_name} in the given dataset, the table names are {table_names}"
        )
    if _is_pandas_df(main_table_df):
        num_rows = len(main_table_df)
    elif _is_spark_df(main_table_df):
        num_rows = main_table_df.count()
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of pandas or spark dataframes."
        )
    if 0 < test_size < 1:
        num_test_rows = int(num_rows * test_size)
    else:
        num_test_rows = test_size
    test_indices = random.choice(range(num_rows), num_test_rows, replace=False)
    train_indices = list(set([*range(num_rows)]) - set(test_indices.tolist()))
    assert len(test_indices) + len(train_indices) == num_rows
    train_dataset = [table for table in dataset]
    test_dataset = [table for table in dataset]
    if _is_pandas_df(main_table_df):
        train_main_df = main_table_df.iloc[train_indices]
        test_main_df = main_table_df.iloc[test_indices]
        train_y = train_main_df[label_column_name]
        test_y = test_main_df[label_column_name]
    elif _is_spark_df(main_table_df):
        spark_session = SparkSession.builder.appName(
            "multitable_train_test_split"
        ).getOrCreate()
        train_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[train_indices]
        )
        test_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[test_indices]
        )
        train_y = train_main_df.select(label_column_name)
        test_y = test_main_df.select(label_column_name)
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of pandas or spark dataframes."
        )
    train_main_df = add_table_name(train_main_df, main_table_name)
    test_main_df = add_table_name(test_main_df, main_table_name)
    train_dataset[index_of_main_table] = train_main_df
    test_dataset[index_of_main_table] = test_main_df
    return train_dataset, test_dataset, train_y, test_y
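# Hedged usage sketch, not part of the original code: it assumes
# multitable_train_test_split is exposed via lale.helpers and combines it with
# fetch_go_sales_dataset above. The label column "Quantity" is assumed to exist
# in the go_daily_sales table.
from lale.datasets.multitable.fetch_datasets import fetch_go_sales_dataset
from lale.helpers import multitable_train_test_split

go_sales = fetch_go_sales_dataset("pandas")
train_X, test_X, train_y, test_y = multitable_train_test_split(
    go_sales,
    main_table_name="go_daily_sales",
    label_column_name="Quantity",
    test_size=0.2,
)
# Only the main table is split; the other four tables appear unchanged in both splits.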
def fetch_imdb_dataset(datatype="pandas"): """ Fetches the IMDB movie dataset from Relational Dataset Repo. It contains information about directors, actors, roles and genres of multiple movies in form of 7 CSV files. This method downloads and stores these 7 CSV files under the 'lale/lale/datasets/multitable/imdb_data' directory. It creates this directory by itself if it does not exists. Dataset URL: https://relational.fit.cvut.cz/dataset/IMDb Parameters ---------- datatype : string, optional, default 'pandas' If 'pandas', Returns a list of singleton dictionaries (each element of the list is one table from the dataset) after reading the downloaded / existing CSV files. The key of each dictionary is the name of the table and the value contains a pandas dataframe consisting of the data. If 'spark', Returns a list of singleton dictionaries (each element of the list is one table from the dataset) after reading the downloaded / existing CSV files. The key of each dictionary is the name of the table and the value contains a spark dataframe consisting of the data. Else, Throws an error as it does not support any other return type. Returns ------- imdb_list : list of singleton dictionary of pandas / spark dataframes """ try: cnx = mysql.connector.connect(**imdb_config) cursor = cnx.cursor() imdb_table_list = [] download_data_dir = os.path.join(os.path.dirname(__file__), "imdb_data") imdb_list = [] cursor.execute("show tables") for table in cursor: imdb_table_list.append(table[0]) for table in imdb_table_list: header_list = [] cursor.execute("desc {}".format(table)) for column in cursor: header_list.append(column[0]) csv_name = "{}.csv".format(table) data_file_name = os.path.join(download_data_dir, csv_name) if not os.path.exists(data_file_name): if not os.path.exists(download_data_dir): os.makedirs(download_data_dir) cursor.execute("select * from {}".format(table)) result = cursor.fetchall() file = open(data_file_name, "w", encoding="utf-8") c = csv.writer(file) c.writerow(header_list) for row in result: c.writerow(row) file.close() logger.info(" Created:{}".format(data_file_name)) table_name = csv_name.split(".")[0] data_frame = get_data_from_csv(datatype, data_file_name) imdb_list.append(add_table_name(data_frame, table_name)) logger.info(" Fetched the IMDB dataset. Process completed.") return imdb_list except mysql.connector.Error as err: raise ValueError(err) else: cnx.close()
def transform(self, X):
    # X is assumed to be a list of datasets with get_table_name(d) != None
    joined_df = pd.DataFrame()
    tables_encountered = set()

    # Implementation of the join operator
    def join_df(left_df, right_df):
        # Joining spark dataframes
        if _is_spark_df(left_df) and _is_spark_df(right_df):
            on = []
            drop_col = []
            left_table = left_df.alias("left_table")
            right_table = right_df.alias("right_table")
            for k, key in enumerate(left_key_col):
                on.append(
                    col("{}.{}".format("left_table", key)).eqNullSafe(
                        col("{}.{}".format("right_table", right_key_col[k]))
                    )
                )
                if key == right_key_col[k]:
                    drop_col.append(key)
            op_df = left_table.join(right_table, on, self.join_type)
            for key in drop_col:
                op_df = op_df.drop(getattr(right_table, key))
            return op_df
        # Joining pandas dataframes
        op_df = pd.merge(
            left_df,
            right_df,
            how=self.join_type,
            left_on=left_key_col,
            right_on=right_key_col,
        )
        return op_df

    def fetch_one_df(named_df, table_name):
        if get_table_name(named_df) == table_name:
            return named_df
        return None

    def fetch_df(left_table_name, right_table_name):
        left_df = []
        right_df = []
        for named_df in X:
            if not tables_encountered:
                left_df_candidate = fetch_one_df(named_df, left_table_name)
                if _is_df(left_df_candidate):
                    left_df = left_df_candidate
                right_df_candidate = fetch_one_df(named_df, right_table_name)
                if _is_df(right_df_candidate):
                    right_df = right_df_candidate
            else:
                if left_table_name in tables_encountered:
                    left_df = joined_df
                    right_df_candidate = fetch_one_df(named_df, right_table_name)
                    if _is_df(right_df_candidate):
                        right_df = right_df_candidate
                elif right_table_name in tables_encountered:
                    right_df = joined_df
                    left_df_candidate = fetch_one_df(named_df, left_table_name)
                    if _is_df(left_df_candidate):
                        left_df = left_df_candidate
        return left_df, right_df

    def remove_implicit_col(key_col, df):
        if _is_spark_with_index(df):
            index = get_index_name(df)
            if index not in key_col:
                df = df.drop(index)
        return df

    def add_index(left_key_col, left_df, right_key_col, right_df, joined_df):
        if _is_spark_with_index(left_df) and _is_spark_with_index(right_df):
            left_name = get_index_name(left_df)
            right_name = get_index_name(right_df)
            if left_name in left_key_col and right_name in right_key_col:
                if left_name == right_name:
                    joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
                else:
                    warnings.warn(f"New data column {right_name}")
                    joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
            elif left_name in left_key_col:
                joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
            elif right_name in right_key_col:
                joined_df = SparkDataFrameWithIndex(joined_df, index_name=right_name)
            else:
                pass
        elif _is_spark_with_index(left_df):
            index_name = get_index_name(left_df)
            if index_name in left_key_col:
                joined_df = SparkDataFrameWithIndex(joined_df, index_name=index_name)
        elif _is_spark_with_index(right_df):
            index_name = get_index_name(right_df)
            if index_name in right_key_col:
                joined_df = SparkDataFrameWithIndex(joined_df, index_name=index_name)
        else:
            assert False
        return joined_df

    # Iterate over all the elements of the predicate
    for pred_element in self.pred if self.pred is not None else []:
        left_table_name = ""
        left_key_col = []
        right_table_name = ""
        right_key_col = []
        if isinstance(pred_element, list):
            # Prepare a composite key to apply the join once for all the
            # participating columns together
            for sub_pred_element in pred_element:
                (
                    left_table_name,
                    temp_left_key,
                    right_table_name,
                    temp_right_key,
                ) = self._get_join_info(sub_pred_element._expr)
                left_key_col.extend(temp_left_key)
                right_key_col.extend(temp_right_key)
        else:
            (
                left_table_name,
                left_key_col,
                right_table_name,
                right_key_col,
            ) = self._get_join_info(pred_element._expr)
        left_df, right_df = fetch_df(left_table_name, right_table_name)
        if not _is_df(left_df) or not _is_df(right_df):
            raise ValueError(
                "ERROR: Cannot perform join operation, either '{}' or '{}' table not present in input X!".format(
                    left_table_name, right_table_name
                )
            )
        left_df = remove_implicit_col(left_key_col, left_df)
        right_df = remove_implicit_col(right_key_col, right_df)
        columns_in_both_tables = set(get_columns(left_df)).intersection(  # type: ignore
            set(get_columns(right_df))  # type: ignore
        )
        if columns_in_both_tables and not set(sorted(columns_in_both_tables)) == set(
            sorted(left_key_col + right_key_col)
        ):
            raise ValueError(
                "Cannot perform join operation! Non-key columns cannot be duplicate."
            )
        joined_df = join_df(left_df, right_df)
        if _is_spark_with_index(left_df) or _is_spark_with_index(right_df):
            joined_df = add_index(
                left_key_col, left_df, right_key_col, right_df, joined_df
            )
        tables_encountered.add(left_table_name)
        tables_encountered.add(right_table_name)
    return add_table_name(joined_df, self.name)
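# Hedged usage sketch, not part of the original code: it assumes the transform above
# belongs to the lale.lib.lale Join operator, that its hyperparameters include pred,
# join_type, and name (the latter inferred from self.name above), and that the
# go_sales tables share a "Retailer code" key between go_daily_sales and go_retailers.
from lale.datasets.multitable.fetch_datasets import fetch_go_sales_dataset
from lale.expressions import it
from lale.lib.lale import Join

go_sales = fetch_go_sales_dataset("pandas")
join = Join(
    pred=[it.go_daily_sales["Retailer code"] == it.go_retailers["Retailer code"]],
    join_type="inner",
    name="sales_with_retailers",
)
joined = join.fit(go_sales).transform(go_sales)
# The result is a single dataframe whose table name is set via add_table_name(self.name).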