Example #1
    def test_init_fit_predict_spark_pandas(self):

        from lale.datasets import pandas2spark
        from lale.datasets.util import spark_installed

        if spark_installed:
            trainable_cf = ConcatFeatures()
            A = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
            B = [[14, 15], [24, 25], [34, 35]]
            A = pd.DataFrame(A, columns=["a", "b", "c"])
            B = pd.DataFrame(B, columns=["d", "e"])
            A = pandas2spark(A, add_index=True)
            A = add_table_name(A, "A")
            B = add_table_name(B, "B")

            trained_cf = trainable_cf.fit(X=[A, B])
            transformed = trained_cf.transform([A, B])
            expected = [
                [11, 12, 13, 14, 15],
                [21, 22, 23, 24, 25],
                [31, 32, 33, 34, 35],
            ]
            expected = pd.DataFrame(expected,
                                    columns=["a", "b", "c", "d", "e"])
            for c in expected.columns:
                self.assertEqual(list(transformed[c]), list(expected[c]))
Example #2
File: map.py Project: hirzel/lale
    def transform_pandas_df(self, X):
        mapped_df = pd.DataFrame()
        accessed_column_names = set()

        def get_map_function_output(column, new_column_name):
            _validate(X, column)
            new_column_name = _new_column_name(new_column_name, column)
            new_column = eval_expr_pandas_df(X, column)
            mapped_df[new_column_name] = new_column
            accessed_column_names.add(new_column_name)
            accessed_column_names.update(_accessed_columns(column))

        columns = self.columns
        if callable(columns):
            columns = columns(X)

        if isinstance(columns, list):
            for column in columns:
                get_map_function_output(column, None)
        elif isinstance(columns, dict):
            for new_column_name, column in columns.items():
                get_map_function_output(column, new_column_name)
        else:
            raise ValueError("columns must be either a list or a dictionary.")
        if self.remainder == "passthrough":
            remainder_columns = [
                x for x in X.columns if x not in accessed_column_names
            ]
            mapped_df[remainder_columns] = X[remainder_columns]
        table_name = get_table_name(X)
        mapped_df = add_table_name(mapped_df, table_name)
        return mapped_df
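A minimal usage sketch of the operator this method implements. It assumes Map is importable from lale.lib.lale and exposes columns and remainder as constructor hyperparameters (the code above reads self.columns and self.remainder), and that transform can be called directly because the relational operators need no fit; adjust the import path to your lale version.

import pandas as pd

from lale.expressions import it
from lale.lib.lale import Map

df = pd.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
# "a" is consumed by the expression; "b" is untouched and passed through
mapper = Map(columns={"a_renamed": it.a}, remainder="passthrough")
print(mapper.transform(df))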
Example #3
    def to_monoid(self, batch: _Batch) -> _AccuracyData:
        from lale.lib.rasl import Scan

        y_true, y_pred = batch
        assert isinstance(y_true, pd.Series), type(y_true)  # TODO: Spark
        if isinstance(y_pred, np.ndarray):
            y_pred = pd.Series(y_pred, index=y_true.index, dtype=y_true.dtype, name="y_pred")
        assert isinstance(y_pred, pd.Series), type(y_pred)  # TODO: Spark
        y_true = add_table_name(pd.DataFrame(y_true), "y_true")
        y_pred = add_table_name(pd.DataFrame(y_pred), "y_pred")
        prefix_true = Scan(table=it.y_true) >> Map(
            columns={"y_true": it[get_columns(y_true)[0]]})
        prefix_pred = Scan(table=it.y_pred) >> Map(
            columns={"y_pred": it[get_columns(y_pred)[0]]})
        pipeline = (prefix_true & prefix_pred) >> self._pipeline_suffix
        agg_df = _ensure_pandas(pipeline.transform([y_true, y_pred]))
        return _AccuracyData(*agg_df.iloc[0])
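The pattern above relies on add_table_name so that Scan(table=it.<name>) can pick the right dataframe out of a list. A minimal sketch of that mechanism under the same imports as the example (whether Scan lives in lale.lib.rasl or lale.lib.lale depends on the lale version):

import pandas as pd

from lale.datasets.data_schemas import add_table_name
from lale.expressions import it
from lale.lib.rasl import Scan

df = add_table_name(pd.DataFrame({"x": [1, 2]}), "my_table")
# Scan selects the dataframe whose attached table name matches it.my_table
scanned = Scan(table=it.my_table).transform([df])
print(scanned)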
Example #4
def fetch_go_sales_dataset(datatype="pandas"):
    """
    Fetches the Go_Sales dataset from IBM's Watson ML samples.
    It contains information about daily sales, methods, retailers,
    and products of a company in the form of 5 CSV files.
    This method downloads and stores these 5 CSV files under the
    'lale/lale/datasets/multitable/go_sales_data' directory, creating
    the directory if it does not exist.

    Dataset URL: https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/go_sales/

    Parameters
    ----------
    datatype : string, optional, default 'pandas'

      If 'pandas',
      returns a list with one entry per downloaded CSV file, where each entry
      is a pandas dataframe holding that table's data, annotated with its
      table name (retrievable via get_table_name).

      If 'spark',
      returns the same list, except that each entry is a spark dataframe.

      Otherwise,
      raises an error, since no other return type is supported.

    Returns
    -------
    go_sales_list : list of pandas or spark dataframes, one per table, each annotated with its table name
    """

    download_data_dir = os.path.join(os.path.dirname(__file__),
                                     "go_sales_data")
    base_url = "https://github.com/IBM/watson-machine-learning-samples/raw/master/cloud/data/go_sales/"
    filenames = [
        "go_1k.csv",
        "go_daily_sales.csv",
        "go_methods.csv",
        "go_products.csv",
        "go_retailers.csv",
    ]
    go_sales_list = []
    for file in filenames:
        data_file_name = os.path.join(download_data_dir, file)
        if not os.path.exists(data_file_name):
            if not os.path.exists(download_data_dir):
                os.makedirs(download_data_dir)
            urllib.request.urlretrieve(base_url + file, data_file_name)
            logger.info(" Created: {}".format(data_file_name))
        table_name = file.split(".")[0]
        data_frame = get_data_from_csv(datatype, data_file_name)
        go_sales_list.append(add_table_name(data_frame, table_name))
    logger.info(" Fetched the Go_Sales dataset. Process completed.")
    return go_sales_list
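A hedged usage sketch; the fetcher's import path is assumed to be lale.datasets.multitable.fetch_datasets and may differ in your installation:

from lale.datasets.data_schemas import get_table_name
from lale.datasets.multitable.fetch_datasets import fetch_go_sales_dataset

go_sales = fetch_go_sales_dataset(datatype="pandas")
for table in go_sales:
    # each element is a pandas dataframe annotated with its table name
    print(get_table_name(table), table.shape)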
Example #5
def fetch_imdb_dataset(datatype="pandas"):
    """
    Fetches the IMDB movie dataset from the Relational Dataset Repository.
    It contains information about directors, actors, roles,
    and genres of multiple movies in the form of 7 CSV files.
    This method expects these 7 CSV files to already be present under the
    'lale/lale/datasets/multitable/imdb_data' directory; they can be
    downloaded using the lalegpl repository.

    Dataset URL: https://relational.fit.cvut.cz/dataset/IMDb

    Parameters
    ----------
    datatype : string, optional, default 'pandas'

      If 'pandas',
      returns a list with one entry per existing CSV file, where each entry
      is a pandas dataframe holding that table's data, annotated with its
      table name (retrievable via get_table_name).

      If 'spark',
      returns the same list, except that each entry is a spark dataframe.

      Otherwise,
      raises an error, since no other return type is supported.

    Returns
    -------
    imdb_list : list of pandas or spark dataframes, one per table, each annotated with its table name
    """

    download_data_dir = os.path.join(os.path.dirname(__file__), "imdb_data")
    imdb_list = []
    if not os.path.exists(download_data_dir):
        raise ValueError(
            "IMDB dataset not found at {}. Please download it using lalegpl repository."
            .format(download_data_dir))
    else:
        for root, dirs, files in os.walk(download_data_dir):
            for file in files:
                filename, extension = os.path.splitext(file)
                if extension == ".csv":
                    data_file_name = os.path.join(download_data_dir, file)
                    table_name = filename
                    data_frame = get_data_from_csv(datatype, data_file_name)
                    imdb_list.append(add_table_name(data_frame, table_name))
        if len(imdb_list) == 7:
            logger.info(" Fetched the IMDB dataset. Process completed.")
        else:
            raise ValueError(
                "Incomplete IMDB dataset found at {}. Please download complete dataset using lalegpl repository."
                .format(download_data_dir))
    return imdb_list
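A hedged usage sketch for this variant of the fetcher, which assumes the 7 CSV files are already in place and that the function is importable from lale.datasets.multitable.fetch_datasets (adjust the path to your installation):

from lale.datasets.data_schemas import get_table_name
from lale.datasets.multitable.fetch_datasets import fetch_imdb_dataset

imdb = fetch_imdb_dataset(datatype="pandas")
# expect the 7 IMDB table names, one per CSV file
print([get_table_name(table) for table in imdb])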
Example #6
    def test_init_fit_predict_pandas_series(self):
        trainable_cf = ConcatFeatures()
        A = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
        B = [14, 24, 34]
        A = pd.DataFrame(A, columns=["a", "b", "c"])
        B = pd.Series(B, name="d")
        A = add_table_name(A, "A")
        B = add_table_name(B, "B")
        trained_cf = trainable_cf.fit(X=[A, B])
        transformed = trained_cf.transform([A, B])
        expected = [
            [11, 12, 13, 14],
            [21, 22, 23, 24],
            [31, 32, 33, 34],
        ]
        expected = pd.DataFrame(expected, columns=["a", "b", "c", "d"])
        for c in expected.columns:
            self.assertEqual(list(transformed[c]), list(expected[c]))
Example #7
    def transform(self, X):
        group_by_keys = []
        for by_element in self.by if self.by is not None else []:
            expr_to_parse = by_element._expr
            group_by_keys.append(self._get_group_key(expr_to_parse))
        col_not_in_X = np.setdiff1d(group_by_keys, get_columns(X))
        if col_not_in_X.size > 0:
            raise ValueError(
                "GroupBy key columns {} not present in input dataframe X.".format(
                    col_not_in_X
                )
            )
        if _is_spark_with_index(X):
            name = get_table_name(X)
            X = add_table_name(X.drop(get_index_name(X)), name)
        if _is_spark_df(X):
            grouped_df = X.groupby(group_by_keys)
        elif _is_pandas_df(X):
            grouped_df = X.groupby(group_by_keys, sort=False)
        else:
            raise ValueError(
                "Only pandas and spark dataframes are supported by the GroupBy operator."
            )
        named_grouped_df = add_table_name(grouped_df, get_table_name(X))
        return named_grouped_df
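A sketch of how this GroupBy is typically composed with an Aggregate downstream. The Aggregate operator, the count expression, and the lale.lib.lale import path are assumptions based on common lale usage; adjust them to your version.

import pandas as pd

from lale.datasets.data_schemas import add_table_name
from lale.expressions import count, it
from lale.lib.lale import Aggregate, GroupBy

df = add_table_name(
    pd.DataFrame({"dept": ["a", "a", "b"], "val": [1, 2, 3]}), "t")
# group rows by "dept", then count the "val" entries per group
pipeline = GroupBy(by=[it.dept]) >> Aggregate(columns={"n": count(it.val)})
print(pipeline.transform(df))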
Example #8
def pandas2spark(pandas_df, add_index=False, index_name=None):
    assert spark_installed
    spark_conf = (SparkConf().setMaster("local[2]").set(
        "spark.driver.bindAddress", "127.0.0.1"))
    spark_context = SparkContext.getOrCreate(conf=spark_conf)
    spark_sql_context = pyspark.sql.SQLContext(spark_context)
    name = get_table_name(pandas_df)
    if isinstance(pandas_df, pd.Series):
        pandas_df = pandas_df.to_frame()
    if add_index:
        if index_name is None:
            if pandas_df.index.name is None:
                index_name = "index"
            else:
                index_name = pandas_df.index.name
        index_col = pd.DataFrame(pandas_df.index,
                                 index=pandas_df.index,
                                 columns=[index_name])
        pandas_df = pd.concat([pandas_df, index_col], axis=1)
    spark_dataframe = spark_sql_context.createDataFrame(pandas_df)
    if index_name is not None:
        spark_dataframe = SparkDataFrameWithIndex(spark_dataframe, index_name)
    return add_table_name(spark_dataframe, name)
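A usage sketch (requires a local Spark installation, as the assert above checks); the point is that the table name attached to the pandas dataframe carries over to the Spark dataframe:

import pandas as pd

from lale.datasets import pandas2spark
from lale.datasets.data_schemas import add_table_name, get_table_name

pdf = add_table_name(pd.DataFrame({"x": [1, 2, 3]}), "numbers")
sdf = pandas2spark(pdf, add_index=True)
print(get_table_name(sdf))  # prints "numbers"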
Example #9
def multitable_train_test_split(
    dataset,
    main_table_name,
    label_column_name,
    test_size=0.25,
    random_state=None,
) -> Tuple:
    """
    Splits a multi-table dataset into random train and test subsets by
    sampling rows of the main table; all other tables are passed through
    unchanged, and the main table's label column is returned separately.

    Behaves similarly to the `train_test_split`_ function from scikit-learn.

    .. _`train_test_split`: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

    Parameters
    ----------
    dataset : list of either Pandas or Spark dataframes

      Each dataframe in the list corresponds to an entity/table in the multi-table setting.

    main_table_name : string

      The name of the main table as the split is going to be based on the main table.

    label_column_name : string

      The name of the label column from the main table.

    test_size : float or int, default=0.25

      If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split.
      If int, represents the absolute number of test samples.

    random_state : int, RandomState instance or None, default=None

      Controls the shuffling applied to the data before applying the split.
      Pass an integer for reproducible output across multiple function calls.

      - None

          RandomState used by numpy.random

      - numpy.random.RandomState

          Use the provided random state, only affecting other users of that same random state instance.

      - integer

          Explicit seed.

    Returns
    -------
    result : tuple

      - item 0: train_X, List of datasets corresponding to the train split

      - item 1: test_X, List of datasets corresponding to the test split

      - item 2: train_y

      - item 3: test_y

    """
    main_table_df = None
    index_of_main_table = -1
    for i, df in enumerate(dataset):
        if get_table_name(df) == main_table_name:
            main_table_df = df
            index_of_main_table = i
    if main_table_df is None:
        table_names = [get_table_name(df) for df in dataset]
        raise ValueError(
            f"Could not find {main_table_name} in the given dataset, the table names are {table_names}"
        )
    if _is_pandas_df(main_table_df):
        num_rows = len(main_table_df)
    elif _is_spark_df(main_table_df):
        # main_table_df = main_table_df.toPandas()
        num_rows = main_table_df.count()
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )
    if 0 < test_size < 1:
        num_test_rows = int(num_rows * test_size)
    else:
        num_test_rows = test_size
    # honor random_state so that splits are reproducible, as documented
    if isinstance(random_state, random.RandomState):
        rng = random_state
    else:
        rng = random.RandomState(random_state)
    test_indices = rng.choice(range(num_rows), num_test_rows, replace=False)
    train_indices = list(set(range(num_rows)) - set(test_indices.tolist()))
    assert len(test_indices) + len(train_indices) == num_rows
    train_dataset = list(dataset)
    test_dataset = list(dataset)
    if _is_pandas_df(main_table_df):
        train_main_df = main_table_df.iloc[train_indices]
        test_main_df = main_table_df.iloc[test_indices]
        train_y = train_main_df[label_column_name]
        test_y = test_main_df[label_column_name]
    elif _is_spark_df(main_table_df):
        spark_session = SparkSession.builder.appName(
            "multitable_train_test_split").getOrCreate()
        train_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[train_indices])
        test_main_df = spark_session.createDataFrame(
            data=main_table_df.toPandas().iloc[test_indices])
        train_y = train_main_df.select(label_column_name)
        test_y = test_main_df.select(label_column_name)
    else:
        raise ValueError(
            "multitable_train_test_split can only work with a list of Pandas or Spark dataframes."
        )

    train_main_df = add_table_name(train_main_df, main_table_name)
    test_main_df = add_table_name(test_main_df, main_table_name)
    train_dataset[index_of_main_table] = train_main_df
    test_dataset[index_of_main_table] = test_main_df
    return train_dataset, test_dataset, train_y, test_y
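A hedged usage sketch combining this splitter with the Go_Sales fetcher from Example #4; the module paths and the choice of main table and label column are illustrative assumptions:

from lale.datasets.multitable.fetch_datasets import fetch_go_sales_dataset
from lale.datasets.multitable.util import multitable_train_test_split

go_sales = fetch_go_sales_dataset(datatype="pandas")
train_X, test_X, train_y, test_y = multitable_train_test_split(
    go_sales,
    main_table_name="go_daily_sales",  # illustrative table name
    label_column_name="Quantity",      # illustrative label column
    test_size=0.2,
)
print(len(train_y), len(test_y))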
Example #10
def fetch_imdb_dataset(datatype="pandas"):
    """
    Fetches the IMDB movie dataset from the Relational Dataset Repository.
    It contains information about directors, actors, roles,
    and genres of multiple movies in the form of 7 CSV files.
    This method downloads and stores these 7 CSV files under the
    'lale/lale/datasets/multitable/imdb_data' directory, creating
    the directory if it does not exist.

    Dataset URL: https://relational.fit.cvut.cz/dataset/IMDb

    Parameters
    ----------
    datatype : string, optional, default 'pandas'

      If 'pandas',
      returns a list with one entry per downloaded / existing CSV file, where
      each entry is a pandas dataframe holding that table's data, annotated
      with its table name (retrievable via get_table_name).

      If 'spark',
      returns the same list, except that each entry is a spark dataframe.

      Otherwise,
      raises an error, since no other return type is supported.

    Returns
    -------
    imdb_list : list of pandas or spark dataframes, one per table, each annotated with its table name
    """

    cnx = None
    try:
        cnx = mysql.connector.connect(**imdb_config)
        cursor = cnx.cursor()
        imdb_table_list = []
        download_data_dir = os.path.join(os.path.dirname(__file__),
                                         "imdb_data")
        imdb_list = []
        cursor.execute("show tables")
        for table in cursor:
            imdb_table_list.append(table[0])
        for table in imdb_table_list:
            header_list = []
            cursor.execute("desc {}".format(table))
            for column in cursor:
                header_list.append(column[0])
            csv_name = "{}.csv".format(table)
            data_file_name = os.path.join(download_data_dir, csv_name)
            if not os.path.exists(data_file_name):
                if not os.path.exists(download_data_dir):
                    os.makedirs(download_data_dir)
                cursor.execute("select * from {}".format(table))
                result = cursor.fetchall()
                # a with-statement closes the file even if writing fails;
                # newline="" avoids extra blank rows from csv.writer on Windows
                with open(data_file_name, "w", newline="",
                          encoding="utf-8") as file:
                    c = csv.writer(file)
                    c.writerow(header_list)
                    for row in result:
                        c.writerow(row)
                logger.info(" Created: {}".format(data_file_name))
            table_name = csv_name.split(".")[0]
            data_frame = get_data_from_csv(datatype, data_file_name)
            imdb_list.append(add_table_name(data_frame, table_name))
        logger.info(" Fetched the IMDB dataset. Process completed.")
        return imdb_list
    except mysql.connector.Error as err:
        raise ValueError(err) from err
    finally:
        # always close the connection, even after the early return or an error
        if cnx is not None:
            cnx.close()
Example #11
File: join.py Project: hirzel/lale
    def transform(self, X):
        # X is assumed to be a list of datasets with get_table_name(d) != None
        joined_df = pd.DataFrame()
        tables_encountered = set()

        # Implementation of join operator
        def join_df(left_df, right_df):

            # Joining spark dataframes
            if _is_spark_df(left_df) and _is_spark_df(right_df):
                on = []
                drop_col = []
                left_table = left_df.alias("left_table")
                right_table = right_df.alias("right_table")

                for k, key in enumerate(left_key_col):
                    on.append(
                        col("{}.{}".format("left_table", key)).eqNullSafe(
                            col("{}.{}".format("right_table", right_key_col[k]))
                        )
                    )
                    if key == right_key_col[k]:
                        drop_col.append(key)
                op_df = left_table.join(right_table, on, self.join_type)
                for key in drop_col:
                    op_df = op_df.drop(getattr(right_table, key))
                return op_df

            # Joining pandas dataframes
            op_df = pd.merge(
                left_df,
                right_df,
                how=self.join_type,
                left_on=left_key_col,
                right_on=right_key_col,
            )
            return op_df

        def fetch_one_df(named_df, table_name):
            if get_table_name(named_df) == table_name:
                return named_df
            return None

        def fetch_df(left_table_name, right_table_name):
            left_df = []
            right_df = []
            for named_df in X:
                if not tables_encountered:
                    left_df_candidate = fetch_one_df(named_df, left_table_name)
                    if _is_df(left_df_candidate):
                        left_df = left_df_candidate
                    right_df_candidate = fetch_one_df(named_df, right_table_name)
                    if _is_df(right_df_candidate):
                        right_df = right_df_candidate
                else:
                    if left_table_name in tables_encountered:
                        left_df = joined_df
                        right_df_candidate = fetch_one_df(named_df, right_table_name)
                        if _is_df(right_df_candidate):
                            right_df = right_df_candidate
                    elif right_table_name in tables_encountered:
                        right_df = joined_df
                        left_df_candidate = fetch_one_df(named_df, left_table_name)
                        if _is_df(left_df_candidate):
                            left_df = left_df_candidate
            return left_df, right_df

        def remove_implicit_col(key_col, df):
            if _is_spark_with_index(df):
                index = get_index_name(df)
                if index not in key_col:
                    df = df.drop(index)
            return df

        def add_index(left_key_col, left_df, right_key_col, right_df, joined_df):
            if _is_spark_with_index(left_df) and _is_spark_with_index(right_df):
                left_name = get_index_name(left_df)
                right_name = get_index_name(right_df)
                if left_name in left_key_col and right_name in right_key_col:
                    if left_name == right_name:
                        joined_df = SparkDataFrameWithIndex(
                            joined_df, index_name=left_name
                        )
                    else:
                        warnings.warn(f"New data column {right_name}")
                        joined_df = SparkDataFrameWithIndex(
                            joined_df, index_name=left_name
                        )
                elif left_name in left_key_col:
                    joined_df = SparkDataFrameWithIndex(joined_df, index_name=left_name)
                elif right_name in right_key_col:
                    joined_df = SparkDataFrameWithIndex(
                        joined_df, index_name=right_name
                    )
                else:
                    pass
            elif _is_spark_with_index(left_df):
                index_name = get_index_name(left_df)
                if index_name in left_key_col:
                    joined_df = SparkDataFrameWithIndex(
                        joined_df, index_name=index_name
                    )
            elif _is_spark_with_index(right_df):
                index_name = get_index_name(right_df)
                if index_name in right_key_col:
                    joined_df = SparkDataFrameWithIndex(
                        joined_df, index_name=index_name
                    )
            else:
                assert False
            return joined_df

        # Iterate over all the elements of the predicate
        for pred_element in self.pred if self.pred is not None else []:
            left_table_name = ""
            left_key_col = []
            right_table_name = ""
            right_key_col = []
            if isinstance(pred_element, list):
                # Prepare composite key to apply join once for all the participating columns together
                for sub_pred_element in pred_element:
                    (
                        left_table_name,
                        temp_left_key,
                        right_table_name,
                        temp_right_key,
                    ) = self._get_join_info(sub_pred_element._expr)
                    left_key_col.extend(temp_left_key)
                    right_key_col.extend(temp_right_key)
            else:
                (
                    left_table_name,
                    left_key_col,
                    right_table_name,
                    right_key_col,
                ) = self._get_join_info(pred_element._expr)
            left_df, right_df = fetch_df(left_table_name, right_table_name)
            if not _is_df(left_df) or not _is_df(right_df):
                raise ValueError(
                    "ERROR: Cannot perform join operation, either '{}' or '{}' table not present in input X!".format(
                        left_table_name, right_table_name
                    )
                )
            left_df = remove_implicit_col(left_key_col, left_df)
            right_df = remove_implicit_col(right_key_col, right_df)
            columns_in_both_tables = set(get_columns(left_df)).intersection(  # type: ignore
                set(get_columns(right_df))  # type: ignore
            )
            if columns_in_both_tables and columns_in_both_tables != set(
                left_key_col + right_key_col
            ):
                raise ValueError(
                    "Cannot perform join operation! Non-key columns cannot be duplicate."
                )
            joined_df = join_df(left_df, right_df)
            if _is_spark_with_index(left_df) or _is_spark_with_index(right_df):
                joined_df = add_index(
                    left_key_col, left_df, right_key_col, right_df, joined_df
                )
            tables_encountered.add(left_table_name)
            tables_encountered.add(right_table_name)
        return add_table_name(joined_df, self.name)
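A sketch of configuring this Join. The predicate form it.<table>.<column> == it.<table>.<column> and the lale.lib.lale import path are assumptions based on the code above; adjust them to your lale version.

import pandas as pd

from lale.datasets.data_schemas import add_table_name
from lale.expressions import it
from lale.lib.lale import Join

orders = add_table_name(
    pd.DataFrame({"order_id": [1, 2], "customer_id": [10, 20]}), "orders")
customers = add_table_name(
    pd.DataFrame({"customer_id": [10, 20], "name": ["ann", "bob"]}), "customers")
# inner-join the two named tables on customer_id
join = Join(pred=[it.orders.customer_id == it.customers.customer_id],
            join_type="inner")
print(join.transform([orders, customers]))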