Example #1
0
def load_db_data_to_data_frame(datasource, select):
    """
    Load database data to a pandas.DataFrame.

    Args:
        datasource (str): the database connection URI.
        select (str): the select SQL statement.

    Returns:
        A pandas.DataFrame object which contains all queried data.
    """
    conn = db.connect_with_data_source(datasource)
    generator = verifier.fetch_samples(conn, select, n=-1)
    names = generator.field_names
    dtypes = []
    for dtype in generator.field_types:
        if dtype in ['VARCHAR', 'CHAR', 'TEXT', 'STRING']:
            dtypes.append(np.str)
        else:
            dtypes.append(np.float64)

    df = pd.DataFrame(columns=names)
    for i, rows in enumerate(generator()):
        df.loc[i] = rows

    for name, dtype in zip(names, dtypes):
        df[name] = df[name].astype(dtype)

    conn.close()
    return df
Example #2
0
def infer_feature_columns(conn, select, features, label, n=1000):
    """
    Infer the FeatureColumns.

    Args:
        conn: the database connection object.
        select (str): the select SQL statement.
        features (dict[str -> list[FeatureColumn]]): the input feature
            columns. The key of the dict is the target name, e.g.
            "feature_columns".
        label (FeatureColumn): the FeatureColumn object of the label.
        n (int): the sample number to be fetched in the table. Default
            1000.

    Returns:
        A tuple of (new_features, new_label), which can be accepted by IR.
    """
    if features is None:
        features = {}

    fc_map = make_feature_column_map(features)
    fd_map = make_field_desc_map(features)

    generator = fetch_samples(conn, select, n)
    if generator is None:
        raise ValueError("empty dataset")

    selected_field_names = generator.field_names
    assert len(set(selected_field_names)) == len(selected_field_names), \
        "duplicate selected field names"

    for name in selected_field_names:
        if name not in fd_map:
            fd_map[name] = new_default_field_desc(name)

    fill_field_descs(generator, fd_map)
    label_name = label.get_field_desc()[0].name if label is not None else None

    targets = list(features.keys())
    if not targets:
        targets.append("feature_columns")

    derive_feature_columns(targets, fc_map, fd_map, selected_field_names,
                           label_name)
    features = update_ir_feature_columns(features, fc_map,
                                         selected_field_names, label_name)
    label = derive_label(label, fd_map)
    return features, label
Example #3
0
    def test_fetch_sample(self):
        conn = testing.get_singleton_db_connection()

        select = "SELECT * FROM iris.train"
        name_and_type = db.selected_columns_and_types(conn, select)
        expect_field_names = [item[0] for item in name_and_type]
        expect_field_types = [item[1] for item in name_and_type]
        column_num = len(name_and_type)

        gen = fetch_samples(conn, select, n=0)
        self.assertTrue(gen is None)

        gen = fetch_samples(conn, select, n=-1)
        row_num = length(gen())
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertGreater(row_num, 25)

        gen = fetch_samples(conn, select, n=25)
        n = 0

        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))

        for rows in gen():
            self.assertEqual(len(rows), column_num)
            n += 1

        self.assertEqual(n, 25)

        gen = fetch_samples(conn, select, n=10)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), 10)

        gen = fetch_samples(conn, "%s LIMIT 1" % select, n=1000)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), 1)

        gen = fetch_samples(conn, select, n=row_num * 2)
        self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
        self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
        self.assertEqual(length(gen()), row_num)