import numpy as np
import pandas as pd

# db and verifier come from the surrounding runtime package; this import
# path assumes SQLFlow's package layout.
from runtime import db, verifier


def load_db_data_to_data_frame(datasource, select):
    """
    Load database data to a pandas.DataFrame.

    Args:
        datasource (str): the database connection URI.
        select (str): the select SQL statement.

    Returns:
        A pandas.DataFrame object which contains all queried data.
    """
    conn = db.connect_with_data_source(datasource)
    generator = verifier.fetch_samples(conn, select, n=-1)
    names = generator.field_names

    # Map database column types to pandas dtypes: string-like columns stay
    # strings, everything else is treated as float64.
    dtypes = []
    for dtype in generator.field_types:
        if dtype in ['VARCHAR', 'CHAR', 'TEXT', 'STRING']:
            # np.str was only a deprecated alias of the builtin str and has
            # been removed from NumPy; use str directly.
            dtypes.append(str)
        else:
            dtypes.append(np.float64)

    df = pd.DataFrame(columns=names)
    for i, rows in enumerate(generator()):
        df.loc[i] = rows

    for name, dtype in zip(names, dtypes):
        df[name] = df[name].astype(dtype)

    conn.close()
    return df
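
# A minimal usage sketch for load_db_data_to_data_frame, assuming a local
# MySQL instance holding the iris.train table used by the tests below. The
# URI follows SQLFlow's datasource convention; both the URI and the table
# name are illustrative assumptions, not part of this module.
def _example_load_iris():
    df = load_db_data_to_data_frame(
        "mysql://root:root@tcp(127.0.0.1:3306)/?maxAllowedPacket=0",
        "SELECT * FROM iris.train")
    print(df.dtypes)  # string columns keep str, the rest become float64
    print(df.head())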
def infer_feature_columns(conn, select, features, label, n=1000):
    """
    Infer the FeatureColumns.

    Args:
        conn: the database connection object.
        select (str): the select SQL statement.
        features (dict[str -> list[FeatureColumn]]): the input feature
            columns. The key of the dict is the target name, e.g.
            "feature_columns".
        label (FeatureColumn): the FeatureColumn object of the label.
        n (int): the sample number to be fetched in the table. Default 1000.

    Returns:
        A tuple of (new_features, new_label), which can be accepted by IR.
    """
    if features is None:
        features = {}

    fc_map = make_feature_column_map(features)
    fd_map = make_field_desc_map(features)

    generator = fetch_samples(conn, select, n)
    if generator is None:
        raise ValueError("empty dataset")

    selected_field_names = generator.field_names
    assert len(set(selected_field_names)) == len(selected_field_names), \
        "duplicate selected field names"

    # Create a default FieldDesc for every selected column the user did not
    # describe explicitly, then fill in the details from the sampled rows.
    for name in selected_field_names:
        if name not in fd_map:
            fd_map[name] = new_default_field_desc(name)
    fill_field_descs(generator, fd_map)

    label_name = label.get_field_desc()[0].name if label is not None else None

    # If the user supplied no target, derive into the default one.
    targets = list(features.keys())
    if not targets:
        targets.append("feature_columns")

    derive_feature_columns(targets, fc_map, fd_map, selected_field_names,
                           label_name)
    features = update_ir_feature_columns(features, fc_map,
                                         selected_field_names, label_name)
    label = derive_label(label, fd_map)
    return features, label
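
# A minimal usage sketch for infer_feature_columns. NumericColumn and
# FieldDesc are assumed to be the feature-column IR types from the
# surrounding package, and the "class" label and iris.train table are
# illustrative assumptions borrowed from the tests.
def _example_infer_iris_features(conn):
    label = NumericColumn(FieldDesc(name="class"))
    features, label = infer_feature_columns(
        conn, "SELECT * FROM iris.train", features=None, label=label)
    # With no user-supplied target, all derived columns land under the
    # default "feature_columns" key.
    for fc in features["feature_columns"]:
        print(fc.get_field_desc()[0].name)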
def test_fetch_sample(self):
    conn = testing.get_singleton_db_connection()
    select = "SELECT * FROM iris.train"
    name_and_type = db.selected_columns_and_types(conn, select)
    expect_field_names = [item[0] for item in name_and_type]
    expect_field_types = [item[1] for item in name_and_type]
    column_num = len(name_and_type)

    # n=0 asks for no samples at all, so no generator is returned.
    gen = fetch_samples(conn, select, n=0)
    self.assertIsNone(gen)

    # n=-1 fetches every row; remember the total row count for later.
    gen = fetch_samples(conn, select, n=-1)
    row_num = length(gen())
    self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
    self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
    self.assertGreater(row_num, 25)

    # A positive n caps the number of fetched rows.
    gen = fetch_samples(conn, select, n=25)
    n = 0
    self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
    self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
    for rows in gen():
        self.assertEqual(len(rows), column_num)
        n += 1
    self.assertEqual(n, 25)

    gen = fetch_samples(conn, select, n=10)
    self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
    self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
    self.assertEqual(length(gen()), 10)

    # A LIMIT in the statement itself wins over a larger n.
    gen = fetch_samples(conn, "%s LIMIT 1" % select, n=1000)
    self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
    self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
    self.assertEqual(length(gen()), 1)

    # Asking for more rows than the table has returns them all.
    gen = fetch_samples(conn, select, n=row_num * 2)
    self.assertTrue(np.array_equal(gen.field_names, expect_field_names))
    self.assertTrue(np.array_equal(gen.field_types, expect_field_types))
    self.assertEqual(length(gen()), row_num)
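
# The test above relies on a module-level length() helper that is not shown
# in this excerpt; a plausible definition, assuming it simply counts the rows
# a generator yields by exhausting it:
def length(iterable):
    """Return the number of items produced by an iterable."""
    return sum(1 for _ in iterable)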