Beispiel #1
0
 def test_read_csv_with_dtype(self):
     """Verify ks.read_csv matches pd.read_csv for several ``dtype`` settings."""
     with self.csv_file(self.csv_text) as fn:
         # Inferred dtypes may differ slightly between backends, so the
         # default read is compared almost-equal only.
         self.assert_eq(ks.read_csv(fn), pd.read_csv(fn), almost=True)
         # Explicit dtypes must produce exactly matching frames.
         for dtype in (str, {'amount': 'int64'}):
             self.assert_eq(ks.read_csv(fn, dtype=dtype),
                            pd.read_csv(fn, dtype=dtype))
Beispiel #2
0
def get_gender_feature():
    """Compute the per-creative gender click distribution and save it to CSV.

    Joins the preliminary training users with their click log, aggregates
    click counts per (creative_id, gender), converts those counts into a
    per-creative share, pivots to one column per gender, and writes the
    result as a single CSV file under ./data/gender_dist.
    """
    train_user = ks.read_csv("data/train_preliminary/user.csv")
    train_click_log = ks.read_csv("data/train_preliminary/click_log.csv")
    train_data = train_user.merge(train_click_log, on="user_id", how='inner')
    # Total clicks per (creative_id, gender); nvl() guards NULL click_times.
    sql = '''
    select creative_id,
            gender,
            sum(nvl(click_times, 0)) click_times
    from {train_data}
    group by  creative_id, gender
    '''
    # Renamed from the misleading `age_*` locals copied from the age-feature
    # twin of this function; the data here is keyed by gender.
    gender_data = ks.sql(sql, train_data=train_data)
    gender_data.cache()
    # Share of each gender's clicks within a creative_id. The window spans
    # the whole partition, so the denominator is the creative's total clicks.
    sql = '''
    SELECT creative_id,
           gender,
           click_times / sum(click_times)
                             OVER (PARTITION BY creative_id  ORDER BY click_times DESC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) gender_dist
    FROM {gender_data}
    '''
    gender_dist_data = ks.sql(sql, gender_data=gender_data)
    # Dropped a discarded `head(10)` call whose result was never used.
    gender_dist_data.cache()
    gender_dist_pivot = gender_dist_data.pivot(index='creative_id',
                                               columns='gender',
                                               values='gender_dist')
    # Genders are encoded as 1 and 2 in the raw data.
    gender_dist_pivot.columns = ['gender_' + str(ele) for ele in range(1, 3)]
    gender_dist_pivot = gender_dist_pivot.reset_index()
    gender_dist_pivot.fillna(0, inplace=True)
    gender_dist_pivot.to_csv('./data/gender_dist', num_files=1)
Beispiel #3
0
    def test_read_csv_with_comment(self):
        """ks.read_csv should honor ``comment`` and reject invalid values."""
        with self.csv_file(self.csv_text_with_comments) as fn:
            expected = pd.read_csv(fn, comment="#")
            actual = ks.read_csv(fn, comment="#")
            self.assertPandasAlmostEqual(expected, actual.toPandas())

            # Anything other than a single character must raise ValueError.
            message = "Only length-1 comment characters supported"
            for bad in ("", "##"):
                self.assertRaisesRegex(
                    ValueError, message,
                    lambda: ks.read_csv(fn, comment=bad).show())
            for bad in (1, [1]):
                self.assertRaisesRegex(
                    ValueError, message,
                    lambda: ks.read_csv(fn, comment=bad))
def get_test_data():
    """Merge the test set: join the test click log with the test ad table.

    Returns:
        The inner join of click_log.csv and ad.csv on ``creative_id``.
    """
    click_log = ks.read_csv("../data/test/click_log.csv")
    ads = ks.read_csv("../data/test/ad.csv")
    return click_log.merge(ads, on="creative_id")
Beispiel #5
0
def extract_data():
    """Read the raw disaster-response CSV files.

    Returns:
        tuple: ``(messages, categories)`` koalas DataFrames loaded from
        the raw data directory.
    """
    messages = ks.read_csv('../data/raw_data/disaster_messages.csv')
    categories = ks.read_csv('../data/raw_data/disaster_categories.csv')
    return messages, categories
Beispiel #6
0
    def test_read_csv_with_squeeze(self):
        """``squeeze=True`` should behave like pandas for one and two columns."""
        with self.csv_file(self.csv_text) as fn:
            # One selected column squeezes to a Series; two stay a DataFrame.
            for cols in (["name"], ["name", "amount"]):
                expected = pd.read_csv(fn, squeeze=True, usecols=cols)
                actual = ks.read_csv(fn, squeeze=True, usecols=cols)
                self.assert_eq(expected, actual, almost=True)
Beispiel #7
0
    def test_read_csv(self):
        """Exercise ks.read_csv header/names/usecols handling against pandas."""
        with self.csv_file(self.csv_text) as fn:

            def check(header='infer', names=None, usecols=None):
                # koalas must produce the same frame pandas does.
                expected = pd.read_csv(
                    fn, header=header, names=names, usecols=usecols)
                actual = ks.read_csv(
                    fn, header=header, names=names, usecols=usecols)
                self.assertPandasAlmostEqual(expected, actual.toPandas())

            valid_cases = [
                dict(),
                dict(header=None),
                dict(header=0),
                dict(names=['n', 'a']),
                dict(header=0, names=['n', 'a']),
                dict(usecols=[1]),
                dict(usecols=[1, 0]),
                dict(usecols=['amount']),
                dict(usecols=['amount', 'name']),
                dict(usecols=[]),
                dict(usecols=[1, 1]),
                dict(usecols=['amount', 'amount']),
                dict(names=['n', 'a'], usecols=['a']),
            ]
            for kwargs in valid_cases:
                check(**kwargs)

            # check with pyspark patch.
            expected = pd.read_csv(fn)
            actual = ks.read_csv(fn)
            self.assertPandasAlmostEqual(expected, actual.toPandas())

            # Invalid argument combinations must raise with these messages.
            error_cases = [
                ('non-unique', dict(names=['n', 'n'])),
                ('does not match the number.*3', dict(names=['n', 'a', 'b'])),
                ('does not match the number.*3',
                 dict(header=0, names=['n', 'a', 'b'])),
                ('Usecols do not match.*3', dict(usecols=[1, 3])),
                ('Usecols do not match.*col', dict(usecols=['amount', 'col'])),
                ('Unknown header argument 1', dict(header='1')),
                ("'usecols' must either be list-like of all strings, "
                 "all unicode, all integers or a callable.",
                 dict(usecols=[1, 'amount'])),
            ]
            for pattern, kwargs in error_cases:
                # assertRaisesRegex calls the lambda immediately, so the
                # loop variables are bound correctly at call time.
                self.assertRaisesRegex(
                    ValueError, pattern,
                    lambda: ks.read_csv(fn, **kwargs))

            # check with index_col
            expected = pd.read_csv(fn).set_index('name')
            actual = ks.read_csv(fn, index_col='name')
            self.assertPandasAlmostEqual(expected, actual.toPandas())
def get_train_data():
    """Merge the training set: join users, click log, and ad metadata.

    Returns:
        A koalas DataFrame of users inner-joined with their clicks on
        ``user_id``, then inner-joined with ad metadata on ``creative_id``.
    """
    users = ks.read_csv("../data/train_preliminary/user.csv")
    clicks = ks.read_csv("../data/train_preliminary/click_log.csv")
    ads = ks.read_csv("../data/train_preliminary/ad.csv")
    user_clicks = users.merge(clicks, on="user_id", how='inner')
    return user_clicks.merge(ads, on="creative_id", how='inner')
Beispiel #9
0
    def test_read_csv_with_comment(self):
        """``comment`` must accept a single char and reject everything else."""
        with self.csv_file(self.csv_text_with_comments) as fn:
            expected = pd.read_csv(fn, comment='#')
            actual = koalas.read_csv(fn, comment='#')
            self.assertPandasAlmostEqual(expected, actual.toPandas())

            message = 'Only length-1 comment characters supported'
            # Strings of the wrong length still build a frame lazily, so
            # .show() is needed to force evaluation and trigger the error.
            for bad in ('', '##'):
                self.assertRaisesRegex(
                    ValueError, message,
                    lambda: koalas.read_csv(fn, comment=bad).show())
            # Non-string values fail eagerly at argument validation.
            for bad in (1, [1]):
                self.assertRaisesRegex(
                    ValueError, message,
                    lambda: koalas.read_csv(fn, comment=bad))
Beispiel #10
0
def combine_feature(train=True):
    """Join user ids with word2vec, NN, and stats features and dump to CSV.

    Args:
        train: when True, combine features for the training users; otherwise
            for the distinct users appearing in the test click log.
    """
    if train:
        user_filename = 'data/train_preliminary/user.csv'
        result_filename = './data/combine_feature'
    else:
        user_filename = './data/test/click_log.csv'
        result_filename = './data/combine_feature_test'
    user_df = ks.read_csv(user_filename)
    if not train:
        # The test click log repeats user_id per click; dedupe via SQL.
        user_df = ks.sql('select distinct user_id from {user_df}',
                         user_df=user_df)
    wv_feature = ks.read_csv('data/wv_features.csv')
    nn_feature = ks.read_csv('data/nn_features.csv')
    stats_data = ks.read_csv(
        "data/stats_features/part-00000-f6695da4-6d9f-4ba4-80b1-d370e636696b-c000.csv"
    )
    all_features = (user_df
                    .merge(wv_feature, on='user_id')
                    .merge(nn_feature, on='user_id')
                    .merge(stats_data, on='user_id'))
    print(all_features.shape)
    all_features.to_csv(result_filename, num_files=1)
Beispiel #11
0
 def check(header="infer", names=None, usecols=None, index_col=None):
     """Assert ks.read_csv and pd.read_csv agree for the given arguments."""
     # `fn` and `self` come from the enclosing test's scope.
     opts = dict(header=header, names=names, usecols=usecols,
                 index_col=index_col)
     expected = pd.read_csv(fn, **opts)
     actual = ks.read_csv(fn, **opts)
     self.assert_eq(expected, actual, almost=True)
Beispiel #12
0
def get_dataframe_koalas(name, apikey="WCXVE7BAD668SJHL"):
    """Fetch daily time-series data for *name* from Alpha Vantage as koalas.

    Args:
        name: ticker symbol, e.g. "MSFT".
        apikey: Alpha Vantage API key. Defaults to the key previously
            hard-coded in the URL for backward compatibility.

    Returns:
        A koalas DataFrame indexed by trading date, keeping only the
        close price (open/low/high/volume columns are dropped).
    """
    # NOTE(review): the default key is a credential shipped in source —
    # rotate it and pass the key explicitly in new code.
    url = ("https://www.alphavantage.co/query"
           "?function=TIME_SERIES_DAILY&symbol=" + name
           + "&apikey=" + apikey + "&datatype=csv")
    df = ks.read_csv(url)
    df = df.rename(columns={"timestamp": "Date"})
    df = df.set_index(df["Date"])
    df = df.sort_index()
    df = df.drop(columns=["open", "low", "high", "volume"])
    return df
Beispiel #13
0
 def check(header="infer", names=None, usecols=None):
     """Compare pandas and koalas CSV reads for one argument combination."""
     # `fn` and `self` come from the enclosing test's scope.
     opts = dict(header=header, names=names, usecols=usecols)
     expected = pd.read_csv(fn, **opts)
     actual = ks.read_csv(fn, **opts)
     self.assertPandasAlmostEqual(expected, actual.toPandas())
def get_ad_dict():
    """Build a deduplicated ad dictionary from train + test ads and save it.

    Concatenates both ad tables, drops exact duplicates, then numbers rows
    within each (product_id, product_category, advertiser_id, industry)
    group so creatives sharing the same ad attributes can be identified.
    """
    frames = [
        ks.read_csv("../data/train_preliminary/ad.csv"),
        ks.read_csv("../data/test/ad.csv"),
    ]
    ad_info = ks.concat(frames, axis=0).drop_duplicates()
    ad_dict_sql = '''
     select 
       creative_id,
       product_id,
       product_category,
       advertiser_id,
       industry,
       row_number()
       over (partition by product_id, product_category,advertiser_id,industry order by 1 desc) ad_rn
       from {ad_info}
    '''
    ad_info = ks.sql(ad_dict_sql, ad_info=ad_info)
    print(ad_info.nunique())
    ad_info.to_csv('../data/ad_info', index=False, num_files=1)
Beispiel #15
0
    def test_read_csv(self):
        """Check koalas.read_csv against pandas for header/names/usecols."""
        with self.csv_file(self.csv_text) as fn:

            def check(header='infer', names=None, usecols=None):
                expected = pd.read_csv(
                    fn, header=header, names=names, usecols=usecols)
                actual = koalas.read_csv(
                    fn, header=header, names=names, usecols=usecols)
                self.assertPandasAlmostEqual(expected, actual.toPandas())

            for kwargs in [dict(),
                           dict(header=None),
                           dict(header=0),
                           dict(names=['n', 'a']),
                           dict(header=0, names=['n', 'a']),
                           dict(usecols=[1]),
                           dict(usecols=[1, 0]),
                           dict(usecols=['amount']),
                           dict(usecols=['amount', 'name'])]:
                check(**kwargs)
            # Callable usecols requires pandas >= 0.20.
            if LooseVersion("0.20.0") <= LooseVersion(pd.__version__):
                check(usecols=lambda x: x == 'amount')
            for kwargs in [dict(usecols=[]),
                           dict(usecols=[1, 1]),
                           dict(usecols=['amount', 'amount'])]:
                check(**kwargs)
            if LooseVersion("0.20.0") <= LooseVersion(pd.__version__):
                check(usecols=lambda x: x == 'a')
            check(names=['n', 'a'], usecols=['a'])

            # check with pyspark patch.
            expected = pd.read_csv(fn)
            actual = koalas.read_csv(fn)
            self.assertPandasAlmostEqual(expected, actual.toPandas())

            # Invalid argument combinations must raise with these messages.
            for pattern, kwargs in [
                    ('non-unique', dict(names=['n', 'n'])),
                    ('Names do not match.*3', dict(names=['n', 'a', 'b'])),
                    ('Names do not match.*3',
                     dict(header=0, names=['n', 'a', 'b'])),
                    ('Usecols do not match.*3', dict(usecols=[1, 3])),
                    ('Usecols do not match.*col',
                     dict(usecols=['amount', 'col']))]:
                self.assertRaisesRegex(
                    ValueError, pattern,
                    lambda: koalas.read_csv(fn, **kwargs))
Beispiel #16
0
 def loadData(self, path=None):
     """Load the dataset CSV as a (koalas or pandas) DataFrame and shuffle it.

     Args:
         path: optional directory containing ``self.dataset_file_name``;
             when None, the file name is used as-is.

     Returns:
         The shuffled dataframe, also stored on ``self.dataset``.
     """
     # Fixed `path != None` to the idiomatic identity check (PEP 8).
     if path is not None:
         path = os.path.join(path, self.dataset_file_name)
     else:
         path = self.dataset_file_name

     # self.use_koalas selects the distributed (koalas) or local (pandas) reader.
     if self.use_koalas:
         dataset = ks.read_csv(path)
     else:
         dataset = pd.read_csv(path)

     # Shuffle the rows as part of preprocessing (frac=1.0 keeps every row).
     self.dataset = dataset.sample(frac=1.0)

     return self.dataset
Beispiel #17
0
 def test_read_with_spark_schema(self):
     """A Spark DDL string passed as ``names`` should define the columns."""
     with self.csv_file(self.csv_text_2) as fn:
         ddl = "A string, B string, C long, D long, E long"
         actual = ks.read_csv(fn, names=ddl)
         expected = pd.read_csv(fn, names=list("ABCDE"))
         self.assertEqual(repr(expected), repr(actual))
Beispiel #18
0
    def test_read_csv(self):
        """Validate ks.read_csv argument handling against pandas."""
        with self.csv_file(self.csv_text) as fn:

            def check(header="infer", names=None, usecols=None):
                expected = pd.read_csv(
                    fn, header=header, names=names, usecols=usecols)
                actual = ks.read_csv(
                    fn, header=header, names=names, usecols=usecols)
                self.assertPandasAlmostEqual(expected, actual.toPandas())

            ok_cases = [
                dict(),
                dict(header=None),
                dict(header=0),
                dict(names=["n", "a"]),
                dict(header=0, names=["n", "a"]),
                dict(usecols=[1]),
                dict(usecols=[1, 0]),
                dict(usecols=["amount"]),
                dict(usecols=["amount", "name"]),
                dict(usecols=[]),
                dict(usecols=[1, 1]),
                dict(usecols=["amount", "amount"]),
                dict(names=["n", "a"], usecols=["a"]),
            ]
            for kwargs in ok_cases:
                check(**kwargs)

            # check with pyspark patch.
            expected = pd.read_csv(fn)
            actual = ks.read_csv(fn)
            self.assertPandasAlmostEqual(expected, actual.toPandas())

            # Invalid argument combinations must raise with these messages.
            failing_cases = [
                ("non-unique", dict(names=["n", "n"])),
                ("does not match the number.*3", dict(names=["n", "a", "b"])),
                ("does not match the number.*3",
                 dict(header=0, names=["n", "a", "b"])),
                ("Usecols do not match.*3", dict(usecols=[1, 3])),
                ("Usecols do not match.*col", dict(usecols=["amount", "col"])),
                ("Unknown header argument 1", dict(header="1")),
                ("'usecols' must either be list-like of all strings, "
                 "all unicode, all integers or a callable.",
                 dict(usecols=[1, "amount"])),
            ]
            for pattern, kwargs in failing_cases:
                self.assertRaisesRegex(
                    ValueError, pattern,
                    lambda: ks.read_csv(fn, **kwargs))

            # check with index_col
            expected = pd.read_csv(fn).set_index("name")
            actual = ks.read_csv(fn, index_col="name")
            self.assertPandasAlmostEqual(expected, actual.toPandas())
Beispiel #19
0
        def __str__(self):
            # Delegate to the wrapped value's string form.
            return str(self.value)


    print((A(1) + 1) * 2)


    # Read the CSV with plain PySpark: explicit header, separator, and schema.
    spark = SparkSession.builder.master("local").getOrCreate()
    df = spark.read.format('csv')\
                  .option('header', True)\
                  .option('sep', ';')\
                  .schema("id INT, name STRING, surname STRING, age INT ")\
                  .load('user.csv')

    # The same read via koalas: a pandas-like API over Spark.
    import databricks.koalas as ks
    df = ks.read_csv('user.csv', sep=";", header=0)
    print(df[df['age'] > 30])

    # Conversions between koalas, Spark, and pandas representations.
    (df.to_spark())
    df.toPandas()

    # Pandas to koalas
    ks.from_pandas(df)
    # Spark to koalas
    ks.DataFrame(df.to_spark())

    # Equivalent explicit Spark schema built from StructType/StructField;
    # age is declared non-nullable via the False flag.
    from pyspark.sql.types import *
    schema = StructType(
        [StructField('id', IntegerType()), StructField('name', StringType()), StructField('surname', StringType()),
         StructField('age', IntegerType(), False)])
Beispiel #20
0
 def test_read_csv_with_parse_dates(self):
     """``parse_dates`` is unsupported and must raise ValueError."""
     with self.assertRaisesRegex(ValueError, 'parse_dates'):
         koalas.read_csv('path', parse_dates=True)
Beispiel #21
0
 def test_read_csv_with_mangle_dupe_cols(self):
     """``mangle_dupe_cols=False`` is unsupported and must raise ValueError."""
     with self.assertRaisesRegex(ValueError, 'mangle_dupe_cols'):
         koalas.read_csv('path', mangle_dupe_cols=False)
Beispiel #22
0
 def test_read_with_spark_schema(self):
     """``names`` given as a Spark DDL schema string should name the columns."""
     with self.csv_file(self.csv_text_2) as fn:
         ddl = "A string, B string, C long, D long, E long"
         actual = koalas.read_csv(fn, names=ddl)
         expected = pd.read_csv(fn, names=list('ABCDE'))
         self.assertEqual(repr(expected), repr(actual))
Beispiel #23
0
# %%
import numpy as np
import pandas as pd

# %% [markdown]
# ### Read CSV File.

# %%
# location of data
# NOTE(review): relies on `os`, `sys`, and `ks` (koalas) being imported in an
# earlier notebook cell — confirm against the full notebook.
data_path = os.path.join("data",
                         "nyc_restaurant_inspection_results_sample1.csv")

# %%
# import to koalas df
df = ks.read_csv(data_path)

# %%
# import to pandas df
pddf = pd.read_csv(data_path)

# %% [markdown]
# ### Memory usage

# %%
# Compare the shallow Python-object sizes of the two frames.
# NOTE(review): sys.getsizeof is shallow and does not count the data the
# frame references, so this understates real memory usage.
print("koalas memory usage is {m} bytes.".format(m=sys.getsizeof(df)))
print("pandas memory usage is {m:.2f} kilobytes.".format(
    m=sys.getsizeof(pddf) / 10**3))

# %% [markdown]
# ##  Selecting Rows and Columns