Example 1
# In[101]:

a1

# In[106]:

sorted(df['PassengerId'])

# In[107]:

df.columns

# In[109]:

newp = pd.Categorical(df['Pclass'])

# In[110]:

newp

# In[111]:

df['Cabin']

# In[112]:

import numpy as np

char_cabin = df['Cabin'].astype(str)
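The snippet above breaks off after converting Cabin to strings. A plausible continuation (an assumption, not part of the original notebook) is to take the first letter of each cabin as the deck and store it as a categorical:

new_cabin = np.array([cabin[0] for cabin in char_cabin])  # deck letter; missing values become 'n' (from the 'nan' strings)
df['Cabin'] = pd.Categorical(new_cabin)
df['Cabin'].describe()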
Example 2
        df3 = read_parquet(tmp, columns=['f', 'i32'], engine=engine)
        assert_eq(df[['f', 'i32']], df3, check_index=False)


@pytest.mark.parametrize('df,write_kwargs,read_kwargs', [
    (pd.DataFrame({'x': [3, 2, 1]}), {}, {}),
    (pd.DataFrame({'x': ['c', 'a', 'b']}), {
        'object_encoding': 'utf8'
    }, {}),
    (pd.DataFrame({'x': ['cc', 'a', 'bbb']}), {
        'object_encoding': 'utf8'
    }, {}),
    (pd.DataFrame({'x': [b'a', b'b', b'c']}), {
        'object_encoding': 'bytes'
    }, {}),
    (pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'])}), {
        'object_encoding': 'utf8'
    }, {
        'categories': ['x']
    }),
    (pd.DataFrame({'x': pd.Categorical([1, 2, 1])}), {}, {
        'categories': ['x']
    }),
    (pd.DataFrame({'x': list(map(pd.Timestamp, [3000, 2000, 1000]))}), {}, {}),
    (pd.DataFrame({
        'x': [3000, 2000, 1000]
    }).astype('M8[ns]'), {}, {}),
    pytest.mark.xfail((pd.DataFrame({
        'x': [3, 2, 1]
    }).astype('M8[ns]'), {}, {}),
                      reason="Parquet doesn't support nanosecond precision"),
Example 3
import pandas as pd  # missing from the excerpt
import matplotlib.pyplot as pyt
from scipy.stats import linregress

pd.set_option("max_rows", 999)
pd.set_option("max_columns", 999)

# Setting up the Input Path:
Input_Path = r"F:/OFZ/OneDrive - Anheuser-Busch InBev/_MUKIL_/00_WORK/00_PROJECTS/14_CHURN_PREDICTION_ONTRADE_UK/03.Output/Model_Input/"
Output_Path = r"F:/OFZ/OneDrive - Anheuser-Busch InBev/_MUKIL_/00_WORK/00_PROJECTS/14_CHURN_PREDICTION_ONTRADE_UK/03.Output/Model_Results/"
# %%
### Importing the data:
Data = pd.read_csv(Input_Path +
                   'POC_Churn_Status_20082020.csv').drop(columns='key')
Data.sort_values(by=['Outlet Id', 'Year', 'Month'], inplace=True)
Data_Trans = Data.copy(deep=True)
Data_Trans["Year_Month"] = pd.Categorical(Data_Trans["Year_Month"])

# %%
Raw_Data = pd.read_csv(Input_Path + 'Unpivoted_Data_Latest_25082020.csv')
Raw_Data.head()

# %%
### EDA:
print('Total Number of POCs by Year: ',
      Data[Data["Start_Restart"] != 'Inactive']["Outlet Id"].nunique())
print("Volume by Year: ")
Data.groupby(["Year"], as_index=False).agg({"Volume": sum})


# %%
### Functions:
Example 4

a2 = dat1.pivot_table(index=['Policy_Number'],columns='Main_Insurance_Coverage_Group',\
                      values=['Insured_Amount1', 'Insured_Amount2', 'Insured_Amount3',\
                              'Coverage_Deductible_if_applied'],fill_value=0)
a3 = dat1.groupby(by ='Policy_Number',axis=0,sort=False).Insurance_Coverage.value_counts().\
                reset_index(name='Insurance_Coverage_count')
a3 = a3.pivot_table(index='Policy_Number', columns='Insurance_Coverage',\
                    values='Insurance_Coverage_count',fill_value=0)

# Fill missing values
dat1.Vehicle_identifier = dat1.Vehicle_identifier.fillna(dat1.Policy_Number)
dat1.Prior_Policy_Number = dat1.Prior_Policy_Number.fillna('0')

dat1.count()[dat1.count()<1747942]

# Convert Insured's_ID to numeric codes
dat1["Insured's_ID"] = pd.Categorical(dat1["Insured's_ID"])
dat1["Insured's_ID"] = dat1["Insured's_ID"].cat.codes

dat1.Vehicle_identifier = pd.Categorical(dat1.Vehicle_identifier)
dat1.Vehicle_identifier = dat1.Vehicle_identifier.cat.codes
dat1.Vehicle_Make_and_Model1 = pd.Categorical(dat1.Vehicle_Make_and_Model1)
dat1.Vehicle_Make_and_Model1 = dat1.Vehicle_Make_and_Model1.cat.codes
dat1.Vehicle_Make_and_Model2 = pd.Categorical(dat1.Vehicle_Make_and_Model2)
dat1.Vehicle_Make_and_Model2 = dat1.Vehicle_Make_and_Model2.cat.codes
dat1.Distribution_Channel = pd.Categorical(dat1.Distribution_Channel)
dat1.Distribution_Channel = dat1.Distribution_Channel.cat.codes
dat1.aassured_zip = pd.Categorical(dat1.aassured_zip)
dat1.aassured_zip = dat1.aassured_zip.cat.codes
dat1.iply_area = pd.Categorical(dat1.iply_area)
dat1.iply_area = dat1.iply_area.cat.codes
dat1.Prior_Policy_Number = pd.Categorical(dat1.Prior_Policy_Number)
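Added sketch (with toy column names, not from the original script): the block of repeated pd.Categorical(...)/.cat.codes conversions above is equivalent to a simple loop over the object columns.

import pandas as pd

dat = pd.DataFrame({"make": ["vw", "bmw", "vw"], "area": ["north", "south", "north"]})
for col in ["make", "area"]:
    dat[col] = pd.Categorical(dat[col]).codes
    # equivalently: dat[col] = dat[col].astype('category').cat.codes
print(dat)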
Example 5
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
iris = pd.read_table("iris.txt", sep=',', names=('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm','PetalWidthCm','Species'))
print (iris.head())


X = iris.drop('Species',axis=1).values
y = pd.Categorical(iris['Species']).codes

from sklearn.cluster import KMeans
estimators = {'k_means_iris_3': KMeans(n_clusters=3),
              'k_means_iris_8': KMeans(n_clusters=8),
              'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1,
                                              init='random')}
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

fignum = 1
for name, est in estimators.items():
    fig = plt.figure(fignum, figsize=(8, 6))
    plt.clf()
    ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

    plt.cla()
    est.fit(X)
    labels = est.labels_

    ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(float))  # np.float was removed in NumPy 1.24

    ax.w_xaxis.set_ticklabels([])
    ax.w_yaxis.set_ticklabels([])
Example 6
 def test_constructor_empty_boolean(self):
     # see gh-22702
     cat = pd.Categorical([], categories=[True, False])
     categories = sorted(cat.categories.tolist())
     assert categories == [False, True]
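A small standalone illustration of the behaviour this test asserts (added here, not part of the pandas test suite):

import pandas as pd

cat = pd.Categorical([], categories=[True, False])
print(len(cat))              # 0 -- no values
print(list(cat.categories))  # [True, False] -- the declared categories are kept even when empty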
Example 7
def corpus_to_df(path, metadata_columns):
    def load_corpus_text(path_full, column_name):
        _, filenames, _ = loadCorpus(path_full)
        texts = []
        news = []
        unicodes_to_strip = {
            "\n\n": " ",
            "\n": " ",
            "\ufeff": "",
            "\x85": "",
            "\x91": "",
            "\x92": "",
            "\x93": "",
            "\x94": "",
            "\x96": "",
            "\x97": "",
            "\t": ""
        }
        #print(key for key in unicodes_to_strip)
        for file_name in filenames:
            with open(file_name, 'r', encoding='utf8') as text:
                news = text.read()
                for key in unicodes_to_strip:
                    news = news.replace(key, unicodes_to_strip[key])
                texts.append(news)

        text_df = pd.DataFrame(texts, columns=[column_name])

        return text_df

    def load_meta(path, metadata_columns):
        meta_ids = []
        meta_filenames = []
        meta_tags = []

        for filename in os.listdir(
                os.path.join(path, 'full_texts', 'true-meta-information')):
            meta_ids.append(filename.replace('-meta.txt', '-REAL'))
            meta_filenames.append(
                os.path.join(path, 'full_texts', 'true-meta-information',
                             filename))
            meta_tags.append('REAL')

        # From the fake news folder
        for filename in os.listdir(
                os.path.join(path, 'full_texts', 'fake-meta-information')):
            meta_ids.append(filename.replace('-meta.txt', '-FAKE'))
            meta_filenames.append(
                os.path.join(path, 'full_texts', 'fake-meta-information',
                             filename))
            meta_tags.append('FAKE')

        meta_ids, meta_filenames, meta_tags = (list(t) for t in zip(
            *sorted(zip(meta_ids, meta_filenames, meta_tags))))

        meta_ids = pd.DataFrame(meta_ids, columns=['Id'])
        meta_tags = pd.DataFrame(meta_tags, columns=['Tag'])

        metadatas = []
        for filename in meta_filenames:
            with open(filename, 'r', encoding='utf8') as text:
                metadatas.append(text.read().splitlines())

        data_df = pd.DataFrame(metadatas, columns=metadata_columns)
        meta_df = pd.concat([meta_ids, data_df, meta_tags], axis=1)

        #print(meta_df.head())
        #print(metadata_columns)
        return meta_df

    news_text_full_df = load_corpus_text(os.path.join(path, 'full_texts'),
                                         'news_text_full')
    news_text_normalized_df = load_corpus_text(
        os.path.join(path, 'size_normalized_texts'), 'news_text_normalized')
    news_meta_df = load_meta(path, metadata_columns)

    result_df = pd.concat(
        [news_text_full_df, news_text_normalized_df, news_meta_df], axis=1)
    #print(result_df)
    #print(ns.natsorted(result_df['Id'].unique()))

    result_df['Id'] = pd.Categorical(result_df['Id'],
                                     ordered=True,
                                     categories=ns.natsorted(
                                         result_df['Id'].unique()))
    result_df = result_df.sort_values('Id')

    result_df = result_df.set_index('Id')

    return result_df
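A minimal sketch of the ordering trick used at the end of corpus_to_df, with made-up IDs (natsort imported as ns, matching the alias above):

import pandas as pd
import natsort as ns

ids = pd.Series(["doc10-REAL", "doc2-FAKE", "doc1-REAL"])
ordered = pd.Categorical(ids, ordered=True, categories=ns.natsorted(ids.unique()))
print(pd.Series(ordered).sort_values())
# doc1-REAL, doc2-FAKE, doc10-REAL -- natural order instead of lexicographic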
Example 8
import pandas as pd
import seaborn as sns
import sys
import numpy as np
import matplotlib.pyplot as plt   # needed for the figure below; missing from the excerpt
import matplotlib.image as mpimg  # needed for mpimg.imread; missing from the excerpt

img = mpimg.imread('floorplan2.png')
#img = cv2.imread('floorplan-n.png')

data = pd.read_csv('input.csv')
 
# Transform it to a long format
df=data.unstack().reset_index()
df.columns=["X","Y","Z"]
 
# And transform the old column name in something numeric
df['X']=pd.Categorical(df['X'])
df['X']=df['X'].cat.codes
 
# Make the plot
fig = plt.figure()
ax = fig.add_subplot(projection='3d')  # fig.gca(projection=...) was removed in Matplotlib 3.6


ax.set_zlim3d(45, 80)
surf=ax.plot_trisurf(df['Y'], df['X'], df['Z'], cmap=plt.cm.jet, linewidth=0.2, vmin=60, vmax=80)


height, width = img.shape[:2]
# 64.0 is the length of the x and y axes of the surface
stepX, stepY = 64.0/width, 64.0/height
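The snippet stops before the image is mapped onto the surface. Here is a self-contained sketch of just the categorical-codes-as-axis pattern it uses, on synthetic data (column labels and values are made up):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.DataFrame(np.random.rand(10, 5) * 20 + 60, columns=list("ABCDE"))

# wide -> long, then turn the old column labels into numeric category codes
df = data.unstack().reset_index()
df.columns = ["X", "Y", "Z"]
df["X"] = pd.Categorical(df["X"]).codes

fig = plt.figure()
ax = fig.add_subplot(projection="3d")
ax.plot_trisurf(df["Y"], df["X"], df["Z"], cmap=plt.cm.jet, linewidth=0.2)
plt.show()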
Example 9
s = pd.Series([1,3,5,np.nan,6,8])
print(s)


# Create a Series of dates
# 6 days starting from 2019/01/01
dates1 = pd.date_range('20190101', periods=6)
print(dates1)


# DataFrame with A: numeric, B: date, C: 1, D: 3, E: test/train, F: foo
df = pd.DataFrame({'A':1.,
                    'B':pd.Timestamp('20130102'),
                    'C':pd.Series(1,index=list(range(4)), dtype='float32'),
                    'D':np.array([3]*4, dtype='int32'),
                    'E':pd.Categorical(["test", "train", "test", "train"]),
                    'F':'foo'})
print(df)

# Check the dtype of each column
print("type of df:")
print(df.dtypes)


# Convert a matrix (ndarray) to a DataFrame
matrix = np.random.randn(6,4) # 6 rows x 4 columns
df2 = pd.DataFrame(matrix, columns=list('ABCD')) # data: matrix, column names: columns
print(df2)

# Extract the head
# first 3 rows
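The snippet is cut off right after the comment; the continuation it implies (an assumption) would simply be:

print(df.head(3))  # first 3 rows of the df built above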
Example 10
              check_index=False, check_divisions=should_check_divs(engine))

    df3 = dd.read_parquet(tmp,
                          columns=['f', 'i32'],
                          engine=engine,
                          infer_divisions=should_check_divs(engine))
    assert_eq(df[['f', 'i32']], df3,
              check_index=False, check_divisions=should_check_divs(engine))


@pytest.mark.parametrize('df,write_kwargs,read_kwargs', [
    (pd.DataFrame({'x': [3, 2, 1]}), {}, {}),
    (pd.DataFrame({'x': ['c', 'a', 'b']}), {'object_encoding': 'utf8'}, {}),
    (pd.DataFrame({'x': ['cc', 'a', 'bbb']}), {'object_encoding': 'utf8'}, {}),
    (pd.DataFrame({'x': [b'a', b'b', b'c']}), {'object_encoding': 'bytes'}, {}),
    (pd.DataFrame({'x': pd.Categorical(['a', 'b', 'a'])}),
     {'object_encoding': 'utf8'}, {'categories': ['x']}),
    (pd.DataFrame({'x': pd.Categorical([1, 2, 1])}), {}, {'categories': ['x']}),
    (pd.DataFrame({'x': list(map(pd.Timestamp, [3000, 2000, 1000]))}), {}, {}),
    (pd.DataFrame({'x': [3000, 2000, 1000]}).astype('M8[ns]'), {}, {}),
    pytest.mark.xfail((pd.DataFrame({'x': [3, 2, 1]}).astype('M8[ns]'), {}, {}),
                      reason="Parquet doesn't support nanosecond precision"),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('M8[us]'), {}, {}),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('M8[ms]'), {}, {}),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('uint16'), {}, {}),
    (pd.DataFrame({'x': [3, 2, 1]}).astype('float32'), {}, {}),
    (pd.DataFrame({'x': [3, 1, 2]}, index=[3, 2, 1]), {}, {}),
    (pd.DataFrame({'x': [3, 1, 5]}, index=pd.Index([1, 2, 3], name='foo')), {}, {}),
    (pd.DataFrame({'x': [1, 2, 3],
                   'y': [3, 2, 1]}), {}, {}),
    (pd.DataFrame({'x': [1, 2, 3],
df2 = pd.read_csv("cluster1.csv")
df2 = df2.fillna(0)

# In[10]:

### cluster_ convert the string columns to numeric codes
name = [
    "CountryCitizen", "CountryLive", "EmploymentField", "EmploymentStatus",
    "JobApplyWhen", "JobPref", "JobRoleInterest", "JobWherePref",
    "LanguageAtHome", "MaritalStatus", "SchoolDegree", "SchoolMajor"
]

for i in name:
    df2[i] = pd.Categorical(df2[i])
    df2[i] = df2[i].cat.codes

# In[71]:

### cluster_ normalize the data
from sklearn.preprocessing import StandardScaler  # missing from the excerpt

X = np.array(df2)
X = StandardScaler().fit_transform(X)
print(X.shape)

# In[97]:

### try different numbers of clusters
data_num = X.shape[0]
err_clustering = np.zeros([21, 1])
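The excerpt ends just before the loop that the comment announces. A sketch of what such a loop typically looks like (assumed, with stand-in data; the real script would use the X built above):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(np.random.rand(100, 4))  # stand-in for the encoded survey data
err_clustering = np.zeros([21, 1])
for k in range(1, 21):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    err_clustering[k] = km.inertia_  # within-cluster sum of squares for k clusters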
Example 12

import feather
import math  # built in
import matplotlib.pyplot as plt
import numpy as np
import os  # built in
import pandas as pd
import random  # built in

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

c4 = pd.read_csv('../data/raw/c4-game-database.csv')
c4.dropna(inplace=True)
for p in range(1, 43):
    c4[f"pos_{p:02d}"] = pd.Categorical(c4[f"pos_{p:02d}"].astype(int))

c4['winner'] = pd.Categorical(c4['winner'].astype(int))

c4.to_csv('../data/processed/c4-game-database.csv', index=False)

# X = ttt.iloc[:, 0:9].values
# y = ttt.iloc[:, 9:10].values

# # Encode categorical variables as numeric
# labelencoder_X = LabelEncoder()
# for _ in range(9):
#     X[:, _] = labelencoder_X.fit_transform(X[:, _])

# # Onehot encode all dependent categorical variables
# onehotencoder = OneHotEncoder(categorical_features = [0,1,2,3,4,5,6,7,8])
Example 13
def tocategory(col):
    return pd.Categorical(col)
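A possible way the helper might be used (added illustration, not from the source):

import pandas as pd

df = pd.DataFrame({"city": ["NY", "LA", "NY"], "tier": ["a", "b", "a"]})
for col in ["city", "tier"]:
    df[col] = tocategory(df[col])  # each column becomes a pandas Categorical
print(df.dtypes)                   # both columns now have dtype 'category'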
Example 14
def process_week(config, source, week_file):
    """Process a single week file

    * Retrieve the file, extracting the photon and spacecraft info
    * Select photons near the source,
    * Determine exposure for the direction
    * Use the weight table to add weights to photon data, selecting photons with weight info
    -- in progress --
    * Use the exposure to assign an exposure to each photon.


    """

    with open(week_file, 'rb') as inp:
        week = pickle.load(inp)

    pdf = _get_photons_near_source(config, source, week)
    edf = _calculate_exposure_for_source(config, source, week)
    if config.verbose > 2:
        print(f'\n\t-->Selected {len(pdf)} photons')

    # add weights
    if pdf is None or len(pdf) < 3 or len(edf) == 0:
        return None, edf

    add_weights(config, pdf, source)

    if 'run_id' in pdf:
        # expint = np.empty(2*len(edf))
        estart = edf.start.values
        estop = edf.stop.values
        exptime = np.append(estart, estop[-1])
        expval = edf.exp.values
        expcth = edf.cos_theta.values

        # corresponding cumulative exposure -- in m^2
        cumexp = np.insert(np.cumsum(edf.exp.values / 1e4), 0, 0)
        # i = np.searchsorted(expint[0::2], MJD(pdf.iloc[0].run_id) )

        runs = pdf.groupby('run_id')
        last_run = 0
        tau = []
        time = []
        run_id = []
        for run, g in runs:
            assert run > last_run
            run_id += [run] * len(g)
            last_run = run

            # assemble MJD time from run_id and trun
            runstart = MJD(float(run))
            rtime = MJD(float(run) + g.trun * config.offset_size)
            time += list(rtime)

            # cumexp at run start
            run_cumexp = cumexp[np.searchsorted(estart, runstart)]

            # use event times in this run to interpolate table of exposure times, cumexp
            event_cumexp = np.interp(rtime, exptime, cumexp)

            # diffs, from first --> tau
            event_exp = np.diff(np.insert(event_cumexp, 0, run_cumexp))
            tau += list(event_exp)

#             # extract cos_theta at event_time? should interpolate maybe
#             cth += expcth[np.searchsorted(rtime, estart )]

        # update pdf
        pdf.loc[:, 'tau'] = np.array(tau, np.float32)
        pdf.loc[:, 'time'] = time
        pdf.drop(columns='trun', inplace=True)
        pdf.loc[:, 'run_id'] = pd.Categorical(run_id)

    else:  # zap legacy for now
        for check in 'etime event run_diff rtime run'.split():
            if check in pdf:
                if config.verbose > 2: print(f'remove {check}')
                pdf.drop(columns=check, inplace=True)

    # final attempt to do this
    pdf.loc[:, 'weight'] = pdf['weight'].astype(np.float32)

    return pdf, edf
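A toy illustration (not from the source) of the exposure bookkeeping in the run loop above: cumulative exposure is interpolated at the event times and differenced to obtain tau.

import numpy as np

estart = np.array([0.0, 1.0, 2.0])        # exposure-interval starts (days)
estop  = np.array([1.0, 2.0, 3.0])        # exposure-interval stops
exp    = np.array([5.0, 7.0, 6.0])        # exposure per interval

exptime = np.append(estart, estop[-1])    # interval edges: [0, 1, 2, 3]
cumexp = np.insert(np.cumsum(exp), 0, 0)  # cumulative exposure at the edges: [0, 5, 12, 18]

event_times = np.array([0.5, 1.5, 2.5])
event_cumexp = np.interp(event_times, exptime, cumexp)  # exposure accumulated up to each event
tau = np.diff(np.insert(event_cumexp, 0, cumexp[0]))    # exposure since the previous event
print(tau)   # [2.5 6.  6.5]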
Example 15
 def test_categorical(self, fp):
     if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"):
         pytest.skip("CategoricalDtype not supported for older fp")
     df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
     check_round_trip(df, fp)
Example 16
#
# plt.plot(X, y, 'ro')
# plt.plot(X, X*lr.coef_ + lr.intercept_)
# plt.grid(True)
# plt.show()
#
# print('Delivery time for a 200 m delivery distance\n', lr.predict(np.array([[200]])))

# Multivariate linear regression
# Predict newborn weight from smoking status and gestational week

path1 = 'C:/Users/TJ/Google 드라이브/학습자료/프로그래밍/data science/Sample data/r/pregnant.txt'
mother = pd.read_csv(path1, sep='\t', engine='python')
print(mother['Smoke'][:5])

mother['Smoke'] = pd.Categorical(mother['Smoke'])
mother['Smoke'] = mother['Smoke'].cat.codes

print(mother['Smoke'][:5])

# Draw a scatter plot
# plt.plot(mother['Week'], mother['Wgt'], 'go')
plt.scatter(mother['Week'], mother['Wgt'], c=mother['Smoke'])
plt.show()

# Build the linear regression model
from sklearn.linear_model import LinearRegression  # import missing from the excerpt

lr = LinearRegression()
Xvar = ["Week", "Smoke"]
lr.fit(mother[Xvar], mother['Wgt'])

print('Coefficients:', lr.coef_)  # weights
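A self-contained sketch of the same pattern with made-up numbers: encode the categorical predictor as codes, fit a multivariate regression, and predict.

import pandas as pd
from sklearn.linear_model import LinearRegression

toy = pd.DataFrame({"Week":  [36, 38, 40, 39, 41],
                    "Smoke": ["yes", "no", "no", "yes", "no"],
                    "Wgt":   [2800, 3100, 3400, 2900, 3500]})
toy["Smoke"] = pd.Categorical(toy["Smoke"]).codes  # no -> 0, yes -> 1

lr = LinearRegression().fit(toy[["Week", "Smoke"]], toy["Wgt"])
print(lr.coef_, lr.intercept_)
print(lr.predict(pd.DataFrame({"Week": [40], "Smoke": [1]})))  # predicted weight for a smoker at week 40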
Example 17
 def test_constructor_np_strs(self):
     # GH#31499 Hastable.map_locations needs to work on np.str_ objects
     cat = pd.Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
     assert all(isinstance(x, np.str_) for x in cat.categories)
Example 18
    def read_metadata(
        fs,
        paths,
        categories=None,
        index=None,
        gather_statistics=None,
        filters=None,
        split_row_groups=True,
        **kwargs,
    ):
        # Define the dataset object to use for metadata,
        # Also, initialize `parts`.  If `parts` is populated here,
        # then each part will correspond to a file.  Otherwise, each part will
        # correspond to a row group (populated below)
        parts, dataset = _determine_dataset_parts(
            fs, paths, gather_statistics, filters, kwargs.get("dataset", {})
        )
        # Check if the column-chunk file_path's are set in "_metadata".
        # If available, we can use the path to sort the row-groups
        col_chunk_paths = False
        if dataset.metadata:
            col_chunk_paths = all(
                dataset.metadata.row_group(i).column(0).file_path is not None
                for i in range(dataset.metadata.num_row_groups)
            )

        # TODO: Call to `_determine_dataset_parts` uses `pq.ParquetDataset`
        # to define the `dataset` object. `split_row_groups` should be passed
        # to that constructor once it is supported (see ARROW-2801).
        if dataset.partitions is not None:
            partitions = [
                n for n in dataset.partitions.partition_names if n is not None
            ]
            if partitions and dataset.metadata:
                # Don't use dataset.metadata for partitioned datasets, unless
                # the column-chunk metadata includes the `"file_path"`.
                # The order of dataset.metadata.row_group items is often
                # different than the order of `dataset.pieces`.
                if not col_chunk_paths or (
                    len(dataset.pieces) != dataset.metadata.num_row_groups
                ):
                    dataset.schema = dataset.metadata.schema
                    dataset.metadata = None
        else:
            partitions = []

        # Statistics are currently collected at the row-group level only.
        # Therefore, we cannot perform filtering with split_row_groups=False.
        # For "partitioned" datasets, each file (usually) corresponds to a
        # row-group anyway.
        # TODO: Map row-group statistics onto file pieces for filtering.
        #       This shouldn't be difficult if `col_chunk_paths==True`
        if not split_row_groups and not col_chunk_paths:
            if gather_statistics is None and not partitions:
                gather_statistics = False
                if filters:
                    raise ValueError(
                        "Filters not supported with split_row_groups=False "
                        "(unless proper _metadata is available)."
                    )
            if gather_statistics and not partitions:
                raise ValueError(
                    "Statistics not supported with split_row_groups=False "
                    "(unless proper _metadata is available)."
                )

        if dataset.metadata:
            schema = dataset.metadata.schema.to_arrow_schema()
        else:
            schema = dataset.schema.to_arrow_schema()
        columns = None

        has_pandas_metadata = (
            schema.metadata is not None and b"pandas" in schema.metadata
        )

        if has_pandas_metadata:
            pandas_metadata = json.loads(schema.metadata[b"pandas"].decode("utf8"))
            (
                index_names,
                column_names,
                storage_name_mapping,
                column_index_names,
            ) = _parse_pandas_metadata(pandas_metadata)
            if categories is None:
                categories = []
                for col in pandas_metadata["columns"]:
                    if (col["pandas_type"] == "categorical") and (
                        col["name"] not in categories
                    ):
                        categories.append(col["name"])
        else:
            index_names = []
            column_names = schema.names
            storage_name_mapping = {k: k for k in column_names}
            column_index_names = [None]

        if index is None and index_names:
            index = index_names

        if set(column_names).intersection(partitions):
            raise ValueError(
                "partition(s) should not exist in columns.\n"
                "categories: {} | partitions: {}".format(column_names, partitions)
            )

        column_names, index_names = _normalize_index_columns(
            columns, column_names + partitions, index, index_names
        )

        all_columns = index_names + column_names

        pieces = sorted(dataset.pieces, key=lambda piece: natural_sort_key(piece.path))

        # Check that categories are included in columns
        if categories and not set(categories).intersection(all_columns):
            raise ValueError(
                "categories not in available columns.\n"
                "categories: {} | columns: {}".format(categories, list(all_columns))
            )

        dtypes = _get_pyarrow_dtypes(schema, categories)
        dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

        index_cols = index or ()
        meta = _meta_from_dtypes(all_columns, dtypes, index_cols, column_index_names)

        meta = clear_known_categories(meta, cols=categories)
        if (
            gather_statistics is None
            and dataset.metadata
            and dataset.metadata.num_row_groups >= len(pieces)
        ):
            gather_statistics = True
        if not pieces:
            gather_statistics = False

        if filters:
            # Filters may require us to gather statistics
            if gather_statistics is False and partitions:
                warnings.warn(
                    "Filtering with gather_statistics=False. "
                    "Only partition columns will be filtered correctly."
                )
            elif gather_statistics is False:
                raise ValueError("Cannot apply filters with gather_statistics=False")
            elif not gather_statistics:
                gather_statistics = True

        row_groups_per_piece = None
        if gather_statistics:
            # Read from _metadata file
            if dataset.metadata and dataset.metadata.num_row_groups >= len(pieces):
                row_groups = [
                    dataset.metadata.row_group(i)
                    for i in range(dataset.metadata.num_row_groups)
                ]

                # Re-order row-groups by path name if known
                if col_chunk_paths:
                    row_groups = sorted(
                        row_groups,
                        key=lambda row_group: natural_sort_key(
                            row_group.column(0).file_path
                        ),
                    )

                if split_row_groups and len(dataset.paths) == 1:
                    row_groups_per_piece = _get_row_groups_per_piece(
                        pieces, dataset.metadata, dataset.paths[0], fs
                    )
                names = dataset.metadata.schema.names
            else:
                # Read from each individual piece (quite possibly slow).
                row_groups, row_groups_per_piece = _get_md_row_groups(pieces)
                if row_groups:
                    piece = pieces[0]
                    md = piece.get_metadata()
                    names = md.schema.names
                else:
                    gather_statistics = False

        if gather_statistics:
            stats = []
            skip_cols = set()  # Columns with min/max = None detected
            path_last = None
            for ri, row_group in enumerate(row_groups):
                s = {"num-rows": row_group.num_rows, "columns": []}
                for i, name in enumerate(names):
                    if name not in skip_cols:
                        column = row_group.column(i)
                        d = {"name": name}
                        if column.statistics:
                            cs_min = column.statistics.min
                            cs_max = column.statistics.max
                            if not column.statistics.has_min_max:
                                cs_min, cs_max = None, None
                            if None in [cs_min, cs_max] and ri == 0:
                                skip_cols.add(name)
                                continue
                            cs_vals = pd.Series([cs_min, cs_max])
                            d.update(
                                {
                                    "min": cs_vals[0],
                                    "max": cs_vals[1],
                                    "null_count": column.statistics.null_count,
                                }
                            )
                        s["columns"].append(d)
                s["total_byte_size"] = row_group.total_byte_size
                if col_chunk_paths:
                    s["file_path_0"] = row_group.column(0).file_path
                    if not split_row_groups and (s["file_path_0"] == path_last):
                        # Rather than appending a new "row-group", just merge
                        # new `s` statistics into last element of `stats`.
                        # Note that each stats element will now correspond to an
                        # entire file (rather than actual "row-groups")
                        _merge_statistics(stats, s)
                        continue
                    else:
                        path_last = s["file_path_0"]
                stats.append(s)
        else:
            stats = None

        if dataset.partitions:
            for partition in dataset.partitions:
                if isinstance(index, list) and partition.name == index[0]:
                    meta.index = pd.CategoricalIndex(
                        categories=partition.keys, name=index[0]
                    )
                elif partition.name == meta.index.name:
                    meta.index = pd.CategoricalIndex(
                        categories=partition.keys, name=meta.index.name
                    )
                elif partition.name in meta.columns:
                    meta[partition.name] = pd.Categorical(
                        categories=partition.keys, values=[]
                    )

        # Create `parts`
        # This is a list of row-group-descriptor dicts, or file-paths
        # if we have a list of files and gather_statistics=False
        if not parts:
            if split_row_groups and row_groups_per_piece:
                # TODO: This block can be removed after ARROW-2801
                parts = []
                rg_tot = 0
                for i, piece in enumerate(pieces):
                    num_row_groups = row_groups_per_piece[i]
                    for rg in range(num_row_groups):
                        parts.append((piece.path, rg, piece.partition_keys))
                        # Setting file_path here, because it may be
                        # missing from the row-group/column-chunk stats
                        if "file_path_0" not in stats[rg_tot]:
                            stats[rg_tot]["file_path_0"] = piece.path
                        rg_tot += 1
            else:
                parts = [
                    (piece.path, piece.row_group, piece.partition_keys)
                    for piece in pieces
                ]
        parts = [
            {
                "piece": piece,
                "kwargs": {"partitions": dataset.partitions, "categories": categories},
            }
            for piece in parts
        ]

        return (meta, stats, parts)
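For context, a hedged sketch of the public call that ends up in this read_metadata path (the path and column name are placeholders): passing categories= asks read_parquet to load those columns as categoricals.

import dask.dataframe as dd

ddf = dd.read_parquet("data/*.parquet", engine="pyarrow", categories=["city"])
print(ddf.dtypes)  # 'city' comes back as a category dtype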
Example 19
 def test_constructor_string_and_tuples(self):
     # GH 21416
     c = pd.Categorical(
         np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
     expected_index = pd.Index([("a", "b"), ("b", "a"), "c"])
     assert c.categories.equals(expected_index)
Example 20
     ["1H", "2H"],
     np.dtype("timedelta64[ns]"),
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
 (
     pd.TimedeltaIndex(["1H", "2H"]),
     np.dtype("timedelta64[ns]"),
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
 (
     pd.TimedeltaIndex(["1H", "2H"]),
     None,
     TimedeltaArray._from_sequence(["1H", "2H"]),
 ),
 # Category
 (["a", "b"], "category", pd.Categorical(["a", "b"])),
 (
     ["a", "b"],
     pd.CategoricalDtype(None, ordered=True),
     pd.Categorical(["a", "b"], ordered=True),
 ),
 # Interval
 (
     [pd.Interval(1, 2, "right"),
      pd.Interval(3, 4, "right")],
     "interval",
     IntervalArray.from_tuples([(1, 2), (3, 4)], "right"),
 ),
 # Sparse
 ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
 # IntegerNA