import ast
import json
from itertools import zip_longest

import pandas as pd


def table_to_blockmanager(options, table, memory_pool, nthreads=1,
                          categoricals=None):
    import pandas.core.internals as _int
    import pyarrow.lib as lib

    index_columns = []
    columns = []
    column_indexes = []
    index_arrays = []
    index_names = []
    schema = table.schema
    row_count = table.num_rows
    metadata = schema.metadata
    columns_metadata = None

    has_pandas_metadata = metadata is not None and b'pandas' in metadata

    if has_pandas_metadata:
        pandas_metadata = json.loads(metadata[b'pandas'].decode('utf8'))
        index_columns = pandas_metadata['index_columns']
        columns = pandas_metadata['columns']
        column_indexes = pandas_metadata.get('column_indexes', [])
        table = _add_any_metadata(table, pandas_metadata)
        columns_metadata = pandas_metadata.get('columns', None)

    block_table = table

    # Build up a list of index columns and names while removing those columns
    # from the original table
    logical_index_names = [c['name'] for c in columns[-len(index_columns):]]
    for raw_name, logical_name in zip(index_columns, logical_index_names):
        i = schema.get_field_index(raw_name)
        if i != -1:
            col = table.column(i)
            col_pandas = col.to_pandas()
            values = col_pandas.values
            if hasattr(values, 'flags') and not values.flags.writeable:
                # ARROW-1054: in pandas 0.19.2, factorize will reject
                # non-writeable arrays when calling MultiIndex.from_arrays
                values = values.copy()

            index_arrays.append(pd.Series(values, dtype=col_pandas.dtype))
            index_names.append(
                backwards_compatible_index_name(raw_name, logical_name))
            block_table = block_table.remove_column(
                block_table.schema.get_field_index(raw_name))

    # Convert an arrow table to Block from the internal pandas API
    result = lib.table_to_blocks(options, block_table, nthreads, memory_pool)

    # Construct the individual blocks converting dictionary types to pandas
    # categorical types and Timestamps-with-timezones types to the proper
    # pandas Blocks
    blocks = []
    for item in result:
        block_arr = item['block']
        placement = item['placement']
        if 'dictionary' in item:
            cat = pd.Categorical(block_arr,
                                 categories=item['dictionary'],
                                 ordered=item['ordered'], fastpath=True)
            block = _int.make_block(cat, placement=placement,
                                    klass=_int.CategoricalBlock,
                                    fastpath=True)
        elif 'timezone' in item:
            dtype = make_datetimetz(item['timezone'])
            block = _int.make_block(block_arr, placement=placement,
                                    klass=_int.DatetimeTZBlock,
                                    dtype=dtype, fastpath=True)
        else:
            block = _int.make_block(block_arr, placement=placement)
        blocks.append(block)

    # Construct the row index
    if len(index_arrays) > 1:
        index = pd.MultiIndex.from_arrays(index_arrays, names=index_names)
    elif len(index_arrays) == 1:
        index = pd.Index(index_arrays[0], name=index_names[0])
    else:
        index = pd.RangeIndex(row_count)

    column_strings = [x.name for x in block_table.itercolumns()]
    if columns_metadata is not None:
        columns_name_dict = dict(
            (str(x['name']), x['name']) for x in columns_metadata)
        columns_values = [
            columns_name_dict[y] if y in columns_name_dict.keys() else y
            for y in column_strings
        ]
    else:
        columns_values = column_strings

    # If we're passed multiple column indexes then evaluate with
    # ast.literal_eval, since the column index values show up as a list of
    # tuples
    to_pair = ast.literal_eval if len(column_indexes) > 1 else lambda x: (x,)

    # Create the column index

    # Construct the base index
    if not columns_values:
        columns = pd.Index(columns_values)
    else:
        columns = pd.MultiIndex.from_tuples(
            list(map(to_pair, columns_values)),
            names=[col_index['name'] for col_index in column_indexes] or None,
        )

    # if we're reconstructing the index
    if has_pandas_metadata:
        # Get levels and labels, and provide sane defaults if the index has
        # a single level to avoid if/else spaghetti.
        levels = getattr(columns, 'levels', None) or [columns]
        labels = getattr(columns, 'labels', None) or [
            pd.RangeIndex(len(level)) for level in levels
        ]

        # Convert each level to the dtype provided in the metadata
        levels_dtypes = [(level, col_index.get('numpy_type', level.dtype))
                         for level, col_index in zip_longest(
                             levels, column_indexes, fillvalue={})]
        new_levels = [
            _level if _level.dtype == _dtype else _level.astype(_dtype)
            for _level, _dtype in levels_dtypes
        ]
        columns = pd.MultiIndex(levels=new_levels, labels=labels,
                                names=columns.names)

    # ARROW-1751: flatten a single level column MultiIndex for pandas 0.21.0
    columns = _flatten_single_level_multiindex(columns)

    axes = [columns, index]
    return _int.BlockManager(blocks, axes)
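# Note on the `labels=` keyword used above (and in several snippets below):
# pandas renamed the MultiIndex constructor argument to `codes=` in 0.24 and
# removed `labels=` in 1.0. A minimal, version-tolerant sketch (standalone;
# not pyarrow code):
import pandas as pd

levels = [['bar', 'foo'], [0, 1, 2, '']]
codes = [[1, 1, 1, 0], [0, 1, 2, 3]]

try:
    idx = pd.MultiIndex(levels=levels, codes=codes)   # pandas >= 0.24
except TypeError:
    idx = pd.MultiIndex(levels=levels, labels=codes)  # older pandas

print(idx)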
def test_make_index_list_multiple_different_levels(self):
    idx = self.sel.make_index([['foo', [0, 1, 2]], ['bar']])
    assert_index_equal(idx,
                       pd.MultiIndex(levels=[['bar', 'foo'],
                                             [0, 1, 2, '']],
                                     labels=[[1, 1, 1, 0],
                                             [0, 1, 2, 3]]))
def test_get_index_list(self):
    idx = self.sel.get_index(self.df, [['foo', 'mof', '*']])
    assert_index_equal(
        idx,
        pd.MultiIndex(levels=[['foo'], ['mof'], [0, 1, 2]],
                      labels=[[0, 0, 0], [0, 0, 0], [0, 1, 2]]))
        'spam', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham',
        'ham', 'ham', 'spam', 'ham', 'spam', 'spam']

ac = Counter(actual_labels)
pc = Counter(predicted_labels)

print('Actual counts:', ac.most_common())
print('Predicted counts:', pc.most_common())

cm = metrics.confusion_matrix(y_true=actual_labels,
                              y_pred=predicted_labels,
                              labels=['spam', 'ham'])
print(pd.DataFrame(
    data=cm,
    columns=pd.MultiIndex(levels=[['Predicted:'], ['spam', 'ham']],
                          labels=[[0, 0], [0, 1]]),
    index=pd.MultiIndex(levels=[['Actual:'], ['spam', 'ham']],
                        labels=[[0, 0], [0, 1]])))

positive_class = 'spam'

true_positive = 5.
false_positive = 6.
false_negative = 5.
true_negative = 4.
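# A minimal sketch deriving the usual metrics from the confusion-matrix
# cells named above (the counts are the ones given in the snippet):
tp, fp, fn, tn = 5.0, 6.0, 5.0, 4.0

accuracy = (tp + tn) / (tp + fp + fn + tn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * precision * recall / (precision + recall)

print('accuracy=%.2f precision=%.2f recall=%.2f f1=%.2f'
      % (accuracy, precision, recall, f1))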
def test_make_index_str_multiple_different_levels(self):
    idx = self.sel.make_index('/foo[0:3],/bar')
    assert_index_equal(idx,
                       pd.MultiIndex(levels=[['bar', 'foo'],
                                             [0, 1, 2, '']],
                                     labels=[[1, 1, 1, 0],
                                             [0, 1, 2, 3]]))
def test_filter_meta_index(test_df):
    obs = test_df.filter(scenario="scen_b").meta.index
    exp = pd.MultiIndex(levels=[["model_a"], ["scen_b"]],
                        codes=[[0], [0]],
                        names=["model", "scenario"])
    pd.testing.assert_index_equal(obs, exp)
import datetime as dt
import logging
import sys

import numpy as np
import pandas as pd


def ds_to_df(inputs):
    """
    Convert the DispaSET data format into a dictionary of dataframes.

    :param inputs: input file
    :return: dictionary of dataframes
    """
    sets, parameters = inputs['sets'], inputs['parameters']

    # config = parameters['Config']['val']
    try:
        config = inputs['config']
        first_day = dt.datetime(config['StartDate'][0],
                                config['StartDate'][1],
                                config['StartDate'][2], 0)
        last_day = dt.datetime(config['StopDate'][0],
                               config['StopDate'][1],
                               config['StopDate'][2], 23)
        dates = pd.date_range(start=first_day, end=last_day, freq='1h')
        timeindex = True
    except Exception:
        logging.warning('Could not find the start/stop date information '
                        'in the inputs. Using an integer index')
        dates = range(1, len(sets['z']) + 1)
        timeindex = False

    if len(dates) > len(sets['h']):
        logging.error('The provided index has a length of ' +
                      str(len(dates)) + ' while the data only comprises ' +
                      str(len(sets['h'])) + ' time elements')
        sys.exit(1)
    elif len(dates) != len(sets['z']):
        logging.warning('The provided index has a length of ' +
                        str(len(dates)) +
                        ' while the simulation was designed for ' +
                        str(len(sets['z'])) + ' time elements')

    idx = range(len(dates))

    out = {}
    out['sets'] = sets

    # Printing each parameter in a separate sheet and workbook:
    for p in parameters:
        var = parameters[p]
        dim = len(var['sets'])
        if var['sets'][-1] == 'h' and timeindex and dim > 1:
            # if len(dates) != var['val'].shape[-1]:
            #     sys.exit('The date range in the Config variable (' +
            #              str(len(dates)) + ' time steps) does not match '
            #              'the length of the time index (' +
            #              str(var['val'].shape[-1]) +
            #              ') for variable ' + p)
            var['firstrow'] = 5
        else:
            var['firstrow'] = 1
        if dim == 1:
            if var['sets'][0] == 'h':
                out[p] = pd.DataFrame(var['val'][idx], columns=[p],
                                      index=dates)
            else:
                out[p] = pd.DataFrame(var['val'], columns=[p],
                                      index=sets[var['sets'][0]])
        elif dim == 2:
            values = var['val']
            list_sets = [sets[var['sets'][0]], sets[var['sets'][1]]]
            if var['sets'][1] == 'h':
                out[p] = pd.DataFrame(values.transpose()[idx, :],
                                      index=dates, columns=list_sets[0])
            else:
                out[p] = pd.DataFrame(values.transpose(),
                                      index=list_sets[1],
                                      columns=list_sets[0])
        elif dim == 3:
            list_sets = [sets[var['sets'][0]], sets[var['sets'][1]],
                         sets[var['sets'][2]]]
            values = var['val']
            values2 = np.zeros([len(list_sets[0]) * len(list_sets[1]),
                                len(list_sets[2])])
            cols = np.zeros([2, len(list_sets[0]) * len(list_sets[1])])
            for i in range(len(list_sets[0])):
                values2[i * len(list_sets[1]):(i + 1) * len(list_sets[1]),
                        :] = values[i, :, :]
                cols[0, i * len(list_sets[1]):
                     (i + 1) * len(list_sets[1])] = i
                cols[1, i * len(list_sets[1]):
                     (i + 1) * len(list_sets[1])] = range(len(list_sets[1]))
            columns = pd.MultiIndex(levels=[list_sets[0], list_sets[1]],
                                    labels=cols.astype(int))
            if var['sets'][2] == 'h':
                out[p] = pd.DataFrame(values2.transpose()[idx, :],
                                      index=dates, columns=columns)
            else:
                out[p] = pd.DataFrame(values2.transpose(),
                                      index=list_sets[2], columns=columns)
        else:
            logging.error('Only three dimensions currently supported. '
                          'Parameter ' + p + ' has ' + str(dim) +
                          ' dimensions.')
            sys.exit(1)

    return out
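# A minimal sketch (standalone names, not DispaSET data): for the dim == 3
# branch above, pd.MultiIndex.from_product builds the same two-level column
# index that ds_to_df assembles by hand from the `cols` label matrix.
import numpy as np
import pandas as pd

units = ['u1', 'u2']          # stands in for sets[var['sets'][0]]
zones = ['z1', 'z2', 'z3']    # stands in for sets[var['sets'][1]]

columns = pd.MultiIndex.from_product([units, zones], names=['unit', 'zone'])
df = pd.DataFrame(np.zeros((4, len(columns))), columns=columns)
print(df.columns.codes)  # the integer label matrix, built automatically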
def read_daily_impressions_insights_into_df(self, year, month):
    df_page = pd.DataFrame(
        columns=pd.MultiIndex(levels=[[], []], labels=[[], []]))

    # The number of people who had any content from your Page or about your
    # Page enter their screen. This includes posts, check-ins, ads, social
    # information from people who interact with your Page and more.
    # (Unique Users). Daily
    unique = self.daily_insights_for_month("page_impressions_unique",
                                           year, month)
    end_time, value = [], []
    for item in unique["data"][0]["values"]:
        end_time.append(item["end_time"][:10])
        value.append(item.get("value", 0))
    df_page["impressions_unique", "value"] = pd.Series(
        data=value, index=end_time, name="value")

    # The number of times any content from your Page or about your Page
    # entered a person's screen. This includes posts, check-ins, ads, social
    # information from people who interact with your Page and more.
    # (Total Count). Daily
    impressions = self.daily_insights_for_month("page_impressions",
                                                year, month)
    end_time, value = [], []
    for item in impressions["data"][0]["values"]:
        end_time.append(item["end_time"][:10])
        value.append(item.get("value", 0))
    df_page["impressions", "value"] = pd.Series(
        data=value, index=end_time, name="value")

    # Total number of people who saw a story about your Page by story type.
    # (Unique Users). Daily
    # Total impressions of stories published by a friend about your Page by
    # story type. (Total Count). Daily
    story_keys = {"mention": "mention", "other": "other", "fan": "fan",
                  "page post": "page_post", "checkin": "checkin"}
    for metric, group in (("page_impressions_by_story_type_unique",
                           "impressions_by_story_unique"),
                          ("page_impressions_by_story_type",
                           "impressions_by_story")):
        insights = self.daily_insights_for_month(metric, year, month)
        end_time = []
        series = {col: [] for col in story_keys.values()}
        for item in insights["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            for key, col in story_keys.items():
                series[col].append(item.get("value", {}).get(key, 0))
        for col, data in series.items():
            df_page[group, col] = pd.Series(data=data, index=end_time,
                                            name=col)

    # Total Page Reach by user country. (Unique Users). Daily
    by_country = self.daily_insights_for_month(
        "page_impressions_by_country_unique", year, month)
    end_time = []
    countries = {"GB": [], "US": [], "RU": []}
    for item in by_country["data"]:
        end_time.append(item[0]["values"]["end_time"][:10])
        for code, data in countries.items():
            try:
                data.append(item[0]["values"]["value"][code])
            except (KeyError, IndexError):
                data.append(0)
    for code, data in countries.items():
        df_page["impressions_by_country", code] = pd.Series(
            data=data, index=end_time, name=code)

    # Total Page Reach by age and gender. (Unique Users). Daily
    by_age_gender_unique = self.daily_insights_for_month(
        "page_impressions_by_age_gender_unique", year, month)
    end_time = []
    buckets = {"{}.{}".format(g, a): []
               for g in ("F", "M")
               for a in ("13-17", "18-24", "25-34", "35-44",
                         "45-54", "55-64", "65+")}
    for item in by_age_gender_unique["data"]:
        end_time.append(item[0]["values"]["end_time"][:10])
        for bucket, data in buckets.items():
            try:
                data.append(item[0]["values"]["value"][bucket])
            except (KeyError, IndexError):
                data.append(0)
    for bucket, data in buckets.items():
        df_page["impressions_by_age_gender_unique", bucket] = pd.Series(
            data=data, index=end_time, name=bucket)

    return df_page
def read_daily_demographics_insights_into_df(self, year, month):
    df_page = pd.DataFrame(
        columns=pd.MultiIndex(levels=[[], []], labels=[[], []]))

    # The total number of people who have liked your Page. (Unique Users).
    # Lifetime
    fans = self.daily_insights_for_month("page_fans", year, month)
    end_time, value = [], []
    for item in fans["data"]:
        end_time.append(item[0]["values"]["end_time"][:10])
        try:
            value.append(item[0]["values"]["value"])
        except (KeyError, IndexError):
            value.append(0)
    df_page["fans", "value"] = pd.Series(data=value, index=end_time,
                                         name="value")

    # The number of people who liked your Page, broken down by the most
    # common places where people can like your Page. (Unique Users). Daily
    # The second metric is the same breakdown as a total count. Daily
    source_keys = {
        "News Feed": "news_feed",
        "Other": "other",
        "Page Suggestions": "page_suggestions",
        "Restored Likes from Reactivated Accounts": "restored_likes",
        "Search": "search",
        "Your Page": "your_page",
    }
    for metric, group in (("page_fans_by_like_source_unique",
                           "fans_by_like_source_unique"),
                          ("page_fans_by_like_source",
                           "fans_by_like_source")):
        insights = self.daily_insights_for_month(metric, year, month)
        end_time = []
        series = {col: [] for col in source_keys.values()}
        for item in insights["data"][0]["values"]:
            end_time.append(item["end_time"][:10])
            for key, col in source_keys.items():
                series[col].append(item.get("value", {}).get(key, 0))
        for col, data in series.items():
            df_page[group, col] = pd.Series(data=data, index=end_time,
                                            name=col)

    return df_page
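# A minimal sketch of the pattern both readers rely on: a DataFrame created
# with an empty two-level column MultiIndex lets tuple keys create grouped
# columns on assignment. (`codes=` replaces `labels=` in pandas >= 0.24.)
import pandas as pd

df = pd.DataFrame(columns=pd.MultiIndex(levels=[[], []], codes=[[], []]))
df["fans", "value"] = pd.Series([10, 12], index=["2019-07-01", "2019-07-02"])
df["fans", "delta"] = df["fans", "value"].diff()
print(df["fans"])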
def test_non_cython_api():
    # GH5610
    # non-cython calls should not include the grouper
    df = DataFrame(
        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
        columns=["A", "B", "C"],
    )
    g = df.groupby("A")
    gni = df.groupby("A", as_index=False)

    # mad
    expected = DataFrame([[0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.mad()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame([[1, 0.0], [3, np.nan]], columns=["A", "B"],
                         index=[0, 1])
    result = gni.mad()
    tm.assert_frame_equal(result, expected)

    # describe
    expected_index = Index([1, 3], name="A")
    expected_col = pd.MultiIndex(
        levels=[["B"], ["count", "mean", "std", "min", "25%", "50%",
                        "75%", "max"]],
        codes=[[0] * 8, list(range(8))],
    )
    expected = DataFrame(
        [
            [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
            [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
        ],
        index=expected_index,
        columns=expected_col,
    )
    result = g.describe()
    tm.assert_frame_equal(result, expected)

    expected = pd.concat(
        [
            df[df.A == 1].describe().unstack().to_frame().T,
            df[df.A == 3].describe().unstack().to_frame().T,
        ]
    )
    expected.index = Index([0, 1])
    result = gni.describe()
    tm.assert_frame_equal(result, expected)

    # any
    expected = DataFrame(
        [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
    )
    expected.index.name = "A"
    result = g.any()
    tm.assert_frame_equal(result, expected)

    # idxmax
    expected = DataFrame([[0.0], [np.nan]], columns=["B"], index=[1, 3])
    expected.index.name = "A"
    result = g.idxmax()
    tm.assert_frame_equal(result, expected)
class TestToCSV:
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean("test.csv") as path:
            df1.to_csv(path, header=None, index=None)
            with open(path) as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean("test.csv") as path:
            df2.to_csv(path, header=None, index=None)
            with open(path) as f:
                assert f.read() == expected2

    def test_to_csv_defualt_encoding(self):
        # GH17097
        df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]})

        with tm.ensure_clean("test.csv") as path:
            # the default to_csv encoding is utf-8.
            df.to_csv(path)
            tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)

    def test_to_csv_quotechar(self):
        df = DataFrame({"col": [1, 2]})
        expected = """\
"","col"
"0","1"
"1","2"
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
            with open(path) as f:
                assert f.read() == expected

        expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1, quotechar="$")
            with open(path) as f:
                assert f.read() == expected

        with tm.ensure_clean("test.csv") as path:
            with pytest.raises(TypeError, match="quotechar"):
                df.to_csv(path, quoting=1, quotechar=None)

    def test_to_csv_doublequote(self):
        df = DataFrame({"col": ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
            with open(path) as f:
                assert f.read() == expected

        from _csv import Error

        with tm.ensure_clean("test.csv") as path:
            with pytest.raises(Error, match="escapechar"):
                df.to_csv(path, doublequote=False)  # no escapechar set

    def test_to_csv_escapechar(self):
        df = DataFrame({"col": ['a"a', '"bb"']})
        expected = """\
"","col"
"0","a\\"a"
"1","\\"bb\\""
"""

        with tm.ensure_clean("test.csv") as path:  # QUOTE_ALL
            df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
            with open(path) as f:
                assert f.read() == expected

        df = DataFrame({"col": ["a,a", ",bb,"]})
        expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""

        with tm.ensure_clean("test.csv") as path:
            df.to_csv(path, quoting=3, escapechar="\\")  # QUOTE_NONE
            with open(path) as f:
                assert f.read() == expected

    def test_csv_to_string(self):
        df = DataFrame({"col": [1, 2]})
        expected_rows = [",col", "0,1", "1,2"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected

    def test_to_csv_decimal(self):
        # see gh-781
        df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]})

        expected_rows = [",col1,col2,col3", "0,1,a,10.1"]
        expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected_default

        expected_rows = [";col1;col2;col3", "0;1;a;10,1"]
        expected_european_excel = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(decimal=",", sep=";") == expected_european_excel

        expected_rows = [",col1,col2,col3", "0,1,a,10.10"]
        expected_float_format_default = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(float_format="%.2f") == expected_float_format_default

        expected_rows = [";col1;col2;col3", "0;1;a;10,10"]
        expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
        assert (df.to_csv(decimal=",", sep=";",
                          float_format="%.2f") == expected_float_format)

        # see gh-11553: testing if decimal is taken into account for '0.0'
        df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1})

        expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False, decimal="^") == expected

        # same but for an index
        assert df.set_index("a").to_csv(decimal="^") == expected

        # same for a multi-index
        assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected

    def test_to_csv_float_format(self):
        # testing if float_format is taken into account for the index
        # GH 11553
        df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1})

        expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.set_index("a").to_csv(float_format="%.2f") == expected

        # same for a multi-index
        assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected

    def test_to_csv_na_rep(self):
        # see gh-11553
        #
        # Testing if NaN values are correctly represented in the index.
        df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        # now with an index containing only NaNs
        df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "_,0,2", "_,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        # check if na_rep parameter does not break anything when no NaN
        df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]})
        expected_rows = ["a,b,c", "0,0,2", "0,1,3"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index("a").to_csv(na_rep="_") == expected
        assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected

        csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,a", "1,ZZZZZ", "2,c"])
        assert expected == csv

    def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype):
        # GH 29975
        # Make sure full na_rep shows up when a dtype is provided
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,a", "1,ZZZZZ", "2,c"])
        csv = pd.Series(["a", pd.NA, "c"],
                        dtype=nullable_string_dtype).to_csv(na_rep="ZZZZZ")
        assert expected == csv

    def test_to_csv_date_format(self):
        # GH 10209
        df_sec = DataFrame({"A": pd.date_range("20130101", periods=5,
                                               freq="s")})
        df_day = DataFrame({"A": pd.date_range("20130101", periods=5,
                                               freq="d")})

        expected_rows = [
            ",A",
            "0,2013-01-01 00:00:00",
            "1,2013-01-01 00:00:01",
            "2,2013-01-01 00:00:02",
            "3,2013-01-01 00:00:03",
            "4,2013-01-01 00:00:04",
        ]
        expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv() == expected_default_sec

        expected_rows = [
            ",A",
            "0,2013-01-01 00:00:00",
            "1,2013-01-02 00:00:00",
            "2,2013-01-03 00:00:00",
            "3,2013-01-04 00:00:00",
            "4,2013-01-05 00:00:00",
        ]
        expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv(
            date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day

        expected_rows = [
            ",A",
            "0,2013-01-01",
            "1,2013-01-01",
            "2,2013-01-01",
            "3,2013-01-01",
            "4,2013-01-01",
        ]
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec

        expected_rows = [
            ",A",
            "0,2013-01-01",
            "1,2013-01-02",
            "2,2013-01-03",
            "3,2013-01-04",
            "4,2013-01-05",
        ]
        expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv() == expected_default_day
        assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day

        # see gh-7791
        #
        # Testing if date_format parameter is taken into account
        # for multi-indexed DataFrames.
        df_sec["B"] = 0
        df_sec["C"] = 1

        expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

        df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
        assert df_sec_grouped.mean().to_csv(
            date_format="%Y-%m-%d") == expected_ymd_sec

    def test_to_csv_different_datetime_formats(self):
        # GH#21734
        df = DataFrame({
            "date": pd.to_datetime("1970-01-01"),
            "datetime": pd.date_range("1970-01-01", periods=2, freq="H"),
        })
        expected_rows = [
            "date,datetime",
            "1970-01-01,1970-01-01 00:00:00",
            "1970-01-01,1970-01-01 01:00:00",
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False) == expected

    def test_to_csv_date_format_in_categorical(self):
        # GH#40754
        ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT],
                                       format="%Y-%m-%d"))
        ser = ser.astype("category")
        expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
        assert ser.to_csv(index=False) == expected

        ser = pd.Series(
            pd.date_range(start="2021-03-27", freq="D", periods=1,
                          tz="Europe/Berlin").append(
                              pd.DatetimeIndex([pd.NaT])))
        ser = ser.astype("category")
        assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected

    def test_to_csv_multi_index(self):
        # see gh-6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [",1", ",2", "0,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["1", "2", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame(
            [1],
            columns=pd.MultiIndex.from_arrays([[1], [2]]),
            index=pd.MultiIndex.from_arrays([[1], [2]]),
        )

        exp_rows = [",,1", ",,2", "1,2,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["1", "2", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1],
                       columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]]))

        exp_rows = [",foo", ",bar", "0,1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ["foo", "bar", "1"]
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

    @pytest.mark.parametrize(
        "ind,expected",
        [
            (
                pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
                "x,data\n1.0,1\n",
            ),
            (
                pd.MultiIndex(levels=[[1.0], [2.0]], codes=[[0], [0]],
                              names=["x", "y"]),
                "x,y,data\n1.0,2.0,1\n",
            ),
        ],
    )
    @pytest.mark.parametrize("klass", [DataFrame, pd.Series])
    def test_to_csv_single_level_multi_index(self, ind, expected, klass):
        # see gh-19589
        obj = klass(pd.Series([1], ind, name="data"))

        with tm.assert_produces_warning(FutureWarning,
                                        match="lineterminator"):
            # GH#9568 standardize on lineterminator matching stdlib
            result = obj.to_csv(line_terminator="\n", header=True)

        assert result == expected

    def test_to_csv_string_array_ascii(self):
        # GH 10813
        str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
        df = DataFrame(str_array)
        expected_ascii = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
        with tm.ensure_clean("str_test.csv") as path:
            df.to_csv(path, encoding="ascii")
            with open(path) as f:
                assert f.read() == expected_ascii

    def test_to_csv_string_array_utf8(self):
        # GH 10813
        str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
        df = DataFrame(str_array)
        expected_utf8 = """\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
"""
        with tm.ensure_clean("unicode_test.csv") as path:
            df.to_csv(path, encoding="utf-8")
            with open(path) as f:
                assert f.read() == expected_utf8

    def test_to_csv_string_with_lf(self):
        # GH 20353
        data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]}
        df = DataFrame(data)
        with tm.ensure_clean("lf_test.csv") as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode("utf-8")
            expected_noarg = (b"int,str_lf" + os_linesep +
                              b"1,abc" + os_linesep +
                              b'2,"d\nef"' + os_linesep +
                              b'3,"g\nh\n\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean("lf_test.csv") as path:
            # case 2: LF as line terminator
            expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n'
            df.to_csv(path, lineterminator="\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_lf
        with tm.ensure_clean("lf_test.csv") as path:
            # case 3: CRLF as line terminator
            # 'lineterminator' should not change inner element
            expected_crlf = (b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n'
                             b'3,"g\nh\n\ni"\r\n')
            df.to_csv(path, lineterminator="\r\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_crlf

    def test_to_csv_string_with_crlf(self):
        # GH 20353
        data = {"int": [1, 2, 3],
                "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]}
        df = DataFrame(data)
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode("utf-8")
            expected_noarg = (b"int,str_crlf" + os_linesep +
                              b"1,abc" + os_linesep +
                              b'2,"d\r\nef"' + os_linesep +
                              b'3,"g\r\nh\r\n\r\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 2: LF as line terminator
            expected_lf = (b'int,str_crlf\n1,abc\n2,"d\r\nef"\n'
                           b'3,"g\r\nh\r\n\r\ni"\n')
            df.to_csv(path, lineterminator="\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_lf
        with tm.ensure_clean("crlf_test.csv") as path:
            # case 3: CRLF as line terminator
            # 'lineterminator' should not change inner element
            expected_crlf = (b"int,str_crlf\r\n"
                             b"1,abc\r\n"
                             b'2,"d\r\nef"\r\n'
                             b'3,"g\r\nh\r\n\r\ni"\r\n')
            df.to_csv(path, lineterminator="\r\n", index=False)
            with open(path, "rb") as f:
                assert f.read() == expected_crlf

    def test_to_csv_stdout_file(self, capsys):
        # GH 21561
        df = DataFrame([["foo", "bar"], ["baz", "qux"]],
                       columns=["name_1", "name_2"])
        expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"]
        expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)

        df.to_csv(sys.stdout, encoding="ascii")
        captured = capsys.readouterr()

        assert captured.out == expected_ascii
        assert not sys.stdout.closed

    @pytest.mark.xfail(
        compat.is_platform_windows(),
        reason=("Especially in Windows, file stream should not be passed "
                "to csv writer without newline='' option. "
                "(https://docs.python.org/3.6/library/csv.html#csv.writer)"),
    )
    def test_to_csv_write_to_open_file(self):
        # GH 21696
        df = DataFrame({"a": ["x", "y", "z"]})
        expected = """\
manual header
x
y
z
"""
        with tm.ensure_clean("test.txt") as path:
            with open(path, "w") as f:
                f.write("manual header\n")
                df.to_csv(f, header=None, index=None)
            with open(path) as f:
                assert f.read() == expected

    def test_to_csv_write_to_open_file_with_newline_py3(self):
        # see gh-21696
        # see gh-20353
        df = DataFrame({"a": ["x", "y", "z"]})
        expected_rows = ["x", "y", "z"]
        expected = "manual header\n" + tm.convert_rows_list_to_csv_str(
            expected_rows)

        with tm.ensure_clean("test.txt") as path:
            with open(path, "w", newline="") as f:
                f.write("manual header\n")
                df.to_csv(f, header=None, index=None)

            with open(path, "rb") as f:
                assert f.read() == bytes(expected, "utf-8")

    @pytest.mark.parametrize("to_infer", [True, False])
    @pytest.mark.parametrize("read_infer", [True, False])
    def test_to_csv_compression(self, compression_only, read_infer,
                                to_infer):
        # see gh-15008
        compression = compression_only

        # We'll complete file extension subsequently.
        filename = "test."
        filename += icom._compression_to_extension[compression]

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path, index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)

    def test_to_csv_compression_dict(self, compression_only):
        # GH 26023
        method = compression_only
        df = DataFrame({"ABC": [1]})
        filename = "to_csv_compress_as_dict."
        extension = {
            "gzip": "gz",
            "zstd": "zst",
        }.get(method, method)
        filename += extension
        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression={"method": method})
            read_df = pd.read_csv(path, index_col=0)
            tm.assert_frame_equal(read_df, df)

    def test_to_csv_compression_dict_no_method_raises(self):
        # GH 26023
        df = DataFrame({"ABC": [1]})
        compression = {"some_option": True}
        msg = "must have key 'method'"

        with tm.ensure_clean("out.zip") as path:
            with pytest.raises(ValueError, match=msg):
                df.to_csv(path, compression=compression)

    @pytest.mark.parametrize("compression", ["zip", "infer"])
    @pytest.mark.parametrize("archive_name",
                             ["test_to_csv.csv", "test_to_csv.zip"])
    def test_to_csv_zip_arguments(self, compression, archive_name):
        # GH 26023
        df = DataFrame({"ABC": [1]})
        with tm.ensure_clean("to_csv_archive_name.zip") as path:
            df.to_csv(path, compression={"method": compression,
                                         "archive_name": archive_name})
            with ZipFile(path) as zp:
                assert len(zp.filelist) == 1
                archived_file = zp.filelist[0].filename
                assert archived_file == archive_name

    @pytest.mark.parametrize(
        "filename,expected_arcname",
        [
            ("archive.csv", "archive.csv"),
            ("archive.tsv", "archive.tsv"),
            ("archive.csv.zip", "archive.csv"),
            ("archive.tsv.zip", "archive.tsv"),
            ("archive.zip", "archive"),
        ],
    )
    def test_to_csv_zip_infer_name(self, filename, expected_arcname):
        # GH 39465
        df = DataFrame({"ABC": [1]})
        with tm.ensure_clean_dir() as dir:
            path = Path(dir, filename)
            df.to_csv(path, compression="zip")
            with ZipFile(path) as zp:
                assert len(zp.filelist) == 1
                archived_file = zp.filelist[0].filename
                assert archived_file == expected_arcname

    @pytest.mark.parametrize("df_new_type", ["Int64"])
    def test_to_csv_na_rep_long_string(self, df_new_type):
        # see gh-25099
        df = DataFrame({"c": [float("nan")] * 3})
        df = df.astype(df_new_type)
        expected_rows = ["c", "mynull", "mynull", "mynull"]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")

        assert expected == result

    def test_to_csv_timedelta_precision(self):
        # GH 6783
        s = pd.Series([1, 1]).astype("timedelta64[ns]")
        buf = io.StringIO()
        s.to_csv(buf)
        result = buf.getvalue()
        expected_rows = [
            ",0",
            "0,0 days 00:00:00.000000001",
            "1,0 days 00:00:00.000000001",
        ]
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert result == expected

    def test_na_rep_truncated(self):
        # https://github.com/pandas-dev/pandas/issues/31447
        result = pd.Series(range(8, 12)).to_csv(na_rep="-")
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,8", "1,9", "2,10", "3,11"])
        assert result == expected

        result = pd.Series([True, False]).to_csv(na_rep="nan")
        expected = tm.convert_rows_list_to_csv_str(
            [",0", "0,True", "1,False"])
        assert result == expected

        result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
        expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
        assert result == expected

    @pytest.mark.parametrize("errors", ["surrogatepass", "ignore",
                                        "replace"])
    def test_to_csv_errors(self, errors):
        # GH 22610
        data = ["\ud800foo"]
        ser = pd.Series(data, index=pd.Index(data))
        with tm.ensure_clean("test.csv") as path:
            ser.to_csv(path, errors=errors)
        # No use in reading back the data as it is not the same anymore
        # due to the error handling

    @pytest.mark.parametrize("mode", ["wb", "w"])
    def test_to_csv_binary_handle(self, mode):
        """
        Binary file objects should work (if 'mode' contains a 'b') or
        even without it in most cases.

        GH 35058 and GH 19827
        """
        df = tm.makeDataFrame()
        with tm.ensure_clean() as path:
            with open(path, mode="w+b") as handle:
                df.to_csv(handle, mode=mode)
            tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))

    @pytest.mark.parametrize("mode", ["wb", "w"])
    def test_to_csv_encoding_binary_handle(self, mode):
        """
        Binary file objects should honor a specified encoding.

        GH 23854 and GH 13068 with binary handles
        """
        # example from GH 23854
        content = "a, b, 🐟".encode("utf-8-sig")
        buffer = io.BytesIO(content)
        df = pd.read_csv(buffer, encoding="utf-8-sig")

        buffer = io.BytesIO()
        df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False)
        buffer.seek(0)  # tests whether file handle wasn't closed
        assert buffer.getvalue().startswith(content)

        # example from GH 13068
        with tm.ensure_clean() as path:
            with open(path, "w+b") as handle:
                DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig")

                handle.seek(0)
                assert handle.read().startswith(b'\xef\xbb\xbf""')
class TestToCSV(object):

    @pytest.mark.xfail((3, 6, 5) > sys.version_info >= (3, 5),
                       reason=("Python csv library bug "
                               "(see https://bugs.python.org/issue32255)"))
    def test_to_csv_with_single_column(self):
        # see gh-18676, https://bugs.python.org/issue32255
        #
        # Python's CSV library adds an extraneous '""'
        # before the newline when the NaN-value is in
        # the first row. Otherwise, only the newline
        # character is added. This behavior is inconsistent
        # and was patched in https://bugs.python.org/pull_request4672.
        df1 = DataFrame([None, 1])
        expected1 = """\
""
1.0
"""
        with tm.ensure_clean('test.csv') as path:
            df1.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected1

        df2 = DataFrame([1, None])
        expected2 = """\
1.0
""
"""
        with tm.ensure_clean('test.csv') as path:
            df2.to_csv(path, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected2

    def test_to_csv_defualt_encoding(self):
        # GH17097
        df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})

        with tm.ensure_clean('test.csv') as path:
            # the default to_csv encoding in Python 2 is ascii, and that in
            # Python 3 is utf-8.
            if pd.compat.PY2:
                # the encoding argument parameter should be utf-8
                with pytest.raises(UnicodeEncodeError, match='ascii'):
                    df.to_csv(path)
            else:
                df.to_csv(path)
                tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)

    def test_to_csv_quotechar(self):
        df = DataFrame({'col': [1, 2]})
        expected = """\
"","col"
"0","1"
"1","2"
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1)  # 1=QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        expected = """\
$$,$col$
$0$,$1$
$1$,$2$
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, quotechar="$")
            with open(path, 'r') as f:
                assert f.read() == expected

        with tm.ensure_clean('test.csv') as path:
            with pytest.raises(TypeError, match='quotechar'):
                df.to_csv(path, quoting=1, quotechar=None)

    def test_to_csv_doublequote(self):
        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a""a"
"1","""bb"""
'''

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=1, doublequote=True)  # QUOTE_ALL
            with open(path, 'r') as f:
                assert f.read() == expected

        from _csv import Error
        with tm.ensure_clean('test.csv') as path:
            with pytest.raises(Error, match='escapechar'):
                df.to_csv(path, doublequote=False)  # no escapechar set

    def test_to_csv_escapechar(self):
        df = DataFrame({'col': ['a"a', '"bb"']})
        expected = '''\
"","col"
"0","a\\"a"
"1","\\"bb\\""
'''

        with tm.ensure_clean('test.csv') as path:  # QUOTE_ALL
            df.to_csv(path, quoting=1, doublequote=False, escapechar='\\')
            with open(path, 'r') as f:
                assert f.read() == expected

        df = DataFrame({'col': ['a,a', ',bb,']})
        expected = """\
,col
0,a\\,a
1,\\,bb\\,
"""

        with tm.ensure_clean('test.csv') as path:
            df.to_csv(path, quoting=3, escapechar='\\')  # QUOTE_NONE
            with open(path, 'r') as f:
                assert f.read() == expected

    def test_csv_to_string(self):
        df = DataFrame({'col': [1, 2]})
        expected_rows = [',col', '0,1', '1,2']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected

    def test_to_csv_decimal(self):
        # see gh-781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_rows = [',col1,col2,col3', '0,1,a,10.1']
        expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv() == expected_default

        expected_rows = [';col1;col2;col3', '0;1;a;10,1']
        expected_european_excel = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_rows = [',col1,col2,col3', '0,1,a,10.10']
        expected_float_format_default = tm.convert_rows_list_to_csv_str(
            expected_rows)
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_rows = [';col1;col2;col3', '0;1;a;10,10']
        expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # see gh-11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
        expected_rows = ['a,b,c', '0^0,2^2,1', '1^1,3^3,1']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected

    def test_to_csv_float_format(self):
        # testing if float_format is taken into account for the index
        # GH 11553
        df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
        expected_rows = ['a,b,c', '0,2.20,1', '1,3.30,1']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df.set_index('a').to_csv(float_format='%.2f') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(
            float_format='%.2f') == expected

    def test_to_csv_na_rep(self):
        # see gh-11553
        #
        # Testing if NaN values are correctly represented in the index.
        df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c', '0.0,0,2', '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # now with an index containing only NaNs
        df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c', '_,0,2', '_,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

        # check if na_rep parameter does not break anything when no NaN
        df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
        expected_rows = ['a,b,c', '0,0,2', '0,1,3']
        expected = tm.convert_rows_list_to_csv_str(expected_rows)

        assert df.set_index('a').to_csv(na_rep='_') == expected
        assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected

    def test_to_csv_date_format(self):
        # GH 10209
        df_sec = DataFrame({'A': pd.date_range('20130101', periods=5,
                                               freq='s')})
        df_day = DataFrame({'A': pd.date_range('20130101', periods=5,
                                               freq='d')})

        expected_rows = [',A', '0,2013-01-01 00:00:00',
                         '1,2013-01-01 00:00:01', '2,2013-01-01 00:00:02',
                         '3,2013-01-01 00:00:03', '4,2013-01-01 00:00:04']
        expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv() == expected_default_sec

        expected_rows = [',A', '0,2013-01-01 00:00:00',
                         '1,2013-01-02 00:00:00', '2,2013-01-03 00:00:00',
                         '3,2013-01-04 00:00:00', '4,2013-01-05 00:00:00']
        expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert (df_day.to_csv(
            date_format='%Y-%m-%d %H:%M:%S') == expected_ymdhms_day)

        expected_rows = [',A', '0,2013-01-01', '1,2013-01-01',
                         '2,2013-01-01', '3,2013-01-01', '4,2013-01-01']
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec

        expected_rows = [',A', '0,2013-01-01', '1,2013-01-02',
                         '2,2013-01-03', '3,2013-01-04', '4,2013-01-05']
        expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
        assert df_day.to_csv() == expected_default_day
        assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day

        # see gh-7791
        #
        # Testing if date_format parameter is taken into account
        # for multi-indexed DataFrames.
        df_sec['B'] = 0
        df_sec['C'] = 1

        expected_rows = ['A,B,C', '2013-01-01,0,1']
        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)

        df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'),
                                         'B'])
        assert (df_sec_grouped.mean().to_csv(
            date_format='%Y-%m-%d') == expected_ymd_sec)

    def test_to_csv_multi_index(self):
        # see gh-6618
        df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',1', ',2', '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1],
                       columns=pd.MultiIndex.from_arrays([[1], [2]]),
                       index=pd.MultiIndex.from_arrays([[1], [2]]))

        exp_rows = [',,1', ',,2', '1,2,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['1', '2', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

        df = DataFrame([1],
                       columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))

        exp_rows = [',foo', ',bar', '0,1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv() == exp

        exp_rows = ['foo', 'bar', '1']
        exp = tm.convert_rows_list_to_csv_str(exp_rows)
        assert df.to_csv(index=False) == exp

    @pytest.mark.parametrize("ind,expected", [
        (pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
         "x,data\n1.0,1\n"),
        (pd.MultiIndex(levels=[[1.], [2.]], codes=[[0], [0]],
                       names=["x", "y"]),
         "x,y,data\n1.0,2.0,1\n")
    ])
    @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series])
    def test_to_csv_single_level_multi_index(self, ind, expected, klass):
        # see gh-19589
        result = klass(pd.Series([1], ind, name="data")).to_csv(
            line_terminator="\n", header=True)
        assert result == expected

    def test_to_csv_string_array_ascii(self):
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_ascii = '''\
,names
0,"['foo', 'bar']"
1,"['baz', 'qux']"
'''
        with tm.ensure_clean('str_test.csv') as path:
            df.to_csv(path, encoding='ascii')
            with open(path, 'r') as f:
                assert f.read() == expected_ascii

    @pytest.mark.xfail
    def test_to_csv_string_array_utf8(self):
        # GH 10813
        str_array = [{'names': ['foo', 'bar']}, {'names': ['baz', 'qux']}]
        df = pd.DataFrame(str_array)
        expected_utf8 = '''\
,names
0,"[u'foo', u'bar']"
1,"[u'baz', u'qux']"
'''
        with tm.ensure_clean('unicode_test.csv') as path:
            df.to_csv(path, encoding='utf-8')
            with open(path, 'r') as f:
                assert f.read() == expected_utf8

    def test_to_csv_string_with_lf(self):
        # GH 20353
        data = {'int': [1, 2, 3], 'str_lf': ['abc', 'd\nef', 'g\nh\n\ni']}
        df = pd.DataFrame(data)
        with tm.ensure_clean('lf_test.csv') as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode('utf-8')
            expected_noarg = (b'int,str_lf' + os_linesep +
                              b'1,abc' + os_linesep +
                              b'2,"d\nef"' + os_linesep +
                              b'3,"g\nh\n\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean('lf_test.csv') as path:
            # case 2: LF as line terminator
            expected_lf = (b'int,str_lf\n'
                           b'1,abc\n'
                           b'2,"d\nef"\n'
                           b'3,"g\nh\n\ni"\n')
            df.to_csv(path, line_terminator='\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_lf
        with tm.ensure_clean('lf_test.csv') as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = (b'int,str_lf\r\n'
                             b'1,abc\r\n'
                             b'2,"d\nef"\r\n'
                             b'3,"g\nh\n\ni"\r\n')
            df.to_csv(path, line_terminator='\r\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_crlf

    def test_to_csv_string_with_crlf(self):
        # GH 20353
        data = {'int': [1, 2, 3],
                'str_crlf': ['abc', 'd\r\nef', 'g\r\nh\r\n\r\ni']}
        df = pd.DataFrame(data)
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 1: The default line terminator(=os.linesep)(PR 21406)
            os_linesep = os.linesep.encode('utf-8')
            expected_noarg = (b'int,str_crlf' + os_linesep +
                              b'1,abc' + os_linesep +
                              b'2,"d\r\nef"' + os_linesep +
                              b'3,"g\r\nh\r\n\r\ni"' + os_linesep)
            df.to_csv(path, index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_noarg
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 2: LF as line terminator
            expected_lf = (b'int,str_crlf\n'
                           b'1,abc\n'
                           b'2,"d\r\nef"\n'
                           b'3,"g\r\nh\r\n\r\ni"\n')
            df.to_csv(path, line_terminator='\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_lf
        with tm.ensure_clean('crlf_test.csv') as path:
            # case 3: CRLF as line terminator
            # 'line_terminator' should not change inner element
            expected_crlf = (b'int,str_crlf\r\n'
                             b'1,abc\r\n'
                             b'2,"d\r\nef"\r\n'
                             b'3,"g\r\nh\r\n\r\ni"\r\n')
            df.to_csv(path, line_terminator='\r\n', index=False)
            with open(path, 'rb') as f:
                assert f.read() == expected_crlf

    def test_to_csv_stdout_file(self, capsys):
        # GH 21561
        df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']],
                          columns=['name_1', 'name_2'])
        expected_rows = [',name_1,name_2', '0,foo,bar', '1,baz,qux']
        expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)

        df.to_csv(sys.stdout, encoding='ascii')
        captured = capsys.readouterr()

        assert captured.out == expected_ascii
        assert not sys.stdout.closed

    @pytest.mark.xfail(
        compat.is_platform_windows(),
        reason=("Especially in Windows, file stream should not be passed "
                "to csv writer without newline='' option. "
                "(https://docs.python.org/3.6/library/csv.html#csv.writer)"))
    def test_to_csv_write_to_open_file(self):
        # GH 21696
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected = '''\
manual header
x
y
z
'''
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'w') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)
            with open(path, 'r') as f:
                assert f.read() == expected

    @pytest.mark.skipif(compat.PY2, reason="Test case for python3")
    def test_to_csv_write_to_open_file_with_newline_py3(self):
        # see gh-21696
        # see gh-20353
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected_rows = ["x", "y", "z"]
        expected = ("manual header\n" +
                    tm.convert_rows_list_to_csv_str(expected_rows))
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'w', newline='') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)

            with open(path, 'rb') as f:
                assert f.read() == bytes(expected, 'utf-8')

    @pytest.mark.skipif(compat.PY3, reason="Test case for python2")
    def test_to_csv_write_to_open_file_with_newline_py2(self):
        # see gh-21696
        # see gh-20353
        df = pd.DataFrame({'a': ['x', 'y', 'z']})
        expected_rows = ["x", "y", "z"]
        expected = ("manual header\n" +
                    tm.convert_rows_list_to_csv_str(expected_rows))
        with tm.ensure_clean('test.txt') as path:
            with open(path, 'wb') as f:
                f.write('manual header\n')
                df.to_csv(f, header=None, index=None)

            with open(path, 'rb') as f:
                assert f.read() == expected

    @pytest.mark.parametrize("to_infer", [True, False])
    @pytest.mark.parametrize("read_infer", [True, False])
    def test_to_csv_compression(self, compression_only, read_infer,
                                to_infer):
        # see gh-15008
        compression = compression_only

        if compression == "zip":
            pytest.skip("{compression} is not supported "
                        "for to_csv".format(compression=compression))

        # We'll complete file extension subsequently.
        filename = "test."

        if compression == "gzip":
            filename += "gz"
        else:
            # xz --> .xz
            # bz2 --> .bz2
            filename += compression

        df = DataFrame({"A": [1]})

        to_compression = "infer" if to_infer else compression
        read_compression = "infer" if read_infer else compression

        with tm.ensure_clean(filename) as path:
            df.to_csv(path, compression=to_compression)
            result = pd.read_csv(path, index_col=0,
                                 compression=read_compression)
            tm.assert_frame_equal(result, df)
def table_model_regions(path, year=2014): table = pd.DataFrame( columns=pd.MultiIndex(levels=[[], []], codes=[[], []])) # table = pd.read_excel( # os.path.join(path, 'kennzahlen_modellregionen' + '.xlsx'), # index_col=[0], header=[0, 1]) # inhabitants ew = inhabitants.get_ew_by_federal_states(year) ew_bln = friedrichshagen.calculate_inhabitants_districts(year)['EW'].sum() fhg_ew = friedrichshagen.calculate_inhabitants_friedrichshagen(year) ew_de01 = deflex.inhabitants.get_ew_by_deflex(2014, rmap='de21')['DE01'] # electricity_demand fhg_elec = friedrichshagen.calculate_elec_demand_friedrichshagen( year).sum() bln_share = deflex.demand.openego_demand_share()['DE22'] de01_share = deflex.demand.openego_demand_share()[['DE22', 'DE01']].sum() bln_usage = berlin_hp.electricity.get_electricity_demand(year).sum()[ 'usage'] de_demand = bmwi.get_annual_electricity_demand_bmwi(2014) * 1000 # heat demand bln_heat = berlin_hp.heat.create_heat_profiles(2014).sum().sum() / 1000 fhg_heat = berlin_hp.heat.create_heat_profiles( 2014, region=90517).sum().sum() / 1000 heat_states = deflex.demand.get_heat_profiles_by_state(2014).groupby( level=0, axis=1).sum().sum().div(3.6) de01_heat = deflex.demand.get_heat_profiles_deflex( 2014, separate_regions=['DE01'])['DE01'].sum().sum() / 1000 sec = 'Bevölkerung' table.loc[sec, ('Berlin (deflex)', 'absolut')] = int(ew['BE']) table.loc[sec, ('Berlin', 'absolut')] = ew_bln table.loc[sec, ('Modellregion', 'absolut')] = int(fhg_ew) table.loc[sec, ('Deutschland', 'absolut')] = int(ew.sum()) table.loc[sec, ('DE01 (de21)', 'absolut')] = int(ew_de01) sec = 'Strombedarf [GWh]' table.loc[sec, ('Berlin (deflex)', 'absolut')] = int(bln_share * de_demand) table.loc[sec, ('Berlin', 'absolut')] = bln_usage table.loc[sec, ('Modellregion', 'absolut')] = int(fhg_elec.sum()) table.loc[sec, ('Deutschland', 'absolut')] = int(de_demand) table.loc[sec, ('DE01 (de21)', 'absolut')] = int(de01_share * de_demand) sec = 'Wärmebedarf [GWh]' table.loc[sec, ('Berlin (deflex)', 'absolut')] = int(heat_states['BE']) table.loc[sec, ('Berlin', 'absolut')] = int(bln_heat) table.loc[sec, ('Modellregion', 'absolut')] = int(fhg_heat) table.loc[sec, ('Deutschland', 'absolut')] = int(heat_states.sum()) table.loc[sec, ('DE01 (de21)', 'absolut')] = int(de01_heat) for c in table.columns.get_level_values(0).unique(): table[c, '%'] = round(table[c, 'absolut'].div( table['Deutschland', 'absolut']).multiply(100), 2) table = table[['Modellregion', 'Berlin', 'Berlin (deflex)', 'DE01 (de21)', 'Deutschland']] print(table) table.to_csv(os.path.join(path, 'kennzahlen_modellregionen' + '.csv'))
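# --- Editor's sketch (not part of the original script) ---
# The closing loop above is the core trick: one '%' column per top-level
# column group, computed against the national total. A minimal reproduction
# with hypothetical numbers (column names follow the snippet; values are
# made up):
import pandas as pd

cols = pd.MultiIndex.from_product([["Modellregion", "Deutschland"],
                                   ["absolut"]])
toy = pd.DataFrame([[50, 1000]], index=["Strombedarf [GWh]"], columns=cols)

for c in toy.columns.get_level_values(0).unique():
    toy[c, "%"] = round(
        toy[c, "absolut"].div(toy["Deutschland", "absolut"]).multiply(100), 2)

print(toy)  # Modellregion -> 5.0 %, Deutschland -> 100.0 %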
def delete_multiindex_for_missing_conditions(missing_combos, data_index):
    # This could be replaced with something like:
    #   data_index_df = data_index.to_frame()
    #   data_index_df.drop(('WT', '0.5x', 'TR1'), inplace=True)
    #   data_index_df.drop(('WT', '1x', 'TR2'), inplace=True)
    #   data_index_adjusted = pd.MultiIndex.from_frame(data_index_df)
    #
    # Gives a new MultiIndex from a full MultiIndex with specified missing
    # conditions. `missing_combos` is a list of tuples of the form:
    #   (level_1, item missing from level_1, level_2, item missing from
    #    level_2, level_3, item missing from level_3, ...)
    # For instance, if strain ABC is missing technical replicate 3, you
    # would put:
    #   ("strain", "ABC", "tech_rep", "TR3")
    inds_to_remove = []
    for missing_combo in missing_combos:
        layers_involved = [
            missing_combo[jj]
            for jj in 2 * np.array(range(int(len(missing_combo) / 2)))
        ]
        layers_involved_inds = []
        for layer in layers_involved:
            layers_involved_inds.append([
                i for i, name in enumerate(data_index.names) if name == layer
            ][0])
        missing_layers = [
            missing_combo[jj]
            for jj in 2 * np.array(range(int(len(missing_combo) / 2))) + 1
        ]
        missing_layers_inds = []
        for jj, layer in enumerate(missing_layers):
            missing_layers_inds.append([
                i for i, name in enumerate(
                    data_index.levels[layers_involved_inds[jj]])
                if name == layer
            ][0])
        combined_labels = zip(
            *[data_index.labels[jj] for jj in layers_involved_inds])
        test_label = tuple(missing_layers_inds)
        inds_to_remove_for_missing_combo = []
        for ii, label in enumerate(combined_labels):
            if label == test_label:
                inds_to_remove_for_missing_combo.append(ii)
        inds_to_remove.append(inds_to_remove_for_missing_combo)

    # Flatten the list of indices and remove duplicates.
    inds_to_remove = list(set(chain.from_iterable(inds_to_remove)))
    new_labels = [
        np.delete(label_level, inds_to_remove)
        for label_level in data_index.labels
    ]
    data_index_adjusted = pd.MultiIndex(levels=data_index.levels,
                                        labels=new_labels,
                                        names=data_index.names)
    return data_index_adjusted, inds_to_remove
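# --- Editor's sketch (not part of the original module) ---
# The comment block above sketches a simpler route; here is a runnable
# version of that idea, assuming pandas >= 0.24 (MultiIndex.from_frame) and
# toy level names matching the docstring example:
import pandas as pd

full_idx = pd.MultiIndex.from_product(
    [["WT", "MUT"], ["0.5x", "1x"], ["TR1", "TR2"]],
    names=["strain", "dose", "tech_rep"])

# Drop the missing combinations by label, then rebuild the index.
idx_frame = full_idx.to_frame(index=True)
idx_frame = idx_frame.drop([("WT", "0.5x", "TR1"), ("WT", "1x", "TR2")])
idx_adjusted = pd.MultiIndex.from_frame(idx_frame)

assert len(idx_adjusted) == len(full_idx) - 2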
def add(self, data, header, row=None, subheader=None): """Filter `data` by arguments of this SummaryStats instance, then apply `pd.describe()` and format the statistics Parameters ---------- data : pd.DataFrame or pd.Series data for which summary statistics should be computed header : str column name for descriptive statistics row : str row name for descriptive statistics (required if `pyam.Statistics(rows=True)`) subheader : str, optional column name (level=1) if data is a unnamed `pd.Series` """ # verify validity of specifications if self.rows is not None and row is None: raise ValueError('row specification required') if self.rows is None and row is not None: raise ValueError('row arg illegal for this `Statistics` instance') if isinstance(data, pd.Series): if subheader is not None: data.name = subheader elif data.name is None: msg = '`data` must be named `pd.Series` or provide `subheader`' raise ValueError(msg) data = pd.DataFrame(data) if self.rows is not None and row not in self.rows: self.rows.append(row) _stats = None # describe with groupby feature if self.groupby is not None: filter_args = dict(data=data, df=self.df, join_meta=True) filter_args.update(self.groupby) _stats = (filter_by_meta(**filter_args).groupby( self.col).describe(percentiles=self.percentiles)) _stats = pd.concat([_stats], keys=[self.col], names=[''], axis=0) if self.rows: _stats['row'] = row _stats.set_index('row', append=True, inplace=True) _stats.index.names = [''] * 3 if self.rows else [''] * 2 # describe with filter feature for (idx, _filter) in self.filters: filter_args = dict(data=data, df=self.df) filter_args.update(_filter) _stats_f = (filter_by_meta(**filter_args).describe( percentiles=self.percentiles)) _stats_f = pd.DataFrame(_stats_f.unstack()).T if self.idx_depth == 1: levels = [[idx]] else: levels = [[idx[0]], [idx[1]]] lvls, lbls = (levels, [[0]] * self.idx_depth) if not self.rows \ else (levels + [[row]], [[0]] * (self.idx_depth + 1)) _stats_f.index = pd.MultiIndex(levels=lvls, labels=lbls) _stats = _stats_f if _stats is None else _stats.append(_stats_f) # add header _stats = pd.concat([_stats], keys=[header], names=[''], axis=1) subheader = _stats.columns.get_level_values(1).unique() self._add_to_header(header, subheader) # set statistics if self.stats is None: self.stats = _stats else: self.stats = _stats.combine_first(self.stats)
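# --- Editor's sketch (not part of the original class) ---
# The filter branch above pins a one-row frame under an (idx[, row])
# MultiIndex via explicit levels and positions (the snippet's `labels=` is
# the pre-0.24 spelling of `codes=`). A runnable miniature of that indexing
# step, with illustrative names:
import pandas as pd

one_row = pd.DataFrame([[1.0, 2.0]], columns=["mean", "std"])
one_row.index = pd.MultiIndex(levels=[["baseline"], ["row_a"]],
                              codes=[[0], [0]])
print(one_row)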
def read_daily_engagement_insights_into_df(self, year, month): df_page = pd.DataFrame( columns=pd.MultiIndex(levels=[[], []], labels=[[], []])) # The number of people who engaged with your Page. Engagement includes any click or story created. # (Unique Users). Daily engaged_users = self.daily_insights_for_month("page_engaged_users", year, month) end_time, value = list(), list() for item in engaged_users["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["engaged_users", "value"] = pd.Series(data=value, index=end_time, name="value") # The number of of people who clicked on any of your content, by type. Stories that are created without clicking # on Page content (ex, liking the Page from timeline) are not included. (Unique Users). Daily by_consumption_type_unique = self.daily_insights_for_month( "page_consumptions_by_consumption_type_unique", year, month) end_time, video_play, other_clicks, photo_view, link_clicks = ( list(), list(), list(), list(), list(), ) for item in by_consumption_type_unique["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: video_play.append(item["value"]["video play"]) except KeyError: video_play.append(0) try: other_clicks.append(item["value"]["other clicks"]) except KeyError: other_clicks.append(0) try: photo_view.append(item["value"]["photo view"]) except KeyError: photo_view.append(0) try: link_clicks.append(item["value"]["link clicks"]) except KeyError: link_clicks.append(0) df_page["consumptions_by_type_unique", "video_play"] = pd.Series(data=video_play, index=end_time, name="video_play") df_page["consumptions_by_type_unique", "other_clicks"] = pd.Series(data=other_clicks, index=end_time, name="other_clicks") df_page["consumptions_by_type_unique", "photo_view"] = pd.Series(data=photo_view, index=end_time, name="photo_view") df_page["consumptions_by_type_unique", "link_clicks"] = pd.Series(data=link_clicks, index=end_time, name="link_clicks") # The number of clicks on any of your content, by type. Stories generated without clicks on page content # (e.g., liking the page in Timeline) are not included. (Total Count). Daily by_consumption_type = self.daily_insights_for_month( "page_consumptions_by_consumption_type", year, month) end_time, video_play, other_clicks, photo_view, link_clicks = ( list(), list(), list(), list(), list(), ) for item in by_consumption_type["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: video_play.append(item["value"]["video play"]) except KeyError: video_play.append(0) try: other_clicks.append(item["value"]["other clicks"]) except KeyError: other_clicks.append(0) try: photo_view.append(item["value"]["photo view"]) except KeyError: photo_view.append(0) try: link_clicks.append(item["value"]["link clicks"]) except KeyError: link_clicks.append(0) df_page["consumptions_by_type", "video_play"] = pd.Series(data=video_play, index=end_time, name="video_play") df_page["consumptions_by_type", "other_clicks"] = pd.Series(data=other_clicks, index=end_time, name="other_clicks") df_page["consumptions_by_type", "photo_view"] = pd.Series(data=photo_view, index=end_time, name="photo_view") df_page["consumptions_by_type", "link_clicks"] = pd.Series(data=link_clicks, index=end_time, name="link_clicks") # Total check-ins at your Place (Unique Users). 
Daily places_checkin = self.daily_insights_for_month( "page_places_checkin_total_unique", year, month) end_time, value = list(), list() for item in places_checkin["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["places_checkin", "value"] = pd.Series(data=value, index=end_time, name="value") # The number of people who have given negative feedback to your Page, by type. (Unique Users). Daily negative_feedback_by_type_unique = self.daily_insights_for_month( "page_negative_feedback_by_type_unique", year, month) end_time, hide_all_clicks, hide_clicks, unlike_page_clicks = ( list(), list(), list(), list(), ) report_spam_clicks = list() for item in negative_feedback_by_type_unique["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: hide_all_clicks.append(item["value"]["hide_all_clicks"]) except KeyError: hide_all_clicks.append(0) try: hide_clicks.append(item["value"]["hide_clicks"]) except KeyError: hide_clicks.append(0) try: unlike_page_clicks.append(item["value"]["unlike_page_clicks"]) except KeyError: unlike_page_clicks.append(0) try: report_spam_clicks.append(item["value"]["report_spam_clicks"]) except KeyError: report_spam_clicks.append(0) df_page["negative_feedback_by_type_unique", "hide_all_clicks"] = pd.Series(data=hide_all_clicks, index=end_time, name="hide_all_clicks") df_page["negative_feedback_by_type_unique", "hide_clicks"] = pd.Series(data=hide_clicks, index=end_time, name="hide_clicks") df_page["negative_feedback_by_type_unique", "unlike_page_clicks"] = pd.Series(data=unlike_page_clicks, index=end_time, name="unlike_page_clicks") df_page["negative_feedback_by_type_unique", "report_spam_clicks"] = pd.Series(data=report_spam_clicks, index=end_time, name="report_spam_clicks") # The number of times people have given negative feedback to your Page, by type. (Total Count). Daily negative_feedback_by_type = self.daily_insights_for_month( "page_negative_feedback_by_type", year, month) end_time, hide_all_clicks, hide_clicks, unlike_page_clicks = ( list(), list(), list(), list(), ) report_spam_clicks = list() for item in negative_feedback_by_type["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: hide_all_clicks.append(item["value"]["hide_all_clicks"]) except KeyError: hide_all_clicks.append(0) try: hide_clicks.append(item["value"]["hide_clicks"]) except KeyError: hide_clicks.append(0) try: unlike_page_clicks.append(item["value"]["unlike_page_clicks"]) except KeyError: unlike_page_clicks.append(0) try: report_spam_clicks.append(item["value"]["report_spam_clicks"]) except KeyError: report_spam_clicks.append(0) df_page["negative_feedback_by_type", "hide_all_clicks"] = pd.Series(data=hide_all_clicks, index=end_time, name="hide_all_clicks") df_page["negative_feedback_by_type", "hide_clicks"] = pd.Series(data=hide_clicks, index=end_time, name="hide_clicks") df_page["negative_feedback_by_type", "unlike_page_clicks"] = pd.Series(data=unlike_page_clicks, index=end_time, name="unlike_page_clicks") df_page["negative_feedback_by_type", "report_spam_clicks"] = pd.Series(data=report_spam_clicks, index=end_time, name="report_spam_clicks") # The number of times people have given positive feedback to your Page, by type. (Unique Users). 
Daily positive_feedback_by_type_unique = self.daily_insights_for_month( "page_positive_feedback_by_type_unique", year, month) end_time, link, like, comment, other = list(), list(), list(), list( ), list() for item in positive_feedback_by_type_unique["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: link.append(item["value"]["link"]) except KeyError: link.append(0) try: like.append(item["value"]["like"]) except KeyError: like.append(0) try: comment.append(item["value"]["comment"]) except KeyError: comment.append(0) try: other.append(item["value"]["other"]) except KeyError: other.append(0) df_page["positive_feedback_by_type_unique", "link"] = pd.Series(data=link, index=end_time, name="link") df_page["positive_feedback_by_type_unique", "like"] = pd.Series(data=like, index=end_time, name="like") df_page["positive_feedback_by_type_unique", "comment"] = pd.Series(data=comment, index=end_time, name="comment") df_page["positive_feedback_by_type_unique", "other"] = pd.Series(data=other, index=end_time, name="other") # The number of times people have given positive feedback to your Page, by type. (Total Count). Daily positive_feedback_by_type = self.daily_insights_for_month( "page_positive_feedback_by_type", year, month) end_time, link, like, comment, other = list(), list(), list(), list( ), list() for item in positive_feedback_by_type["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: link.append(item["value"]["link"]) except KeyError: link.append(0) try: like.append(item["value"]["like"]) except KeyError: like.append(0) try: comment.append(item["value"]["comment"]) except KeyError: comment.append(0) try: other.append(item["value"]["other"]) except KeyError: other.append(0) df_page["positive_feedback_by_type", "link"] = pd.Series(data=link, index=end_time, name="link") df_page["positive_feedback_by_type", "like"] = pd.Series(data=like, index=end_time, name="like") df_page["positive_feedback_by_type", "comment"] = pd.Series(data=comment, index=end_time, name="comment") df_page["positive_feedback_by_type", "other"] = pd.Series(data=other, index=end_time, name="other") return df_page
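# --- Editor's sketch (not part of the original class) ---
# Both insight readers repeat the same try/except-KeyError block for every
# metric key. A small helper (a sketch, not part of the original API) would
# collapse each loop; `items` mirrors the `["data"][0]["values"]` payload:
import pandas as pd

def series_from_insights(items, key=None, name="value"):
    """Date-indexed Series of insight values, defaulting missing keys to 0."""
    end_time, values = [], []
    for item in items:
        end_time.append(item["end_time"][:10])
        try:
            values.append(item["value"] if key is None else item["value"][key])
        except KeyError:
            values.append(0)
    return pd.Series(values, index=end_time, name=name)

# e.g.:
# df_page["consumptions_by_type", "video_play"] = series_from_insights(
#     by_consumption_type["data"][0]["values"], key="video play",
#     name="video_play")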
import pandas as pd
import numpy as np

# multiindex rows
frame1 = pd.DataFrame(data=np.random.randint(0, high=10, size=(4, 2)),
                      columns=['a', 'b'],
                      index=pd.MultiIndex(levels=[['s', 'd'], [2, 3]],
                                          codes=[[0, 0, 1, 1], [0, 1, 0, 1]]))
print(frame1)  # line 6

# multiindex columns
frame2 = pd.DataFrame(np.random.random((4, 4)))
frame2.columns = pd.MultiIndex.from_product([[1, 2], [1, 'B']])
print(frame2)  # line 11
def read_daily_reactions_insights_into_df(self, year, month): df_page = pd.DataFrame( columns=pd.MultiIndex(levels=[[], []], labels=[[], []])) # Total post like reactions of a page. Daily reactions_like_total = self.daily_insights_for_month( "page_actions_post_reactions_like_total", year, month) end_time, value = list(), list() for item in reactions_like_total["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["reactions_like", "value"] = pd.Series(data=value, index=end_time, name="value") # Total post love reactions of a page. Daily reactions_love_total = self.daily_insights_for_month( "page_actions_post_reactions_love_total", year, month) end_time, value = list(), list() for item in reactions_love_total["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["reactions_love", "value"] = pd.Series(data=value, index=end_time, name="value") # Total post wow reactions of a page. Daily reactions_wow_total = self.daily_insights_for_month( "page_actions_post_reactions_wow_total", year, month) end_time, value = list(), list() for item in reactions_wow_total["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["reactions_wow", "value"] = pd.Series(data=value, index=end_time, name="value") # Total post haha reactions of a page. Daily reactions_haha_total = self.daily_insights_for_month( "page_actions_post_reactions_haha_total", year, month) end_time, value = list(), list() for item in reactions_haha_total["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["reactions_haha", "value"] = pd.Series(data=value, index=end_time, name="value") # Total post sorry reactions of a page. Daily reactions_sorry_total = self.daily_insights_for_month( "page_actions_post_reactions_sorry_total", year, month) end_time, value = list(), list() for item in reactions_sorry_total["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["reactions_sorry", "value"] = pd.Series(data=value, index=end_time, name="value") # Total post anger reactions of a page. Daily reactions_anger_total = self.daily_insights_for_month( "page_actions_post_reactions_anger_total", year, month) end_time, value = list(), list() for item in reactions_anger_total["data"][0]["values"]: end_time.append(item["end_time"][:10]) try: value.append(item["value"]) except KeyError: value.append(0) df_page["reactions_anger", "value"] = pd.Series(data=value, index=end_time, name="value") return df_page
def _nonempty_index(idx):
    typ = type(idx)
    if typ is pd.RangeIndex:
        return pd.RangeIndex(2, name=idx.name)
    elif typ in _numeric_index_types:
        return typ([1, 2], name=idx.name)
    elif typ is pd.Index:
        return pd.Index(["a", "b"], name=idx.name)
    elif typ is pd.DatetimeIndex:
        start = "1970-01-01"
        # Need a non-monotonic decreasing index to avoid issues with
        # partial string indexing see https://github.com/dask/dask/issues/2389
        # and https://github.com/pandas-dev/pandas/issues/16515
        # This doesn't mean `_meta_nonempty` should ever rely on
        # `self.monotonic_increasing` or `self.monotonic_decreasing`
        try:
            return pd.date_range(start=start, periods=2, freq=idx.freq,
                                 tz=idx.tz, name=idx.name)
        except ValueError:  # older pandas versions
            data = [start, "1970-01-02"] if idx.freq is None else None
            return pd.DatetimeIndex(data, start=start, periods=2,
                                    freq=idx.freq, tz=idx.tz, name=idx.name)
    elif typ is pd.PeriodIndex:
        return pd.period_range(start="1970-01-01", periods=2,
                               freq=idx.freq, name=idx.name)
    elif typ is pd.TimedeltaIndex:
        start = np.timedelta64(1, "D")
        try:
            return pd.timedelta_range(start=start, periods=2,
                                      freq=idx.freq, name=idx.name)
        except ValueError:  # older pandas versions
            data = [start, start + 1] if idx.freq is None else None
            return pd.TimedeltaIndex(data, start=start, periods=2,
                                     freq=idx.freq, name=idx.name)
    elif typ is pd.CategoricalIndex:
        if len(idx.categories) == 0:
            data = pd.Categorical(_nonempty_index(idx.categories),
                                  ordered=idx.ordered)
        else:
            data = pd.Categorical.from_codes([-1, 0],
                                             categories=idx.categories,
                                             ordered=idx.ordered)
        return pd.CategoricalIndex(data, name=idx.name)
    elif typ is pd.MultiIndex:
        levels = [_nonempty_index(l) for l in idx.levels]
        codes = [[0, 0] for i in idx.levels]
        try:
            return pd.MultiIndex(levels=levels, codes=codes, names=idx.names)
        except TypeError:  # older pandas versions
            return pd.MultiIndex(levels=levels, labels=codes, names=idx.names)

    raise TypeError("Don't know how to handle index of "
                    "type {0}".format(typename(type(idx))))
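# --- Editor's sketch (not dask's public API) ---
# The try/except in the MultiIndex branch above guards the pandas 0.24
# rename of `labels` to `codes`; the same guard works as a standalone helper:
import pandas as pd

def make_multiindex(levels, codes, names=None):
    """Build a MultiIndex on both old and new pandas."""
    try:
        return pd.MultiIndex(levels=levels, codes=codes, names=names)
    except TypeError:  # pandas < 0.24 spells the argument `labels`
        return pd.MultiIndex(levels=levels, labels=codes, names=names)

idx = make_multiindex([["a", "b"], [1, 2]], [[0, 0, 1], [0, 1, 1]])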
def test_groupby_aggregate_empty_key_empty_return():
    # GH: 32580 Check if everything works, when return is empty
    df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})
    result = df.groupby("a").agg({"b": []})
    expected = pd.DataFrame(columns=pd.MultiIndex(levels=[["b"], []],
                                                  codes=[[], []]))
    tm.assert_frame_equal(result, expected)
# `index` and `tuples` are defined earlier in the source script
index
s = pd.Series(np.random.randn(8), index=index)
s
# !!! BUT
pd.Series(np.random.randn(8), index=tuples)

#%% .from_product()
iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']]
pd.MultiIndex.from_product(iterables, names=['first', 'second'])
pd.MultiIndex.from_product([range(2), range(3), range(3)],
                           names=['d', 'p', 'q'])

#%% 'directly'
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
                     codes=[[1, 1, 0, 0], [1, 0, 1, 0]])
midx

#%%
midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']],
                     codes=[[1, 1, 0, 0, 1], [1, 0, 1, 0, 1]])
midx

#%% index.names
s.index.names
df.index.names
index.levels
dir(index)  # ! a lot !
""" City : Institute Delhi : AIIT Delhi : ABS Chandigarh : AIIT Chandigarh : ABS Chandigarh : ALS """ city=['Delhi','Delhi','Chandigarh', 'Chandigarh','Chandigarh'] institute =['AIIT', 'ABS', 'AIIT', 'ABS', 'ALS'] len(city), len(institute) #------ dataForIndex = pd.DataFrame({'city':['Delhi','Delhi','Chandigarh', 'Chandigarh','Chandigarh'], 'institute':['AIIT', 'ABS', 'AIIT', 'ABS', 'ALS']}) dataForIndex indexFromDF = pd.MultiIndex(levels=dataForIndex, codes=[[0,0,1,1,1], [0, 1, 0, 1, 2]]) midx = pd.MultiIndex(levels=[['Delhi', 'Chandigarh'], ['AIIT', 'ABS', 'ALS']],codes=[[0,0,1,1,1], [0, 1, 0, 1, 2]]) midx d = [[100,120],[75,70],[120,105],[90,65],[80,55]] d df = pd.DataFrame(index=midx, columns=['Male', 'Female'], data=d) df df.index df.drop(index='ABS') df.drop(columns=['Male'], axis=1) df.drop(index=['Delhi'], axis=0) df.drop(index='ABS',level=1,axis=0) df.drop(index='Delhi',level=0,axis=0) df.drop(index=['Delhi','Chandigarh'],level=1,axis=0) #no effect df.drop(index=['Delhi','Chandigarh'],level=0,axis=0) #no data df.drop(index=['AIIT'],level=1,axis=0) #no data
def test_make_index_str_multiple_levels(self):
    idx = self.sel.make_index('/[foo,bar]/[0:3]')
    assert_index_equal(idx,
                       pd.MultiIndex(levels=[['bar', 'foo'], [0, 1, 2]],
                                     labels=[[1, 1, 1, 0, 0, 0],
                                             [0, 1, 2, 0, 1, 2]]))
# (fragment: the opening of this statement precedes the excerpt; judging by
# the .to_pandas() and .values_host calls below, it is presumably
# cudf.MultiIndex)
midx = cudf.MultiIndex(
    levels=[[1, 3, 4, 5], [1, 2, 5]],
    codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
    names=["x", "y"],
)
pmidx = midx.to_pandas()
assert_eq(midx.values_host, pmidx.values)


@pytest.mark.parametrize(
    "pdi, fill_value, expected",
    [
        (
            pd.MultiIndex(
                levels=[[1, 3, 4, None], [1, 2, 5]],
                codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                names=["x", "y"],
            ),
            5,
            pd.MultiIndex(
                levels=[[1, 3, 4, 5], [1, 2, 5]],
                codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                names=["x", "y"],
            ),
        ),
        (
            pd.MultiIndex(
                levels=[[1, 3, 4, None], [1, None, 5]],
                codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]],
                names=["x", "y"],
            ),
            # (fragment ends mid-parametrization)
def test_make_index_list_multiple_levels(self):
    idx = self.sel.make_index([[['foo', 'bar'], slice(0, 3)]])
    assert_index_equal(idx,
                       pd.MultiIndex(levels=[['bar', 'foo'], [0, 1, 2]],
                                     labels=[[1, 1, 1, 0, 0, 0],
                                             [0, 1, 2, 0, 1, 2]]))
# Creating a MultiIndex
print('Creating a MultiIndex')
df = pd.DataFrame(np.random.rand(6, 3),
                  index=[['a', 'a', 'b', 'b', 'c', 'c'],
                         [1, 2, 1, 2, 1, 2]],
                  columns=['c1', 'c2', 'c3'])
print(df)

print(pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b', 'c', 'c'],
                                 [1, 2, 1, 2, 1, 2]]))
print(pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1),
                                 ('b', 2), ('c', 1), ('c', 2)]))
print(pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]]))
print(pd.MultiIndex(levels=[['a', 'b', 'c'], [1, 2]],
                    codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]))
print()

population.index.names = ['district', 'year']  # originally: ['행정구역', '년도']
print(population)
print()

idx = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]],
                                 names=['name1', 'name2'])
cols = pd.MultiIndex.from_product([['c1', 'c2', 'c3'], [1, 2]],
                                  names=['col_names1', 'col_names2'])
data = np.round(np.random.rand(6, 6), 2)
print(idx)
print(cols)
print(data)
mdf = pd.DataFrame(data, index=idx, columns=cols)
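# --- Editor's sketch (not part of the original script) ---
# With both axes hierarchical, selection composes per level. A short
# follow-on that rebuilds mdf so it runs standalone (values are random, so
# only shapes and positions matter):
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]],
                                 names=['name1', 'name2'])
cols = pd.MultiIndex.from_product([['c1', 'c2', 'c3'], [1, 2]],
                                  names=['col_names1', 'col_names2'])
mdf = pd.DataFrame(np.round(np.random.rand(6, 6), 2), index=idx, columns=cols)

print(mdf['c1'].shape)               # (6, 2): both 'c1' sub-columns
print(mdf.loc['a'].shape)            # (2, 6): rows ('a', 1) and ('a', 2)
print(mdf.loc[('a', 1), ('c2', 2)])  # a single scalar cell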
def test_get_index_str(self):
    idx = self.sel.get_index(self.df, '/foo/mof/*')
    assert_index_equal(idx,
                       pd.MultiIndex(levels=[['foo'], ['mof'], [0, 1, 2]],
                                     labels=[[0, 0, 0], [0, 0, 0],
                                             [0, 1, 2]]))
class TestToCSV: @pytest.mark.xfail( (3, 6, 5) > sys.version_info, reason=( "Python csv library bug (see https://bugs.python.org/issue32255)"), ) def test_to_csv_with_single_column(self): # see gh-18676, https://bugs.python.org/issue32255 # # Python's CSV library adds an extraneous '""' # before the newline when the NaN-value is in # the first row. Otherwise, only the newline # character is added. This behavior is inconsistent # and was patched in https://bugs.python.org/pull_request4672. df1 = DataFrame([None, 1]) expected1 = """\ "" 1.0 """ with tm.ensure_clean("test.csv") as path: df1.to_csv(path, header=None, index=None) with open(path, "r") as f: assert f.read() == expected1 df2 = DataFrame([1, None]) expected2 = """\ 1.0 "" """ with tm.ensure_clean("test.csv") as path: df2.to_csv(path, header=None, index=None) with open(path, "r") as f: assert f.read() == expected2 def test_to_csv_default_encoding(self): # GH17097 df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) with tm.ensure_clean("test.csv") as path: # the default to_csv encoding is utf-8. df.to_csv(path) tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) def test_to_csv_quotechar(self): df = DataFrame({"col": [1, 2]}) expected = """\ "","col" "0","1" "1","2" """ with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1) # 1=QUOTE_ALL with open(path, "r") as f: assert f.read() == expected expected = """\ $$,$col$ $0$,$1$ $1$,$2$ """ with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, quotechar="$") with open(path, "r") as f: assert f.read() == expected with tm.ensure_clean("test.csv") as path: with pytest.raises(TypeError, match="quotechar"): df.to_csv(path, quoting=1, quotechar=None) def test_to_csv_doublequote(self): df = DataFrame({"col": ['a"a', '"bb"']}) expected = '''\ "","col" "0","a""a" "1","""bb""" ''' with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL with open(path, "r") as f: assert f.read() == expected from _csv import Error with tm.ensure_clean("test.csv") as path: with pytest.raises(Error, match="escapechar"): df.to_csv(path, doublequote=False) # no escapechar set def test_to_csv_escapechar(self): df = DataFrame({"col": ['a"a', '"bb"']}) expected = """\ "","col" "0","a\\"a" "1","\\"bb\\"" """ with tm.ensure_clean("test.csv") as path: # QUOTE_ALL df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") with open(path, "r") as f: assert f.read() == expected df = DataFrame({"col": ["a,a", ",bb,"]}) expected = """\ ,col 0,a\\,a 1,\\,bb\\, """ with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE with open(path, "r") as f: assert f.read() == expected def test_csv_to_string(self): df = DataFrame({"col": [1, 2]}) expected_rows = [",col", "0,1", "1,2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv() == expected def test_to_csv_decimal(self): # see gh-781 df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) expected_rows = [",col1,col2,col3", "0,1,a,10.1"] expected_default = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv() == expected_default expected_rows = [";col1;col2;col3", "0;1;a;10,1"] expected_european_excel = tm.convert_rows_list_to_csv_str( expected_rows) assert df.to_csv(decimal=",", sep=";") == expected_european_excel expected_rows = [",col1,col2,col3", "0,1,a,10.10"] expected_float_format_default = tm.convert_rows_list_to_csv_str( expected_rows) assert df.to_csv(float_format="%.2f") == expected_float_format_default
expected_rows = [";col1;col2;col3", "0;1;a;10,10"] expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) assert (df.to_csv(decimal=",", sep=";", float_format="%.2f") == expected_float_format) # see gh-11553: testing if decimal is taken into account for '0.0' df = pd.DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.to_csv(index=False, decimal="^") == expected # same but for an index assert df.set_index("a").to_csv(decimal="^") == expected # same for a multi-index assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected def test_to_csv_float_format(self): # testing if float_format is taken into account for the index # GH 11553 df = pd.DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.set_index("a").to_csv(float_format="%.2f") == expected # same for a multi-index assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected def test_to_csv_na_rep(self): # see gh-11553 # # Testing if NaN values are correctly represented in the index. df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.set_index("a").to_csv(na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # now with an index containing only NaNs df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.set_index("a").to_csv(na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # check if na_rep parameter does not break anything when no NaN df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0,0,2", "0,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert df.set_index("a").to_csv(na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected # GH 29975 # Make sure full na_rep shows up when a dtype is provided csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") expected = tm.convert_rows_list_to_csv_str( [",0", "0,a", "1,ZZZZZ", "2,c"]) assert expected == csv csv = pd.Series(["a", pd.NA, "c"], dtype="string").to_csv(na_rep="ZZZZZ") assert expected == csv def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame( {"A": pd.date_range("20130101", periods=5, freq="s")}) df_day = DataFrame( {"A": pd.date_range("20130101", periods=5, freq="d")}) expected_rows = [ ",A", "0,2013-01-01 00:00:00", "1,2013-01-01 00:00:01", "2,2013-01-01 00:00:02", "3,2013-01-01 00:00:03", "4,2013-01-01 00:00:04", ] expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) assert df_sec.to_csv() == expected_default_sec expected_rows = [ ",A", "0,2013-01-01 00:00:00", "1,2013-01-02 00:00:00", "2,2013-01-03 00:00:00", "3,2013-01-04 00:00:00", "4,2013-01-05 00:00:00", ] expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows) assert df_day.to_csv( date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day expected_rows = [ ",A", "0,2013-01-01", "1,2013-01-01", "2,2013-01-01", "3,2013-01-01", "4,2013-01-01", ] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec expected_rows = [ ",A", "0,2013-01-01", "1,2013-01-02", "2,2013-01-03", 
"3,2013-01-04", "4,2013-01-05", ] expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) assert df_day.to_csv() == expected_default_day assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day # see gh-7791 # # Testing if date_format parameter is taken into account # for multi-indexed DataFrames. df_sec["B"] = 0 df_sec["C"] = 1 expected_rows = ["A,B,C", "2013-01-01,0,1"] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) assert df_sec_grouped.mean().to_csv( date_format="%Y-%m-%d") == expected_ymd_sec def test_to_csv_multi_index(self): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) exp_rows = [",1", ",2", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp df = DataFrame( [1], columns=pd.MultiIndex.from_arrays([[1], [2]]), index=pd.MultiIndex.from_arrays([[1], [2]]), ) exp_rows = [",,1", ",,2", "1,2,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]])) exp_rows = [",foo", ",bar", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv() == exp exp_rows = ["foo", "bar", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) assert df.to_csv(index=False) == exp @pytest.mark.parametrize( "ind,expected", [ ( pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]), "x,data\n1.0,1\n", ), ( pd.MultiIndex( levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"]), "x,y,data\n1.0,2.0,1\n", ), ], ) @pytest.mark.parametrize("klass", [pd.DataFrame, pd.Series]) def test_to_csv_single_level_multi_index(self, ind, expected, klass): # see gh-19589 result = klass(pd.Series([1], ind, name="data")).to_csv(line_terminator="\n", header=True) assert result == expected def test_to_csv_string_array_ascii(self): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = pd.DataFrame(str_array) expected_ascii = """\ ,names 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ with tm.ensure_clean("str_test.csv") as path: df.to_csv(path, encoding="ascii") with open(path, "r") as f: assert f.read() == expected_ascii def test_to_csv_string_array_utf8(self): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = pd.DataFrame(str_array) expected_utf8 = """\ ,names 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ with tm.ensure_clean("unicode_test.csv") as path: df.to_csv(path, encoding="utf-8") with open(path, "r") as f: assert f.read() == expected_utf8 def test_to_csv_string_with_lf(self): # GH 20353 data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = pd.DataFrame(data) with tm.ensure_clean("lf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) os_linesep = os.linesep.encode("utf-8") expected_noarg = (b"int,str_lf" + os_linesep + b"1,abc" + os_linesep + b'2,"d\nef"' + os_linesep + b'3,"g\nh\n\ni"' + os_linesep) df.to_csv(path, index=False) with open(path, "rb") as f: assert f.read() == expected_noarg with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == 
expected_lf with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' df.to_csv(path, line_terminator="\r\n", index=False) with open(path, "rb") as f: assert f.read() == expected_crlf def test_to_csv_string_with_crlf(self): # GH 20353 data = { "int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"] } df = pd.DataFrame(data) with tm.ensure_clean("crlf_test.csv") as path: # case 1: The default line terminator(=os.linesep)(PR 21406) os_linesep = os.linesep.encode("utf-8") expected_noarg = (b"int,str_crlf" + os_linesep + b"1,abc" + os_linesep + b'2,"d\r\nef"' + os_linesep + b'3,"g\r\nh\r\n\r\ni"' + os_linesep) df.to_csv(path, index=False) with open(path, "rb") as f: assert f.read() == expected_noarg with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == expected_lf with tm.ensure_clean("crlf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element expected_crlf = (b"int,str_crlf\r\n" b"1,abc\r\n" b'2,"d\r\nef"\r\n' b'3,"g\r\nh\r\n\r\ni"\r\n') df.to_csv(path, line_terminator="\r\n", index=False) with open(path, "rb") as f: assert f.read() == expected_crlf def test_to_csv_stdout_file(self, capsys): # GH 21561 df = pd.DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"]) expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) df.to_csv(sys.stdout, encoding="ascii") captured = capsys.readouterr() assert captured.out == expected_ascii assert not sys.stdout.closed @pytest.mark.xfail( compat.is_platform_windows(), reason=("Especially in Windows, file stream should not be passed" "to csv writer without newline='' option." "(https://docs.python.org/3.6/library/csv.html#csv.writer)"), ) def test_to_csv_write_to_open_file(self): # GH 21696 df = pd.DataFrame({"a": ["x", "y", "z"]}) expected = """\ manual header x y z """ with tm.ensure_clean("test.txt") as path: with open(path, "w") as f: f.write("manual header\n") df.to_csv(f, header=None, index=None) with open(path, "r") as f: assert f.read() == expected def test_to_csv_write_to_open_file_with_newline_py3(self): # see gh-21696 # see gh-20353 df = pd.DataFrame({"a": ["x", "y", "z"]}) expected_rows = ["x", "y", "z"] expected = "manual header\n" + tm.convert_rows_list_to_csv_str( expected_rows) with tm.ensure_clean("test.txt") as path: with open(path, "w", newline="") as f: f.write("manual header\n") df.to_csv(f, header=None, index=None) with open(path, "rb") as f: assert f.read() == bytes(expected, "utf-8") @pytest.mark.parametrize("to_infer", [True, False]) @pytest.mark.parametrize("read_infer", [True, False]) def test_to_csv_compression(self, compression_only, read_infer, to_infer): # see gh-15008 compression = compression_only if compression == "zip": pytest.skip(f"{compression} is not supported for to_csv") # We'll complete file extension subsequently. filename = "test." 
if compression == "gzip": filename += "gz" else: # xz --> .xz # bz2 --> .bz2 filename += compression df = DataFrame({"A": [1]}) to_compression = "infer" if to_infer else compression read_compression = "infer" if read_infer else compression with tm.ensure_clean(filename) as path: df.to_csv(path, compression=to_compression) result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) def test_to_csv_compression_dict(self, compression_only): # GH 26023 method = compression_only df = DataFrame({"ABC": [1]}) filename = "to_csv_compress_as_dict." filename += "gz" if method == "gzip" else method with tm.ensure_clean(filename) as path: df.to_csv(path, compression={"method": method}) read_df = pd.read_csv(path, index_col=0) tm.assert_frame_equal(read_df, df) def test_to_csv_compression_dict_no_method_raises(self): # GH 26023 df = DataFrame({"ABC": [1]}) compression = {"some_option": True} msg = "must have key 'method'" with tm.ensure_clean("out.zip") as path: with pytest.raises(ValueError, match=msg): df.to_csv(path, compression=compression) @pytest.mark.parametrize("compression", ["zip", "infer"]) @pytest.mark.parametrize("archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"]) def test_to_csv_zip_arguments(self, compression, archive_name): # GH 26023 from zipfile import ZipFile df = DataFrame({"ABC": [1]}) with tm.ensure_clean("to_csv_archive_name.zip") as path: df.to_csv(path, compression={ "method": compression, "archive_name": archive_name }) zp = ZipFile(path) expected_arcname = path if archive_name is None else archive_name expected_arcname = os.path.basename(expected_arcname) assert len(zp.filelist) == 1 archived_file = os.path.basename(zp.filelist[0].filename) assert archived_file == expected_arcname @pytest.mark.parametrize("df_new_type", ["Int64"]) def test_to_csv_na_rep_long_string(self, df_new_type): # see gh-25099 df = pd.DataFrame({"c": [float("nan")] * 3}) df = df.astype(df_new_type) expected_rows = ["c", "mynull", "mynull", "mynull"] expected = tm.convert_rows_list_to_csv_str(expected_rows) result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") assert expected == result def test_to_csv_timedelta_precision(self): # GH 6783 s = pd.Series([1, 1]).astype("timedelta64[ns]") buf = io.StringIO() s.to_csv(buf) result = buf.getvalue() expected_rows = [ ",0", "0,0 days 00:00:00.000000001", "1,0 days 00:00:00.000000001", ] expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected def test_na_rep_truncated(self): # https://github.com/pandas-dev/pandas/issues/31447 result = pd.Series(range(8, 12)).to_csv(na_rep="-") expected = tm.convert_rows_list_to_csv_str( [",0", "0,8", "1,9", "2,10", "3,11"]) assert result == expected result = pd.Series([True, False]).to_csv(na_rep="nan") expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"]) assert result == expected result = pd.Series([1.1, 2.2]).to_csv(na_rep=".") expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) assert result == expected
def test_filter_meta_index(test_df):
    obs = test_df.filter(scenario='scen_b').meta.index
    exp = pd.MultiIndex(levels=[['model_a'], ['scen_b']],
                        codes=[[0], [0]],
                        names=['model', 'scenario'])
    pd.testing.assert_index_equal(obs, exp)
def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index:
    """Convert all index coordinates into a :py:class:`pandas.Index`.

    Parameters
    ----------
    ordered_dims : sequence of hashable, optional
        Possibly reordered version of this object's dimensions indicating
        the order in which dimensions should appear on the result.

    Returns
    -------
    pandas.Index
        Index subclass corresponding to the outer-product of all dimension
        coordinates. This will be a MultiIndex if this object has more
        than one dimension.
    """
    if ordered_dims is None:
        ordered_dims = list(self.dims)
    elif set(ordered_dims) != set(self.dims):
        raise ValueError("ordered_dims must match dims, but does not: "
                         "{} vs {}".format(ordered_dims, self.dims))

    if len(ordered_dims) == 0:
        raise ValueError("no valid index for a 0-dimensional object")
    elif len(ordered_dims) == 1:
        (dim,) = ordered_dims
        return self._data.get_index(dim)  # type: ignore
    else:
        indexes = [self._data.get_index(k) for k in ordered_dims]  # type: ignore

        # compute the sizes of the repeat and tile for the cartesian product
        # (taken from pandas.core.reshape.util)
        index_lengths = np.fromiter((len(index) for index in indexes),
                                    dtype=np.intp)
        cumprod_lengths = np.cumprod(index_lengths)

        if cumprod_lengths[-1] != 0:
            # sizes of the repeats
            repeat_counts = cumprod_lengths[-1] / cumprod_lengths
        else:
            # if any factor is empty, the cartesian product is empty
            repeat_counts = np.zeros_like(cumprod_lengths)

        # sizes of the tiles
        tile_counts = np.roll(cumprod_lengths, 1)
        tile_counts[0] = 1

        # loop over the indexes: for each MultiIndex or Index compute the
        # cartesian product of the codes
        code_list = []
        level_list = []
        names = []

        for i, index in enumerate(indexes):
            if isinstance(index, pd.MultiIndex):
                codes, levels = index.codes, index.levels
            else:
                code, level = pd.factorize(index)
                codes = [code]
                levels = [level]

            # compute the cartesian product
            code_list += [
                np.tile(np.repeat(code, repeat_counts[i]), tile_counts[i])
                for code in codes
            ]
            level_list += levels
            names += index.names

        return pd.MultiIndex(level_list, code_list, names=names)
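# --- Editor's sketch (mirrors the code path above; not xarray API) ---
# The repeat/tile arithmetic is the whole trick; stripped down to two flat
# indexes it reads:
import numpy as np
import pandas as pd

a = pd.Index(["x", "y"], name="a")
b = pd.Index([1, 2, 3], name="b")

code_a, level_a = pd.factorize(a)
code_b, level_b = pd.factorize(b)

# Outer product: repeat each code of `a` len(b) times, tile `b` len(a) times.
outer = pd.MultiIndex(
    levels=[level_a, level_b],
    codes=[np.repeat(code_a, len(b)), np.tile(code_b, len(a))],
    names=["a", "b"])

assert list(outer) == [("x", 1), ("x", 2), ("x", 3),
                       ("y", 1), ("y", 2), ("y", 3)]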